
MetalPerformancePrimitives iOS xcode26.1 b2


# MetalPerformancePrimitives.framework

diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-09-08 05:44:05
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-09-26 16:15:07
@@ -8,22 +8,36 @@
 //             C = A*B + C;
 // A and B can be tensor_handle, tensor_offset, and tensor_inline.
 // C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
-// Data type combinations supported by this operation are as follows
-//   A           B         C
+// Data type combinations supported by this operation are as follows:
+//
+//  A          B         C
+//  ---------------------------
+//  half       half      half
+//  half       int8_t    half
+//  int8_t     half      half
+//  half       half      float
+//  half       float     float
+//  half       int8_t    float
+//  float      half      float
+//  float      float     float
+//  float      int8_t    float
+//  int8_t     half      float
+//  int8_t     float     float
 //  int8_t     int8_t    int32_t
-//  int8_t     int8_t    float
-//  int8_t     int8_t    half
-//  uint8_t    int8_t    int32_t
-//  uint8_t    int8_t    float
-//  uint8_t    int8_t    half
-//  int8_t     uint8_t   int32_t
-//  int8_t     uint8_t   float
-//  int8_t     uint8_t   half
-//  uint8_t    uint8_t   int32_t
-//  uint8_t    uint8_t   float
-//  uint8_t    uint8_t   half
-//   half       half     float
-//   half       half     half
+//  bfloat     bfloat    bfloat
+//  bfloat     bfloat    float
+//  bfloat     float     float
+//  bfloat     int8_t    bfloat
+//  bfloat     int8_t    float
+//  float      bfloat    float
+//  int8_t     bfloat    bfloat
+//  int8_t     bfloat    float
+//  bfloat     half      bfloat
+//  bfloat     half      half
+//  bfloat     half      float
+//  half       bfloat    bfloat
+//  half       bfloat    half
+//  half       bfloat    float
 //
 // Basic usage is in the following example which takes M x K matrix A of type
 // half, K x N matrix B of type half, both in device memory and produces M x N
@@ -43,72 +57,43 @@
 //    [encoder dispatchThreadgroups:threadgroups
 //    threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
 //
-// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>,
-// tensor_handle> A,
-//                          tensor<device half,  dextents<int32_t, 2>,
-//                          tensor_handle> B, tensor<device float,
-//                          dextents<int32_t, 2>, tensor_handle> C, constant
-//                          uint& M, constant uint& N, constant uint& K, uint2
-//                          tgid [[threadgroup_position_in_grid]])
+// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>> A,
+//                          tensor<device half,  dextents<int32_t, 2>> B,
+//                          tensor<device float, dextents<int32_t, 2>> C,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
-//     //m outer dim of local tile
-//                                                           32, //n outer dim
-//                                                           of local tile
-//                                                            0, //k inner
-//                                                            dimension. 0 means
-//                                                               //operation
-//                                                               will read K
-//                                                               from
-//                                                               //input tensor
-//                                                               //K =
-//                                                               A.extents().extent(0)
-//                                                               or
-//                                                               B.extents().extent(1)
-//                                                               for NN
-//                                                               //K =
-//                                                               A.extents().extent(0)
-//                                                               or
-//                                                               B.extents().extent(0)
-//                                                               for NT
-//                                                               //and so on..
-//                                                        false, //transpse_left
-//                                                        = false for NN and NT
-//                                                        and true for TN and TT
-//                                                        false,
-//                                                        //transpse_right =
-//                                                        false for NN and TN
-//                                                        and true for NT and TT
-//                                                        false,
-//                                                        //relaxed_precision =
-//                                                        false, set it to true
-//                                                        to allow
-//                                                        implementation
-//                                                               //sacrifice
-//                                                               accurancy for
-//                                                               performance.
-//                                                          );
+//     // producing 64x32
+//     constexpr auto matmulDescriptor =
+//         matmul2d_descriptor(64, // m outer dim of local tile
+//                             32, // n outer dim of local tile
+//                             static_cast<int>(dynamic_extent), // k inner dimension. dynamic_extent means the operation will read K from the input tensor
+//                                                               // K = A.extents().extent(0) or B.extents().extent(1) for NN
+//                                                               // K = A.extents().extent(0) or B.extents().extent(0) for NT
+//                                                               // and so on.
+//                             false,  // transpose_left = false for NN and NT and true for TN and TT
+//                             false,  // transpose_right = false for NN and TN and true for NT and TT
+//                             false); // relaxed_precision = false; set it to true to allow the implementation
+//                                     // to sacrifice accuracy for performance.
 //
 //    // create matmul op from above descriptor with 4 SIMD-Groups. All 4
-//    SIMD-Groups in this threadgroup will execute this
+//    // SIMD-Groups in this threadgroup will execute this
 //    // matmul cooperatively. More on this scope below.
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//    matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //    // Following three lines of code create appropriate slice for this thread
-//    group to work on.
-//    // E.g. A.offset below creates a tensor<device half, dextents<int32_t, 2>,
-//    tensor_offset>
+//    // group to work on. E.g. A.slice below creates a
+//    // tensor<device half, dextents<int32_t, 2>, tensor_offset>
 //    // which has the same extents as the original tensor A but origin shifted to
-//    (0,tgid.y*64) i.e.
-//    // mA[x,y] == A[x,tgid.y*64+y]
+//    // (0,tgid.y*64) i.e. mA[x,y] == A[x,tgid.y*64+y]
 //
-//    auto mA = A.offset(0, tgid.y*64);
-//    auto mB = B.offset(tgid.x*32, 0);
-//    auto mC = C.offset(tgid.x*32, tgid.y*64);
+//    auto mA = A.slice(0, tgid.y*64);
+//    auto mB = B.slice(tgid.x*32, 0);
+//    auto mC = C.slice(tgid.x*32, tgid.y*64);
 //
-//     // execute the operation. Assumes C is is initialized to zero.
-//     op.run(mA, mB, mC);
+//    // execute the operation. Assumes C is initialized to zero.
+//    matmulOp.run(mA, mB, mC);
 // }
 //
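For quick reference, here is the beta 2 example above reassembled as a standalone kernel. This is a sketch, not header text: the `#include` path, the `mpp::tensor_ops` namespace, and the omission of the M/N/K buffer arguments (K is read from the tensor extents) are assumptions.

```metal
#include <metal_stdlib>
#include <metal_tensor>
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> // assumed umbrella header

using namespace metal;
using namespace mpp::tensor_ops; // assumed namespace

kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>> A,
                         tensor<device half,  dextents<int32_t, 2>> B,
                         tensor<device float, dextents<int32_t, 2>> C,
                         uint2 tgid [[threadgroup_position_in_grid]])
{
    // 64x32 output tile per threadgroup; dynamic_extent makes the op read K from the inputs.
    constexpr auto desc = matmul2d_descriptor(64, 32, static_cast<int>(dynamic_extent));

    // Four SIMD groups cooperate, matching the simdgroupWidth*4 dispatch shown above.
    matmul2d<desc, execution_simdgroups<4>> matmulOp;

    // Shift each operand's origin to this threadgroup's tile.
    auto mA = A.slice(0, tgid.y * 64);
    auto mB = B.slice(tgid.x * 32, 0);
    auto mC = C.slice(tgid.x * 32, tgid.y * 64);

    matmulOp.run(mA, mB, mC); // C = A*B + C; assumes C was zero-initialized
}
```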
 // Above matrix multiplication implementation will do edge checking for all
@@ -117,23 +102,19 @@
 // bounds check. In high performance code we can avoid edge checking for inside
 // thread groups and get better performance
 //
-// kernel void matMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle>
-// A,
-//                    tensor<device half,  dextents<int32_t, 2>, tensor_handle>
-//                    B, tensor<device float, dextents<int32_t, 2>,
-//                    tensor_handle> C, constant uint& M, constant uint& N,
-//                    constant uint& K, uint2 tgid
-//                    [[threadgroup_position_in_grid]])
+// kernel void matMul(tensor<device half,  dextents<int32_t, 2>> A,
+//                    tensor<device half,  dextents<int32_t, 2>> B,
+//                    tensor<device float, dextents<int32_t, 2>> C,
+//                    constant uint& M, constant uint& N,
+//                    constant uint& K, uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//     // producing 64x32
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                            0,
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           static_cast<int>(dynamic_extent));
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //    // Threadgroup fully inside bounds in both outer dimensions M and N.
 //    if ( tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
@@ -146,41 +127,32 @@
 //    }
 //    else
 //    {
-//      auto tA = A.offset(0, tgid.y*64);
-//      auto tB = B.offset(tgid.x*32, 0);
-//      auto tC = C.offset(tgid.x*32, tgid.y*64);
+//      auto tA = A.slice(0, tgid.y*64);
+//      auto tB = B.slice(tgid.x*32, 0);
+//      auto tC = C.slice(tgid.x*32, tgid.y*64);
 //
 //      matmulOp.run(tA, tB, tC);
 //    }
 // }
 //
-// User can also take ownership of looping over in reduction or k-dimension by
+// User can also take ownership of looping over the reduction or k-dimension by
 // choosing an appropriate chunk size in k (called k-tile or tilek). For the following
-// example, we choose 16. kernel void matMulKLoop(tensor<device half,
-// dextents<int32_t, 2>, tensor_handle> A,
-//                         tensor<device half,  dextents<int32_t, 2>,
-//                         tensor_handle> B, tensor<device float,
-//                         dextents<int32_t, 2>, tensor_handle> C, constant
-//                         uint& M, constant uint& N, constant uint& K, uint2
-//                         tgid [[threadgroup_position_in_grid]])
+// example, we choose 16.
+// kernel void matMulKLoop(tensor<device half, dextents<int32_t, 2>> A,
+//                         tensor<device half, dextents<int32_t, 2>> B,
+//                         tensor<device float, dextents<int32_t, 2>> C,
+//                         constant uint& M, constant uint& N, constant uint& K,
+//                         uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//     // producing 64x32
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                           16, // tilek = 16,
-//                                                           we loop over K in
-//                                                           chucks of 16
-//                                                               // rather than
-//                                                               letting matmul
-//                                                               op run method
-//                                                               looping over K
-//                                                               // internally
-//                                                               choosing tileK
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           16); // tilek = 16, we loop over K in chunks of 16 rather than
+//                                                                // letting the matmul op's run method loop over K
+//                                                                // internally and choose tileK itself
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //     constexpr int tilek = 16;
 //
@@ -203,9 +175,9 @@
 //    }
 //    else
 //    {
-//      auto tA = A.offset(0, tgid.y*64);
-//      auto tB = B.offset(tgid.x*32, 0);
-//      auto tC = C.offset(tgid.x*32, tgid.y*64);
+//      auto tA = A.slice(0, tgid.y*64);
+//      auto tB = B.slice(tgid.x*32, 0);
+//      auto tC = C.slice(tgid.x*32, tgid.y*64);
 //
 //      matmulOp.run(tA, tB, tC);
 //    }
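The chunked K loop itself is elided between the hunks above; the comments describe stepping K in tilek-sized chunks and letting each run accumulate into the same destination. A hypothetical reconstruction of that fragment, following the slice convention of the earlier examples (not the header's own text):

```metal
// Hypothetical k-loop fragment: each run() accumulates into the same tile (C = A*B + C).
constexpr int tilek = 16;
for (uint k = 0; k < K; k += tilek) {
    auto kA = A.slice(k, tgid.y * 64);  // advance the A tile's origin along k
    auto kB = B.slice(tgid.x * 32, k);  // advance the B tile's origin along k
    matmulOp.run(kA, kB, mC);           // accumulate this chunk's partial product
}
```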
@@ -219,90 +191,80 @@
 // performance and power. Users can apply post-processing in-register where the GEMM
 // output is computed using cooperative_tensor. Unlike tensor_handle,
 // tensor_offset, and tensor_inline, which are non-owning, meaning they are
-// wrappers around resource in device, threadgroup or thread addressspce,
+// wrappers around a resource in the device, threadgroup, or thread address space,
 // cooperative_tensor owns thread-private data and divides the data for the entire
-// tensor among threads (participating the scope of operation) in implementation
+// tensor among threads (participating in the scope of operation) in an implementation-
 // defined manner. This thread private memory is allocated at construction of
 // cooperative_tensor and deallocated when this cooperative_tensor goes out of
 // scope. The layout of cooperative_tensor depends on operation, data type,
 // number of threads in opscope with which op was created. Note that
 // cooperative_tensor created from an op is only valid for threads that are part
-// of opscope on which op was created. Though the layout of cooperative_tensor
+// of the execution scope on which the op was created. Though the layout of cooperative_tensor
 // is implementation defined, we provide accessor functions as shown in the
 // example below
 //
-// kernel void simpleMatMulCooperative(tensor<device half,  dextents<int32_t,
-// 2>, tensor_handle> A,
-//                          tensor<device half,  dextents<int32_t, 2>,
-//                          tensor_handle> B, tensor<device float,
-//                          dextents<int32_t, 2>, tensor_handle> C,
-//                          tensor<device half, dextents<int32_t, 2>,
-//                          tensor_handle> bias, constant uint& M, constant
-//                          uint& N, constant uint& K, uint2 tgid
-//                          [[threadgroup_position_in_grid]])
+// kernel void simpleMatMulCooperative(tensor<device half, dextents<int32_t, 2>> A,
+//                                     tensor<device half, dextents<int32_t, 2>> B,
+//                                     tensor<device float, dextents<int32_t, 2>> C,
+//                                     tensor<device half, dextents<int32_t, 2>> bias,
+//                                     constant uint& M, constant uint& N, constant uint& K,
+//                                     uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                            0,
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           static_cast<int>(dynamic_extent));
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
-//    auto mA = A.offset(0, tgid.y*64);
-//    auto mB = B.offset(tgid.x*32, 0);
-//    auto mC = C.offset(tgid.x*32, tgid.y*64);
+//    auto mA = A.slice(0, tgid.y*64);
+//    auto mB = B.slice(tgid.x*32, 0);
+//    auto mC = C.slice(tgid.x*32, tgid.y*64);
 //
 //    // This creates a cooperative destination tensor of float element type.
 //    // Since the matmul op from the above descriptor is created with 4 SIMD-Groups,
-//    coopeartive tensor will divide data among the threads on these
+//    // cooperative tensor will divide data among the threads in these
 //    // 4 SIMD-Groups. The layout of data among lanes is implementation defined
-//    and not all threads and even all elements within a thread need
-//    // be valid. We provide valid element check shown below which developer
-//    should use to guard their access to elements of cooperative_tensor
+//    // and not all threads, nor even all elements within a thread, need
+//    // be valid. Use the valid-element check shown below to guard
+//    // access to elements of cooperative_tensor.
 //
-//    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA),
-//    decltype(mB), float>();
+//    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
 //
 //    // Loop over all the cooperative_tensor elements owned
-//    by "this" thread and initialize to zero.
-//    // Its imperative for performance to include "unroll pragma" so compiler
-//    fully unrolls the loop.
+//    // by "this" thread and initialize to zero.
+//    // It is imperative for performance to include the "unroll pragma" so the compiler
+//    // fully unrolls the loop.
 //
 //    #pragma unroll full
-//    for (uint16_t i = 0, i < cT.capacity(); ++i) {
-//
-//      if(cT.mask(i))
+//    for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
+//      if(cT.get_mask(i))
 //        cT[i] = 0;
 //    }
 //
-//    // execute the operation. All threads computes the matmul cooperatively
-//    and results are written to cooperative_tensor. op.run(mA, mB, cT);
+//    // execute the operation. All threads compute the matmul cooperatively
+//    // and results are written to the cooperative_tensor.
+//    matmulOp.run(mA, mB, cT);
 //
-//   // create cooperative bias tensor with same layout as destination
-//   cooperative_tensor of matmul auto biasT =
-//   matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB),
-//   float>();
+//    // create cooperative bias tensor with same layout as destination
+//    // cooperative_tensor of matmul
+//    auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
 //
-//   // load data from bias tensor_handle into biasT cooperative_tensor using
-//   layout and distribution of element among threads of scope
-//   // on which matmul was created.
-//   biasT.load(bias);
+//    // load data from the bias tensor_handle into the biasT cooperative_tensor using the
+//    // layout and distribution of elements among the threads of the scope on which the matmul was created.
+//    biasT.load(bias);
 //
 //    #pragma unroll full
-//    for (uint16_t i = 0, i < cT.capacity(); ++i) {
+//    for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
 //
-//      if(cT.mask(i)) {
+//      if(cT.get_mask(i)) {
 //        //add bias
 //        cT[i] += biasT[i];
 //
 //        // get the 2-dimensional local coordinate of this thread's i-th
-//        element in destination local coordinate system (in this example
+//        // element in destination local coordinate system (in this example
 //        // 32 x 64 tile).
-//        auto ids = cT.multidimensional_indices(i);
-//        cT[i] = foo(cT[i], idx); // do some operation based on coordinate
-//        values
+//        auto idx = cT.get_multidimensional_index(i);
+//        cT[i] = foo(cT[i], idx); // do some operation based on coordinate values
 //      }
 //    }
 //
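Once results live in the cooperative_tensor, the same masked loop supports any elementwise epilogue. A hypothetical sketch; the `store()` call is an assumption, mirroring `biasT.load(bias)` above and the `_store_` entry points in the implementation header below:

```metal
// Hypothetical epilogue: ReLU in registers, then write the tile back out.
#pragma unroll full
for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
    if (cT.get_mask(i))
        cT[i] = max(cT[i], 0.0f); // in-register post-processing
}
cT.store(mC); // assumed counterpart of load(): distributes thread-private elements to mC
```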
@@ -313,29 +275,28 @@
 // Note on scope of operation
 // ==========================
 // A tensor operation may be executed on a single thread entirely or
-// cooperatively among a set of SIMD groups. We call these set of threads
-// "execution scop" of the tensor operation. A tensor ops must be created with
-// execution scope provided as template argument. All the threads in this
+// cooperatively among a set of SIMD groups. The set of threads is called the
+// "execution scope" of the tensor operation. A tensor op must be created with
+// an execution scope provided as template argument. All the threads in this
 // execution scope must enter the run method, i.e., calls to the run method must be
 // "execution scope" uniform. Use the following types to configure the execution
-// modes of each operation: metal::execution_thread: the operation will be run on a
-// single thread.
-//                 Fragment shaders only support this execution scope.
-// metal::execution_simdgroup - the operation will be run cooperatively by all threads in
-// this SIMD group.
-//                     May be used for finer control over tiling by slicing
-//                     tensors with SIMD IDs.
-// opscope_SIMD-Groups<N> - the operation will be executed cooperatively by N
-// SIMD groups.
-//                          Must be used when all threads in a threadgroup are
-//                          cooperatively performing the operation.
+// modes of each operation:
+//     metal::execution_thread: The operation will be run on a single thread.
+//                              Fragment shaders only support this execution scope.
+//     metal::execution_simdgroup: The operation will be run cooperatively by all
+//                                 threads in the SIMD group. May be used for finer
+//                                 control over tiling by slicing tensors with SIMD IDs.
+//     metal::execution_simdgroups<N>: The operation will be executed cooperatively by N
+//                                     SIMD groups. Must be used when all threads in a
+//                                     threadgroup are cooperatively performing the operation.
+//
 // It is undefined behavior if the number of SIMD groups dispatched does not
 // match the number of SIMD groups that the operation was configured with.
 //
-// Even though each thread in execution scope can potentially independently
-// enter and exit run method, developer cannot assume that threads in execution
-// scope are working completely independently i.e. tensor operation run
-// implementation may need for (for correctness or performance) synchronize
+// Even though each thread in the execution scope can potentially independently
+// enter and exit the run method, the threads in the execution scope
+// do not necessarily operate completely independently. For example, the tensor
+// operation may need (for correctness or performance) to synchronize
 // among the threads in execution scope it was created with.
 //
 //
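Concretely, the scope is just the second template argument; a sketch of the three beta 2 spellings applied to one descriptor:

```metal
constexpr auto desc = matmul2d_descriptor(64, 32);
matmul2d<desc, execution_thread>        threadOp; // one thread; the only scope fragment shaders support
matmul2d<desc, execution_simdgroup>     simdOp;   // all threads of one SIMD group cooperate
matmul2d<desc, execution_simdgroups<4>> tgOp;     // four SIMD groups; the dispatch must supply exactly four
```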
@@ -383,7 +344,7 @@
   mode matmul_mode;
 
 public:
-  constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+  constexpr matmul2d_descriptor(int __m, int __n, int __k = static_cast<int>(metal::dynamic_extent),
                                 bool __transpose_left = false,
                                 bool __transpose_right = false,
                                 bool __relaxed_precision = false,
@@ -404,9 +365,9 @@
 {
   static const constant ElementType sum_identity = (ElementType)0;
   static const constant ElementType max_identity =
-      metal::numeric_limits<ElementType>::lowest;
+      metal::numeric_limits<ElementType>::lowest();
   static const constant ElementType min_identity =
-      metal::numeric_limits<ElementType>::max;
+      metal::numeric_limits<ElementType>::max();
 };
 
 #include "__impl/MPPTensorOpsMatMul2dImpl.h"
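Two behavioral fixes close out this header: the constructor's `__k` default is now `metal::dynamic_extent` rather than the old `dynamic_length_v<int>`, and the reduction identities now invoke `numeric_limits<...>::lowest()`/`::max()` instead of naming the member functions without calling them. In practice the new default means a two-argument descriptor defers K to the operands:

```metal
// With the beta 2 default, these two declarations are equivalent;
// K is taken from the operand tensors at run time.
constexpr auto descA = matmul2d_descriptor(64, 32);
constexpr auto descB = matmul2d_descriptor(64, 32, static_cast<int>(metal::dynamic_extent));
```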
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-09-17 08:16:34
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-09-26 16:27:14
@@ -156,6 +156,26 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
@@ -211,6 +231,24 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int threads);
 
 extern "C" EXTERNALLY_DEFINED_ATTR size_t
 __tensorops_impl_matmul2d_op_cooperative_reduction_destination_data_size(
@@ -359,6 +397,26 @@
     int,
     __tensor_ops_detail::__tensor_ops_datatype leftDataType,
     __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f16(
@@ -420,6 +478,26 @@
     int,
     __tensor_ops_detail::__tensor_ops_datatype leftDataType,
     __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
@@ -448,6 +526,15 @@
     __reduction_operation,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__thread_void_t,
+    bfloat,
+    __reduction_operation,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
@@ -476,6 +563,15 @@
     __reduction_operation,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__thread_void_t,
+    bfloat,
+    __reduction_operation,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
@@ -2640,6 +2736,287 @@
     __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
         destinationDescType);
 
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+
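For orientation, a note on the symbol scheme used by the declarations above: each `__tensorops_impl_matmul2d_op_run_single_thread_*` specialization encodes the left, right, and destination operands in order, each as an address-space tag (`dv` = device, `th` = thread; the cooperative load/store helpers later in this diff also use `tg` = threadgroup) followed by an element-type tag (`b16` = bfloat, `f16` = half, `f32` = float, `i8` = int8_t). The sketch below is plain host-side C++, not SDK code; it only composes a name to make the convention concrete.

```cpp
#include <iostream>
#include <string>

// Compose the specialization suffix for a (left, right, destination) triple.
// Tags follow the convention visible above: dv/th for the address space,
// b16/f16/f32/i8 for the element type.
std::string run_symbol(const std::string &lSpace, const std::string &lType,
                       const std::string &rSpace, const std::string &rType,
                       const std::string &dSpace, const std::string &dType) {
  return "__tensorops_impl_matmul2d_op_run_single_thread_" + lSpace + "_" +
         lType + "_" + rSpace + "_" + rType + "_" + dSpace + "_" + dType;
}

int main() {
  // device bfloat A, thread int8_t B, thread float C:
  std::cout << run_symbol("dv", "b16", "th", "i8", "th", "f32") << "\n";
  // -> __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32
}
```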
 template <__matmul2d_descriptor descriptor,
           __matmul2d_cooperative_operand_index operand_index, typename scope,
           typename left_operand, typename right_operand, typename element_type,
@@ -2652,9 +3029,10 @@
                 "only destination can be cooperative tensor");
   static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
                     __tensor_ops_detail::__is_same_v<element_type, half> ||
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
                     __tensor_ops_detail::__is_same_v<element_type, int32_t>,
                 "cooperative tensor data type can only be one of "
-                "float/half/int32_t");
+                "float/half/bfloat/int32_t");
 
   static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
   using element_t = element_type;
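The hunk above (and its rank-1 counterpart further down) widens the element-type constraint for cooperative destination tensors from float/half/int32_t to also admit bfloat. A minimal sketch of the constraint as it now reads, using hypothetical stand-in types and traits so it compiles outside the SDK:

```cpp
#include <cstdint>
#include <type_traits>

// Hypothetical stand-ins for the Metal half/bfloat types and the SDK's
// __tensor_ops_detail::__is_same_v trait.
struct half_t {};
struct bfloat_t {};
template <class A, class B> constexpr bool is_same_v = std::is_same_v<A, B>;

// The constraint as extended by this hunk: bfloat joins float/half/int32_t.
template <class T>
constexpr bool valid_cooperative_element =
    is_same_v<T, float> || is_same_v<T, half_t> ||
    is_same_v<T, bfloat_t> || is_same_v<T, std::int32_t>;

static_assert(valid_cooperative_element<bfloat_t>, "newly accepted");
static_assert(!valid_cooperative_element<std::int8_t>, "still rejected");

int main() {}
```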
@@ -2827,6 +3205,21 @@
         static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+            desc, storage, source, sourceDescType, sourceRank, leftDataType,
+            rightDataType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+            desc, storage, source, sourceDescType, sourceRank, leftDataType,
+            rightDataType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -2913,6 +3306,22 @@
         static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+            desc, storage, destination, destinationDescType, leftDataType,
+            rightDataType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+            desc, storage, destination, destinationDescType, leftDataType,
+            rightDataType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -3089,9 +3498,10 @@
 {
   static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
                     __tensor_ops_detail::__is_same_v<element_type, half> ||
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
                     __tensor_ops_detail::__is_same_v<element_type, int32_t>,
                 "cooperative tensor data type can only be one of "
-                "float/half/int32_t");
+                "float/half/bfloat/int32_t");
 
   static constant constexpr __tensor_ops_detail::__rank_t rank = 1;
   using element_t = element_type;
@@ -3259,6 +3669,19 @@
         static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+            desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+            desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -3339,6 +3762,20 @@
         static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+            desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+            desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -4460,6 +4897,552 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
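The hunk above adds the new bfloat single-thread dispatch cases: each `else if constexpr` branch matches the (left, right, destination) value types plus their device (dv) or thread (th) address spaces and forwards to the corresponding __tensorops_impl_matmul2d_op_run_single_thread_* entry point. The hunk below adds the matching cooperative cases, whose entry points take an extra `threads` argument and also accept threadgroup (tg) operands. The sketch here shows a kernel that the dispatch above would route to the dv_b16 / dv_i8 / dv_f32 case; it is illustrative only, assuming the matmul2d_descriptor and matmul2d types declared earlier in this header, and the descriptor arguments and the execution_thread tag are inferred from the single-thread entry-point names, not confirmed signatures.

    #include <metal_tensor>
    #include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
    using namespace metal;
    using namespace mpp::tensor_ops;

    // Hypothetical kernel: bfloat A, int8_t B, float C, all in device memory,
    // so the dispatch in the hunk above would select
    // __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32.
    kernel void bf16I8MatMulSketch(tensor<device bfloat, dextents<int32_t, 2>> A,
                                   tensor<device int8_t, dextents<int32_t, 2>> B,
                                   tensor<device float,  dextents<int32_t, 2>> C)
    {
      // 8x8x8 tile sizes are placeholders, not values mandated by this header.
      constexpr matmul2d_descriptor desc(8 /*M*/, 8 /*N*/, 8 /*K*/);
      // execution_thread is assumed to select the single-thread path backed by
      // the run_single_thread_* implementations added above.
      matmul2d<desc, execution_thread> op;
      op.run(A, B, C);
    }

With thread (th) operands instead of device (dv) ones, the same chain would land on a th_* variant; the tg_* variants only appear in the cooperative hunk that follows, since threadgroup operands imply more than one participating thread.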
@@ -5394,6 +6377,552 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
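
The hunks above and below reduce to one mechanical shape: an `if constexpr` chain first matches the (left, right, destination) element-type triple, then a nested chain matches the device/threadgroup placement of each tensor, and every leaf forwards to a precompiled entry point whose name encodes both choices (`dv`/`tg` for the address space; `f16`, `b16`, `i8`, `f32` for the element type). When the destination is a `cooperative_tensor` (the hunk that follows), it carries no address space of its own, so those chains branch only on the two operands: four leaves instead of eight. Below is a minimal host-side C++17 sketch of that shape, for illustration only; `addrspace`, `dispatch`, `dispatch_coop`, and the `run_*` stubs are hypothetical stand-ins, not MetalPerformancePrimitives API.

// Illustration only: a plain C++17 analogue of the header's dispatch.
// addrspace, dispatch, dispatch_coop, and the run_* stubs are hypothetical
// stand-ins; the real leaves are the __tensorops_impl_* symbols in the diff.
#include <cstdint>
#include <cstdio>
#include <type_traits>

struct bfloat16 {};                 // stand-in for the Metal bfloat type

enum class addrspace { device, threadgroup };

void run_dv_b16_dv_i8_dv_f32() { std::puts("b16*i8->f32, all device"); }
void run_tg_b16_dv_i8_dv_f32() { std::puts("b16*i8->f32, left in threadgroup"); }
void run_coop_dv_dv() { std::puts("cooperative dest, both operands device"); }
void run_coop_tg_dv() { std::puts("cooperative dest, left in threadgroup"); }
void run_coop_dv_tg() { std::puts("cooperative dest, right in threadgroup"); }
void run_coop_tg_tg() { std::puts("cooperative dest, both in threadgroup"); }

// Non-cooperative destinations: match the type triple, then all three
// address spaces (2 x 2 x 2 = 8 leaves; only two are shown here).
template <class Lhs, class Rhs, class Dst,
          addrspace LhsAS, addrspace RhsAS, addrspace DstAS>
void dispatch()
{
  if constexpr (std::is_same_v<Lhs, bfloat16> &&
                std::is_same_v<Rhs, int8_t> &&
                std::is_same_v<Dst, float>)
  {
    if constexpr (LhsAS == addrspace::device &&
                  RhsAS == addrspace::device &&
                  DstAS == addrspace::device)
      run_dv_b16_dv_i8_dv_f32();
    else if constexpr (LhsAS == addrspace::threadgroup &&
                       RhsAS == addrspace::device &&
                       DstAS == addrspace::device)
      run_tg_b16_dv_i8_dv_f32();
    // ...the remaining six address-space combinations go here...
  }
  // ...one else-if per supported type triple, then a static_assert...
}

// Cooperative destinations: the destination is per-thread state with no
// address space, so only the two operands are dispatched (2 x 2 = 4 leaves).
template <addrspace LhsAS, addrspace RhsAS>
void dispatch_coop()
{
  if constexpr (LhsAS == addrspace::device && RhsAS == addrspace::device)
    run_coop_dv_dv();
  else if constexpr (LhsAS == addrspace::threadgroup && RhsAS == addrspace::device)
    run_coop_tg_dv();
  else if constexpr (LhsAS == addrspace::device && RhsAS == addrspace::threadgroup)
    run_coop_dv_tg();
  else
    run_coop_tg_tg();
}

int main()
{
  dispatch<bfloat16, int8_t, float,
           addrspace::device, addrspace::device, addrspace::device>();
  dispatch_coop<addrspace::threadgroup, addrspace::device>();
}

Because the branching is `if constexpr`, each instantiation references exactly one entry point, and an unsupported combination fails at compile time via the chain-terminating `static_assert` (the header's `__assert_false_v`) rather than falling through at run time.
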
@@ -5881,6 +7410,272 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
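The ladder above repeats for every supported value-type combination; within each combination, the left/right address spaces pick one pre-compiled entry point at compile time. Below is a minimal standard-C++ sketch of that if-constexpr dispatch pattern. The names device_tag, threadgroup_tag, always_false_v, and the run_* functions are hypothetical stand-ins for the Metal address spaces and the __tensorops_impl_matmul2d_op_run_cooperative_* symbols; this is an illustration of the technique, not the framework's implementation.

// Hypothetical stand-ins for the Metal address spaces and entry points.
#include <cstdio>
#include <type_traits>

struct device_tag {};      // stands in for the `device` address space
struct threadgroup_tag {}; // stands in for the `threadgroup` address space

// Dependent-false helper so the static_assert only fires when reached.
template <class...> inline constexpr bool always_false_v = false;

void run_dv_dv() { std::puts("device/device kernel"); }
void run_tg_dv() { std::puts("threadgroup/device kernel"); }
void run_dv_tg() { std::puts("device/threadgroup kernel"); }
void run_tg_tg() { std::puts("threadgroup/threadgroup kernel"); }

template <class LeftSpace, class RightSpace>
void run_matmul2d()
{
    // Each branch is resolved at compile time, so only the selected
    // specialized implementation is instantiated -- the same shape as
    // the if-constexpr ladder in the header above.
    if constexpr (std::is_same_v<LeftSpace, device_tag> &&
                  std::is_same_v<RightSpace, device_tag>)
        run_dv_dv();
    else if constexpr (std::is_same_v<LeftSpace, threadgroup_tag> &&
                       std::is_same_v<RightSpace, device_tag>)
        run_tg_dv();
    else if constexpr (std::is_same_v<LeftSpace, device_tag> &&
                       std::is_same_v<RightSpace, threadgroup_tag>)
        run_dv_tg();
    else if constexpr (std::is_same_v<LeftSpace, threadgroup_tag> &&
                       std::is_same_v<RightSpace, threadgroup_tag>)
        run_tg_tg();
    else
        static_assert(always_false_v<LeftSpace>, "Unsupported address space");
}

int main()
{
    run_matmul2d<device_tag, threadgroup_tag>(); // selects run_dv_tg at compile time
}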
@@ -5938,6 +7733,9 @@
   else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
     __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
         desc, src, dst, identity, op, leftDataType, rightDataType);
+  else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+        desc, src, dst, identity, op, leftDataType, rightDataType);
   else
     static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
                   "Unsupported type");
@@ -5992,6 +7790,9 @@
   else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
     __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
         desc, src, dst, identity, op, leftDataType, rightDataType);
+  else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+        desc, src, dst, identity, op, leftDataType, rightDataType);
   else
     static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
                   "Unsupported type");
@@ -5999,8 +7800,8 @@
 
 template <class SrcElementType, class DstElementType, class SrcExtents, class DstExtents, class SrcLayout, class DstLayout>
 inline bool __is_iterator_compatible(
-    thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
-    thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
+    const thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
+    const thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
 {
   if (!SrcLayout::is_matmul2d_cooperative_destination_layout ||
       !DstLayout::is_matmul2d_reduction_cooperative_destination_layout ||
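The only change to __is_iterator_compatible is const-qualifying its thread references: the function is a pure layout query, and with const parameters it can now be called with const-qualified cooperative tensors as well as mutable ones. A minimal standard-C++ analogue of the fix, with CoopTensor as a hypothetical stand-in for metal::cooperative_tensor:

struct CoopTensor { int size; };

// Before: a non-const reference cannot bind to a const argument.
bool compatible_old(CoopTensor &a, CoopTensor &b) { return a.size == b.size; }

// After: const references accept const and non-const tensors alike and
// document that the query never mutates its operands.
bool compatible_new(const CoopTensor &a, const CoopTensor &b) { return a.size == b.size; }

int main()
{
    const CoopTensor src{16}, dst{16};
    // compatible_old(src, dst);  // ill-formed: const object, non-const reference
    return compatible_new(src, dst) ? 0 : 1;
}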
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h	2025-09-08 05:52:50
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h	2025-09-26 17:51:19
@@ -130,7 +130,7 @@
 template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>
 struct dynamic_length
 {
-  static constexpr constant T value = metal::numeric_limits<T>::max();
+    static constexpr constant T value = metal::numeric_limits<T>::max();
 };
 
 template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>