epic, backend: (risc-v) Lowering NsNet2 IREE kernels #2740

Open · 4 of 6 tasks

zero9178 opened this issue Jun 17, 2024 · 0 comments

NsNet2, as processed by our IREE backend (https://github.com/opencompl/Quidditch), currently produces 6 different kernels.
This epic documents the state of compiling each of them, together with the input IR. The kernels are ordered by the percentage of cycles they account for when executed with the LLVM backend.

  • main$async_dispatch_1_matmul_transpose_b_1x1200x400_f32 (48.71% of all cycles)
IR
func.func @main$async_dispatch_1_matmul_transpose_b_1x1200x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x1200xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: ?>>, %arg2: memref<1200x400xf64, strided<[400, 1], offset: ?>>, %arg3: memref<1x1200xf64, strided<[1200, 1], offset: ?>>, %arg4: memref<1x1200xf64, strided<[1200, 1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: ?>>, memref<1200x400xf64, strided<[400, 1], offset: ?>>) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x1200xf64>, memref<1x1200xf64, strided<[1200, 1], offset: ?>>) outs(%arg4 : memref<1x1200xf64, strided<[1200, 1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
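For reference, below is a minimal NumPy sketch of what this dispatch computes, as read from the IR above; the function and argument names are hypothetical and only meant to mirror the three linalg.generic ops (zero-fill, matmul with transposed RHS, bias add).

import numpy as np

def dispatch_1_reference(lhs, rhs_t, bias):
    # Hypothetical reference for the IR above; shapes follow the memref types:
    # lhs: (1, 400), rhs_t: (1200, 400), bias: (1, 1200) -> out: (1, 1200).
    acc = np.zeros((1, 1200), dtype=np.float64)  # first generic: fill %arg0 with 0.0
    acc += lhs @ rhs_t.T                         # second generic: matmul with transposed RHS
    return acc + bias                            # third generic: elementwise add into %arg4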
  • main$async_dispatch_9_matmul_transpose_b_1x161x600_f32 (27.50% of all cycles)
    Needs support for: math.exp
IR
func.func @main$async_dispatch_9_matmul_transpose_b_1x161x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x161xf64>, %arg1: memref<1x600xf64, strided<[600, 1], offset: 600>>, %arg2: memref<161x600xf64, strided<[600, 1], offset: 2590800>>, %arg3: memref<1x161xf64, strided<[161, 1], offset: 2687400>>, %arg4: memref<1x161xf64>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64, strided<[600, 1], offset: 600>>, memref<161x600xf64, strided<[600, 1], offset: 2590800>>) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.mulf %in, %in_1 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x161xf64>, memref<1x161xf64, strided<[161, 1], offset: 2687400>>) outs(%arg4 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.addf %in, %in_1 : f64
    %1 = arith.negf %0 : f64
    %2 = math.exp %1 : f64
    %3 = arith.addf %2, %cst_0 : f64
    %4 = arith.divf %cst_0, %3 : f64
    linalg.yield %4 : f64
  }
  return
}
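As a sketch of why math.exp is needed here: the final linalg.generic applies a sigmoid (negf, math.exp, addf with 1.0, divf) on top of the matmul result. A hypothetical NumPy reference, with shapes taken from the memref types above:

import numpy as np

def dispatch_9_reference(lhs, rhs_t, bias):
    # lhs: (1, 600), rhs_t: (161, 600), bias: (1, 161) -> out: (1, 161).
    acc = lhs @ rhs_t.T                  # zero-fill + reduction generics
    pre = acc + bias                     # addf in the final generic
    return 1.0 / (1.0 + np.exp(-pre))    # negf, math.exp, addf %cst_0, divf -> sigmoid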
  • main$async_dispatch_8_matmul_transpose_b_1x600x600_f32 (8.85% of all cycles)
IR
func.func @main$async_dispatch_8_matmul_transpose_b_1x600x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x600xf64>, %arg2: memref<600x600xf64, strided<[600, 1], offset: 2230200>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2590200>>, %arg4: memref<1x600xf64, strided<[600, 1], offset: 600>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64>, memref<600x600xf64, strided<[600, 1], offset: 2230200>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2590200>>) outs(%arg4 : memref<1x600xf64, strided<[600, 1], offset: 600>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
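A hypothetical NumPy reference for this dispatch, as read from the IR above: the trailing generic adds the bias and clamps with arith.maximumf against 0.0, i.e. a ReLU.

import numpy as np

def dispatch_8_reference(lhs, rhs_t, bias):
    # lhs: (1, 600), rhs_t: (600, 600), bias: (1, 600) -> out: (1, 600).
    acc = lhs @ rhs_t.T                  # zero-fill + reduction generics
    return np.maximum(acc + bias, 0.0)   # addf + arith.maximumf with the 0.0 constant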
  • main$async_dispatch_7_matmul_transpose_b_1x600x400_f32 (5.89% of all cycles)
IR
func.func @main$async_dispatch_7_matmul_transpose_b_1x600x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: 400>>, %arg2: memref<600x400xf64, strided<[400, 1], offset: 1989600>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2229600>>, %arg4: memref<1x600xf64>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: 400>>, memref<600x400xf64, strided<[400, 1], offset: 1989600>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2229600>>) outs(%arg4 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
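Dispatch 7 follows the same zero-fill / matmul_transpose_b / bias-plus-ReLU structure as dispatch 8 above; the dispatch_8_reference sketch applies unchanged, only with lhs: (1, 400), rhs_t: (600, 400), bias: (1, 600).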
  • main$async_dispatch_0_matmul_transpose_b_1x400x161_f32 (1.62% of all cycles)
IR
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel0(%arg0: memref<1x50xf64>) {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x50xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  return
}

func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel1(%arg0: memref<1x161xf64>, %arg1: memref<50x161xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x161xf64>, memref<50x161xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  return
}

func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel2(%arg0: memref<1x50xf64>, %arg1: memref<1x50xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x50xf64>, memref<1x50xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
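Unlike the other dispatches, this one is already split into three separate xDSL kernels (zero-fill, matmul with transposed RHS, elementwise add). A hypothetical NumPy sketch of the three chained together, assuming kernel2's second input plays the same bias role as in the dispatches above:

import numpy as np

def dispatch_0_reference(lhs, rhs_t, bias):
    # lhs: (1, 161), rhs_t: (50, 161), bias: (1, 50) -> out: (1, 50).
    acc = np.zeros((1, 50), dtype=np.float64)  # xdsl_kernel0: fill with 0.0
    acc += lhs @ rhs_t.T                       # xdsl_kernel1: matmul with transposed RHS
    return acc + bias                          # xdsl_kernel2: elementwise add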
  • main$async_dispatch_3_elementwise_400_f32 (1.26% of all cycles)
    Needs support for: dynamic offsets in MemRef, math.exp, and math.tanh
IR
func.func @main$async_dispatch_3_elementwise_400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<400xf64, strided<[1], offset: ?>>, %arg1: memref<400xf64, strided<[1], offset: ?>>, %arg2: memref<400xf64, strided<[1], offset: ?>>, %arg3: memref<400xf64, strided<[1], offset: ?>>, %arg4: memref<400xf64, strided<[1], offset: ?>>, %arg5: memref<400xf64, strided<[1], offset: ?>>, %arg6: memref<400xf64, strided<[1], offset: ?>>, %arg7: memref<400xf64, strided<[1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>) outs(%arg7 : memref<400xf64, strided<[1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %in_1: f64, %in_2: f64, %in_3: f64, %in_4: f64, %in_5: f64, %out: f64):
    %0 = arith.addf %in_4, %in_5 : f64
    %1 = arith.addf %in_2, %in_3 : f64
    %2 = arith.negf %1 : f64
    %3 = math.exp %2 : f64
    %4 = arith.addf %3, %cst : f64
    %5 = arith.divf %cst, %4 : f64
    %6 = arith.mulf %in_1, %5 : f64
    %7 = arith.addf %in_0, %6 : f64
    %8 = math.tanh %7 : f64
    %9 = arith.negf %0 : f64
    %10 = math.exp %9 : f64
    %11 = arith.addf %10, %cst : f64
    %12 = arith.divf %cst, %11 : f64
    %13 = arith.subf %in, %8 : f64
    %14 = arith.mulf %13, %12 : f64
    %15 = arith.addf %14, %8 : f64
    linalg.yield %15 : f64
  }
  return
}
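For reference, a hypothetical NumPy reading of this elementwise kernel (arguments listed in ins() order %in .. %in_5, all of shape (400,)). It needs both math.exp and math.tanh because it chains two sigmoids with a tanh in what looks like a GRU-style gated update; the variable names below are only an interpretation:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dispatch_3_reference(h, c0, c1, r0, r1, z0, z1):
    # h = %in, c0 = %in_0, c1 = %in_1, r0 = %in_2, r1 = %in_3, z0 = %in_4, z1 = %in_5
    r = sigmoid(r0 + r1)                 # %1..%5: addf, negf, math.exp, addf 1.0, divf
    h_tilde = np.tanh(c0 + c1 * r)       # %6..%8: mulf, addf, math.tanh
    z = sigmoid(z0 + z1)                 # %0 and %9..%12
    return (h - h_tilde) * z + h_tilde   # %13..%15: subf, mulf, addf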