epic, backend: (risc-v) Lowering NsNet2 IREE kernels #2740

Open · 4 of 6 tasks

zero9178 opened this issue Jun 17, 2024 · 0 comments

NsNet2, as processed by our IREE backend (https://github.com/opencompl/Quidditch), currently produces 6 different kernels.
This epic documents the state of compiling each of them, together with the input IR. The kernels are ordered by the percentage of cycles they account for when executed with the LLVM backend.

  • main$async_dispatch_1_matmul_transpose_b_1x1200x400_f32 (48.71% of all cycles)
IR
func.func @main$async_dispatch_1_matmul_transpose_b_1x1200x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x1200xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: ?>>, %arg2: memref<1200x400xf64, strided<[400, 1], offset: ?>>, %arg3: memref<1x1200xf64, strided<[1200, 1], offset: ?>>, %arg4: memref<1x1200xf64, strided<[1200, 1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: ?>>, memref<1200x400xf64, strided<[400, 1], offset: ?>>) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x1200xf64>, memref<1x1200xf64, strided<[1200, 1], offset: ?>>) outs(%arg4 : memref<1x1200xf64, strided<[1200, 1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
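For reference, below is a minimal NumPy sketch of what this dispatch computes, as read from the IR above; the function and argument names are hypothetical and only meant to mirror the three linalg.generic ops (zero-fill, matmul with transposed RHS, bias add).

import numpy as np

def dispatch_1_reference(lhs, rhs_t, bias):
    # Hypothetical reference for the IR above; shapes follow the memref types:
    # lhs: (1, 400), rhs_t: (1200, 400), bias: (1, 1200) -> out: (1, 1200).
    acc = np.zeros((1, 1200), dtype=np.float64)  # first generic: fill %arg0 with 0.0
    acc += lhs @ rhs_t.T                         # second generic: matmul with transposed RHS
    return acc + bias                            # third generic: elementwise add into %arg4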
  • main$async_dispatch_9_matmul_transpose_b_1x161x600_f32 (27.50% of all cycles)
    Needs support for: math.exp
IR
func.func @main$async_dispatch_9_matmul_transpose_b_1x161x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x161xf64>, %arg1: memref<1x600xf64, strided<[600, 1], offset: 600>>, %arg2: memref<161x600xf64, strided<[600, 1], offset: 2590800>>, %arg3: memref<1x161xf64, strided<[161, 1], offset: 2687400>>, %arg4: memref<1x161xf64>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64, strided<[600, 1], offset: 600>>, memref<161x600xf64, strided<[600, 1], offset: 2590800>>) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.mulf %in, %in_1 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x161xf64>, memref<1x161xf64, strided<[161, 1], offset: 2687400>>) outs(%arg4 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.addf %in, %in_1 : f64
    %1 = arith.negf %0 : f64
    %2 = math.exp %1 : f64
    %3 = arith.addf %2, %cst_0 : f64
    %4 = arith.divf %cst_0, %3 : f64
    linalg.yield %4 : f64
  }
  return
}
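As a sketch of why math.exp is needed here: the final linalg.generic applies a sigmoid (negf, math.exp, addf with 1.0, divf) on top of the matmul result. A hypothetical NumPy reference, with shapes taken from the memref types above:

import numpy as np

def dispatch_9_reference(lhs, rhs_t, bias):
    # lhs: (1, 600), rhs_t: (161, 600), bias: (1, 161) -> out: (1, 161).
    acc = lhs @ rhs_t.T                  # zero-fill + reduction generics
    pre = acc + bias                     # addf in the final generic
    return 1.0 / (1.0 + np.exp(-pre))    # negf, math.exp, addf %cst_0, divf -> sigmoid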
  • main$async_dispatch_8_matmul_transpose_b_1x600x600_f32 (8.85% of all cycles)
IR
func.func @main$async_dispatch_8_matmul_transpose_b_1x600x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x600xf64>, %arg2: memref<600x600xf64, strided<[600, 1], offset: 2230200>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2590200>>, %arg4: memref<1x600xf64, strided<[600, 1], offset: 600>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64>, memref<600x600xf64, strided<[600, 1], offset: 2230200>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2590200>>) outs(%arg4 : memref<1x600xf64, strided<[600, 1], offset: 600>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
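A hypothetical NumPy reference for this dispatch, as read from the IR above: the trailing generic adds the bias and clamps with arith.maximumf against 0.0, i.e. a ReLU.

import numpy as np

def dispatch_8_reference(lhs, rhs_t, bias):
    # lhs: (1, 600), rhs_t: (600, 600), bias: (1, 600) -> out: (1, 600).
    acc = lhs @ rhs_t.T                  # zero-fill + reduction generics
    return np.maximum(acc + bias, 0.0)   # addf + arith.maximumf with the 0.0 constant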
  • main$async_dispatch_7_matmul_transpose_b_1x600x400_f32 (5.89% of all cycles)
IR
func.func @main$async_dispatch_7_matmul_transpose_b_1x600x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: 400>>, %arg2: memref<600x400xf64, strided<[400, 1], offset: 1989600>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2229600>>, %arg4: memref<1x600xf64>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: 400>>, memref<600x400xf64, strided<[400, 1], offset: 1989600>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2229600>>) outs(%arg4 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
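Dispatch 7 follows the same zero-fill / matmul_transpose_b / bias-plus-ReLU structure as dispatch 8 above; the dispatch_8_reference sketch applies unchanged, only with lhs: (1, 400), rhs_t: (600, 400), bias: (1, 600).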
  • main$async_dispatch_0_matmul_transpose_b_1x400x161_f32 (1.62% of all cycles)
IR
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel0(%arg0: memref<1x50xf64>) {
  %cst = arith.constant 0.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x50xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  return
}

func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel1(%arg0: memref<1x161xf64>, %arg1: memref<50x161xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x161xf64>, memref<50x161xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  return
}

func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel2(%arg0: memref<1x50xf64>, %arg1: memref<1x50xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x50xf64>, memref<1x50xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
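Unlike the other dispatches, this one is already split into three separate xDSL kernels (zero-fill, matmul with transposed RHS, elementwise add). A hypothetical NumPy sketch of the three chained together, assuming kernel2's second input plays the same bias role as in the dispatches above:

import numpy as np

def dispatch_0_reference(lhs, rhs_t, bias):
    # lhs: (1, 161), rhs_t: (50, 161), bias: (1, 50) -> out: (1, 50).
    acc = np.zeros((1, 50), dtype=np.float64)  # xdsl_kernel0: fill with 0.0
    acc += lhs @ rhs_t.T                       # xdsl_kernel1: matmul with transposed RHS
    return acc + bias                          # xdsl_kernel2: elementwise add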
  • main$async_dispatch_3_elementwise_400_f32 (1.26% of all cycles)
    Needs support for: dynamic offsets in MemRef, math.exp, and math.tanh
IR
func.func @main$async_dispatch_3_elementwise_400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<400xf64, strided<[1], offset: ?>>, %arg1: memref<400xf64, strided<[1], offset: ?>>, %arg2: memref<400xf64, strided<[1], offset: ?>>, %arg3: memref<400xf64, strided<[1], offset: ?>>, %arg4: memref<400xf64, strided<[1], offset: ?>>, %arg5: memref<400xf64, strided<[1], offset: ?>>, %arg6: memref<400xf64, strided<[1], offset: ?>>, %arg7: memref<400xf64, strided<[1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  %cst = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>) outs(%arg7 : memref<400xf64, strided<[1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %in_1: f64, %in_2: f64, %in_3: f64, %in_4: f64, %in_5: f64, %out: f64):
    %0 = arith.addf %in_4, %in_5 : f64
    %1 = arith.addf %in_2, %in_3 : f64
    %2 = arith.negf %1 : f64
    %3 = math.exp %2 : f64
    %4 = arith.addf %3, %cst : f64
    %5 = arith.divf %cst, %4 : f64
    %6 = arith.mulf %in_1, %5 : f64
    %7 = arith.addf %in_0, %6 : f64
    %8 = math.tanh %7 : f64
    %9 = arith.negf %0 : f64
    %10 = math.exp %9 : f64
    %11 = arith.addf %10, %cst : f64
    %12 = arith.divf %cst, %11 : f64
    %13 = arith.subf %in, %8 : f64
    %14 = arith.mulf %13, %12 : f64
    %15 = arith.addf %14, %8 : f64
    linalg.yield %15 : f64
  }
  return
}
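For reference, a hypothetical NumPy reading of this elementwise kernel (arguments listed in ins() order %in .. %in_5, all of shape (400,)). It needs both math.exp and math.tanh because it chains two sigmoids with a tanh in what looks like a GRU-style gated update; the variable names below are only an interpretation:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dispatch_3_reference(h, c0, c1, r0, r1, z0, z1):
    # h = %in, c0 = %in_0, c1 = %in_1, r0 = %in_2, r1 = %in_3, z0 = %in_4, z1 = %in_5
    r = sigmoid(r0 + r1)                 # %1..%5: addf, negf, math.exp, addf 1.0, divf
    h_tilde = np.tanh(c0 + c1 * r)       # %6..%8: mulf, addf, math.tanh
    z = sigmoid(z0 + z1)                 # %0 and %9..%12
    return (h - h_tilde) * z + h_tilde   # %13..%15: subf, mulf, addf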