Merge branch 'index_type_refactoring'
Tobias Gysi committed Jun 22, 2020
2 parents d10cada + 6f64c56 commit 5245e0b
Showing 19 changed files with 508 additions and 659 deletions.
40 changes: 32 additions & 8 deletions CMakeLists.txt
@@ -16,14 +16,7 @@ endif()
project(oec-opt LANGUAGES CXX C)
include(CheckLanguage)

check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
else()
  message(SEND_ERROR
    "Building the GPU lowering of oec-opt requires CUDA")
endif()
find_library(CUDA_RUNTIME_LIBRARY cuda)
set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to")

# setup llvm lit
set(LLVM_LIT_ARGS "-sv" CACHE STRING "lit default options")
@@ -42,6 +35,7 @@ list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
include(TableGen)
include(AddLLVM)
include(AddMLIR)

include(HandleLLVMOptions)

include_directories(${LLVM_INCLUDE_DIRS})
@@ -54,6 +48,36 @@ add_definitions(${LLVM_DEFINITIONS})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")
include(sanitizers)

set(CUDA_BACKEND_ENABLED 1 CACHE BOOL "Enable building the oec CUDA backend")
set(ROCM_BACKEND_ENABLED 0 CACHE BOOL "Enable building the oec ROCM backend")
if(CUDA_BACKEND_ENABLED)
  add_definitions(-DCUDA_BACKEND_ENABLED)
endif()
if(ROCM_BACKEND_ENABLED)
  add_definitions(-DROCM_BACKEND_ENABLED)
endif()

if (CUDA_BACKEND_ENABLED)
  if (NOT ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD))
    message(SEND_ERROR "Building the oec CUDA backend requires NVPTX")
  endif()

  check_language(CUDA)
  if (CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
  else()
    message(SEND_ERROR "Building the oec CUDA backend requires CUDA")
  endif()
  find_library(CUDA_RUNTIME_LIBRARY cuda)
endif()
if (ROCM_BACKEND_ENABLED)
  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
    message(SEND_ERROR "Building the oec ROCM backend requires AMDGPU")
  endif()

  find_package(LLD REQUIRED CONFIG)
endif()

add_subdirectory(include)
add_subdirectory(lib)
add_subdirectory(test)
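The new backend toggles refuse to configure unless the matching LLVM code generation targets were built. A quick way to check what an existing LLVM install provides is llvm-config (a sketch; assumes the llvm-config from that install is on your PATH):

```sh
# List the code generation targets registered in the LLVM build.
# Expect NVPTX for the CUDA backend and AMDGPU for the ROCM backend.
llvm-config --targets-built
```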
10 changes: 6 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
# The Stencil Dialect

Development repository for the Open Earth Compiler. The repository depends on a build of LLVM that includes MLIR. The OEC build has been tested with LLVM commit c9f63297e24.
Development repository for the Open Earth Compiler. The repository depends on a build of LLVM that includes MLIR. The OEC build has been tested with LLVM commit 16cc759ebd5.


## Build Instructions
@@ -11,6 +11,10 @@ mkdir build && cd build
cmake -G Ninja .. -DMLIR_DIR=$PREFIX/lib/cmake/mlir -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit
cmake --build . --target check-oec-opt
```
The ROCM_BACKEND_ENABLED flag enables support for AMDGPU targets. It requires an LLVM build that includes lld; set the path to lld with the following flag:
```sh
-DLLD_DIR=$PREFIX/lib/cmake/lld
```
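For example, a complete configure invocation with the ROCM backend enabled might look like this sketch (it reuses the $PREFIX and $BUILD_DIR placeholders from above):

```sh
cmake -G Ninja .. -DMLIR_DIR=$PREFIX/lib/cmake/mlir -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit -DROCM_BACKEND_ENABLED=1 -DLLD_DIR=$PREFIX/lib/cmake/lld
```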
To build the documentation from the TableGen description of the dialect operations, run
```sh
cmake --build . --target mlir-doc
@@ -20,9 +24,7 @@ cmake --build . --target mlir-doc
# LLVM Build Instructions

CMake configuration for LLVM:

```sh
cmake -G Ninja ../llvm -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_TARGETS_TO_BUILD="host;NVPTX" -DCMAKE_INSTALL_PREFIX=<install_root> -DLLVM_ENABLE_PROJECTS='mlir' -DLLVM_OPTIMIZED_TABLEGEN=ON -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INSTALL_UTILS=ON -DMLIR_CUDA_RUNNER_ENABLED=ON -DCMAKE_CUDA_COMPILER=<path_to_nvcc> -DCMAKE_LINKER=<path_to_lld> -DLLVM_PARALLEL_LINK_JOBS=2
cmake -G Ninja ../llvm -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" -DCMAKE_INSTALL_PREFIX=<install_root> -DLLVM_ENABLE_PROJECTS='mlir;lld' -DLLVM_OPTIMIZED_TABLEGEN=ON -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INSTALL_UTILS=ON -DMLIR_CUDA_RUNNER_ENABLED=ON -DCMAKE_CUDA_COMPILER=<path_to_nvcc> -DCMAKE_LINKER=<path_to_lld> -DLLVM_PARALLEL_LINK_JOBS=2
```

Do not forget to apply any patches to LLVM before compiling (the patches are located in stencil-dialect/patches).
13 changes: 4 additions & 9 deletions include/Conversion/LoopsToCUDA/Passes.h
@@ -1,9 +1,9 @@
#ifndef CONVERSION_LOOPSTOCUDA_PASSES_H
#define CONVERSION_LOOPSTOCUDA_PASSES_H

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>
#include <string>
@@ -12,16 +12,11 @@ namespace mlir {

class Pass;

std::unique_ptr<Pass> createLaunchFuncToCUDACallsPass();

std::unique_ptr<Pass> createStencilIndexOptimizationPass();

std::unique_ptr<OperationPass<FuncOp>> createStencilLoopMappingPass();

OwnedCubin compilePtxToCubin(const std::string &ptx, Location loc,
                             StringRef name);
std::unique_ptr<Pass>
createLaunchFuncToRuntimeCallsPass(StringRef gpuBinaryAnnotation = "");

void registerGPUToCUBINPipeline();
void registerGPUToHSACOPipeline();

} // namespace mlir

22 changes: 6 additions & 16 deletions include/Conversion/LoopsToCUDA/Passes.td
@@ -3,23 +3,13 @@

include "mlir/Pass/PassBase.td"

def LaunchFuncToCUDACallsPass : Pass<"stencil-gpu-to-cuda", "ModuleOp"> {
  let summary = "Convert all kernel launches to CUDA runtime calls";
  let constructor = "mlir::createLaunchFuncToCUDACallsPass()";
}

def StencilIndexOptimizationPass : Pass<"stencil-index-optimization", "LLVM::LLVMFuncOp"> {
  let summary = "Convert 64-bit index computations to 32-bit index computations";
  let constructor = "mlir::createStencilIndexOptimizationPass()";
}

def StencilLoopMappingPass : FunctionPass<"stencil-loop-mapping"> {
  let summary = "Map parallel loops to blocks and threads";
  let constructor = "mlir::createStencilLoopMappingPass()";
def LaunchFuncToRuntimeCallsPass : Pass<"stencil-gpu-to-runtime", "ModuleOp"> {
  let summary = "Convert all kernel launches to GPU runtime calls";
  let constructor = "mlir::createLaunchFuncToRuntimeCallsPass()";
  let options = [
    ListOption<"blockSizes", "block-sizes", "int64_t",
               "Block sizes used for the mapping",
               "llvm::cl::OneOrMore, llvm::cl::MiscFlags::CommaSeparated">,
    Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
           "\"nvvm.cubin\"",
           "Annotation attribute string for GPU binary">,
  ];
}
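Assuming the renamed pass is registered in oec-opt, it can be exercised directly from the command line; a hypothetical invocation (input.mlir is a placeholder) overriding the default annotation might be:

```sh
# Lower gpu.launch_func ops to GPU runtime calls, tagging binaries
# with the nvvm.cubin annotation attribute.
oec-opt --stencil-gpu-to-runtime="gpu-binary-annotation=nvvm.cubin" input.mlir
```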

47 changes: 42 additions & 5 deletions lib/Conversion/LoopsToCUDA/CMakeLists.txt
@@ -1,14 +1,51 @@
add_mlir_dialect_library(GPUtoCUDATransforms
  ConvertLaunchFuncToCUDACalls.cpp
if(CUDA_BACKEND_ENABLED)
  set(NVPTX_LIBS
    NVPTXCodeGen
    NVPTXDesc
    NVPTXInfo
  )
  set(NVVMIR_LIB
    MLIRNVVMIR
    MLIRTargetNVVMIR
  )
endif()
if(ROCM_BACKEND_ENABLED)
  set(AMDGPU_LIBS
    AMDGPUAsmParser
    AMDGPUCodeGen
    AMDGPUDesc
    AMDGPUInfo
  )
  set(ROCDLIR_LIB
    lldCommon
    lldDriver
    lldELF
    MLIRROCDLIR
    MLIRTargetROCDLIR
  )
endif()

add_mlir_dialect_library(GPUToKernelAndRuntimeCalls
  ConvertLaunchFuncToRuntimeCalls.cpp
  ConvertKernelFuncToCubin.cpp
  StencilIndexOptimizationPass.cpp
  StencilLoopMappingPass.cpp
  ConvertKernelFuncToHsaco.cpp

  ADDITIONAL_HEADER_DIRS
  ${PROJECT_SOURCE_DIR}/include/Conversion/LoopsToCUDA

  LINK_COMPONENTS
  Core
  MC
  MCParser
  ${NVPTX_LIBS}
  ${AMDGPU_LIBS}

  LINK_LIBS PUBLIC
  ${NVVMIR_LIB}
  ${ROCDLIR_LIB}

  DEPENDS
  MLIRLoopsToCUDAPassIncGen
)

target_link_libraries(GPUtoCUDATransforms PUBLIC MLIRIR)
target_link_libraries(GPUToKernelAndRuntimeCalls PUBLIC MLIRIR)
39 changes: 24 additions & 15 deletions lib/Conversion/LoopsToCUDA/ConvertKernelFuncToCubin.cpp
@@ -1,5 +1,5 @@
#include "Conversion/LoopsToCUDA/Passes.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
@@ -9,8 +9,11 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Target/NVVMIR.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/Support/TargetSelect.h"

#ifdef CUDA_BACKEND_ENABLED
#include "cuda.h"

using namespace mlir;
@@ -35,8 +38,8 @@ inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
} \
}

OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
                                   StringRef name) {
static OwnedBlob compilePtxToCubin(const std::string &ptx, Location loc,
                                   StringRef name) {
  char jitErrorBuffer[4096] = {0};

  RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -75,7 +78,7 @@ OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
"cuLinkComplete");

char *cubinAsChar = static_cast<char *>(cubinData);
OwnedCubin result =
OwnedBlob result =
std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);

// This will also destroy the cubin data.
@@ -87,21 +90,27 @@ OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
namespace mlir {
void registerGPUToCUBINPipeline() {
  PassPipelineRegistration<>(
      "stencil-gpu-to-cubin", "Lowering of stencil kernels to cubins",
      "stencil-kernel-to-cubin", "Lower kernels to cubin",
      [](OpPassManager &pm) {
        // Initialize LLVM NVPTX backend.
        LLVMInitializeNVPTXTarget();
        LLVMInitializeNVPTXTargetInfo();
        LLVMInitializeNVPTXTargetMC();
        LLVMInitializeNVPTXAsmPrinter();
        // Define the bitwidth
        pm.addPass(createGpuKernelOutliningPass());
        auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
        kernelPm.addPass(createStripDebugInfoPass());
        kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
        kernelPm.addPass(createStencilIndexOptimizationPass());
        kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
        // TODO set appropriate bitwidth
        LowerToLLVMOptions llvmOptions;
        llvmOptions.emitCWrappers = true;
        llvmOptions.useAlignedAlloc = false;
        llvmOptions.useBarePtrCallConv = false;
        llvmOptions.indexBitwidth = kDeriveIndexBitwidthFromDataLayout;
        pm.addPass(createLowerToLLVMPass(llvmOptions));
        kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass(32));
        kernelPm.addPass(createConvertGPUKernelToBlobPass(
            translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
            "sm_35", "+ptx60", "nvvm.cubin"));
        pm.addPass(createLowerToLLVMPass({/* useBarePtrCallConv */ false,
                                          /* emitCWrappers */ true,
                                          /* indexBitwidth */ 32,
                                          /* useAlignedAlloc */ false}));
        pm.addPass(createLaunchFuncToRuntimeCallsPass());
      });
}
} // namespace mlir
#endif
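Once registered, the whole CUDA lowering can be driven as a single pipeline; a hypothetical run (the input file name is a placeholder) might be:

```sh
# Outline GPU kernels, lower them to NVVM and a cubin blob, then lower
# the host code to LLVM with 32-bit indices and GPU runtime calls.
oec-opt --stencil-kernel-to-cubin laplace.mlir
```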
