Merge branch 'index_type_refactoring'
Tobias Gysi committed Jun 22, 2020
2 parents d10cada + 6f64c56 commit 5245e0b
Showing 19 changed files with 508 additions and 659 deletions.
40 changes: 32 additions & 8 deletions CMakeLists.txt
@@ -16,14 +16,7 @@ endif()
project(oec-opt LANGUAGES CXX C)
include(CheckLanguage)

check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
else()
  message(SEND_ERROR
    "Building the GPU lowering of oec-opt requires CUDA")
endif()
find_library(CUDA_RUNTIME_LIBRARY cuda)
set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to")

# setup llvm lit
set(LLVM_LIT_ARGS "-sv" CACHE STRING "lit default options")
@@ -42,6 +35,7 @@ list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
include(TableGen)
include(AddLLVM)
include(AddMLIR)

include(HandleLLVMOptions)

include_directories(${LLVM_INCLUDE_DIRS})
@@ -54,6 +48,36 @@ add_definitions(${LLVM_DEFINITIONS})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")
include(sanitizers)

set(CUDA_BACKEND_ENABLED 1 CACHE BOOL "Enable building the oec CUDA backend")
set(ROCM_BACKEND_ENABLED 0 CACHE BOOL "Enable building the oec ROCM backend")
if(CUDA_BACKEND_ENABLED)
  add_definitions(-DCUDA_BACKEND_ENABLED)
endif()
if(ROCM_BACKEND_ENABLED)
  add_definitions(-DROCM_BACKEND_ENABLED)
endif()

if (CUDA_BACKEND_ENABLED)
  if (NOT ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD))
    message(SEND_ERROR "Building the oec CUDA backend requires NVPTX")
  endif()

  check_language(CUDA)
  if (CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
  else()
    message(SEND_ERROR "Building the oec CUDA backend requires CUDA")
  endif()
  find_library(CUDA_RUNTIME_LIBRARY cuda)
endif()
if (ROCM_BACKEND_ENABLED)
  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
    message(SEND_ERROR "Building the oec ROCM backend requires AMDGPU")
  endif()

  find_package(LLD REQUIRED CONFIG)
endif()

add_subdirectory(include)
add_subdirectory(lib)
add_subdirectory(test)
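The new backend toggles refuse to configure unless the matching LLVM code generation targets were built. A quick way to check what an existing LLVM install provides is llvm-config (a sketch; assumes the llvm-config from that install is on your PATH):

```sh
# List the code generation targets registered in the LLVM build.
# Expect NVPTX for the CUDA backend and AMDGPU for the ROCM backend.
llvm-config --targets-built
```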
10 changes: 6 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
# The Stencil Dialect

Development repository for the Open Earth Compiler. The repository depends on a build of LLVM that includes MLIR. The OEC build has been tested with LLVM commit c9f63297e24.
Development repository for the Open Earth Compiler. The repository depends on a build of LLVM that includes MLIR. The OEC build has been tested with LLVM commit 16cc759ebd5.


## Build Instructions
@@ -11,6 +11,10 @@ mkdir build && cd build
cmake -G Ninja .. -DMLIR_DIR=$PREFIX/lib/cmake/mlir -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit
cmake --build . --target check-oec-opt
```
The ROCM_BACKEND_ENABLED flag enables support for AMDGPU targets. It requires an LLVM build that includes lld; set the path to lld with the following flag:
```sh
-DLLD_DIR=$PREFIX/lib/cmake/lld
```
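For example, a complete configure invocation with the ROCM backend enabled might look like this sketch (it reuses the $PREFIX and $BUILD_DIR placeholders from above):

```sh
cmake -G Ninja .. -DMLIR_DIR=$PREFIX/lib/cmake/mlir -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit -DROCM_BACKEND_ENABLED=1 -DLLD_DIR=$PREFIX/lib/cmake/lld
```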
To build the documentation from the TableGen description of the dialect operations, run
```sh
cmake --build . --target mlir-doc
@@ -20,9 +24,7 @@ cmake --build . --target mlir-doc
# LLVM Build Instructions

CMake configuration for LLVM:

```sh
cmake -G Ninja ../llvm -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_TARGETS_TO_BUILD="host;NVPTX" -DCMAKE_INSTALL_PREFIX=<install_root> -DLLVM_ENABLE_PROJECTS='mlir' -DLLVM_OPTIMIZED_TABLEGEN=ON -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INSTALL_UTILS=ON -DMLIR_CUDA_RUNNER_ENABLED=ON -DCMAKE_CUDA_COMPILER=<path_to_nvcc> -DCMAKE_LINKER=<path_to_lld> -DLLVM_PARALLEL_LINK_JOBS=2
cmake -G Ninja ../llvm -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" -DCMAKE_INSTALL_PREFIX=<install_root> -DLLVM_ENABLE_PROJECTS='mlir;lld' -DLLVM_OPTIMIZED_TABLEGEN=ON -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INSTALL_UTILS=ON -DMLIR_CUDA_RUNNER_ENABLED=ON -DCMAKE_CUDA_COMPILER=<path_to_nvcc> -DCMAKE_LINKER=<path_to_lld> -DLLVM_PARALLEL_LINK_JOBS=2
```

Do not forget to apply any patches to LLVM before compiling (the patches are located in stencil-dialect/patches).
13 changes: 4 additions & 9 deletions include/Conversion/LoopsToCUDA/Passes.h
@@ -1,9 +1,9 @@
#ifndef CONVERSION_LOOPSTOCUDA_PASSES_H
#define CONVERSION_LOOPSTOCUDA_PASSES_H

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>
#include <string>
@@ -12,16 +12,11 @@ namespace mlir {

class Pass;

std::unique_ptr<Pass> createLaunchFuncToCUDACallsPass();

std::unique_ptr<Pass> createStencilIndexOptimizationPass();

std::unique_ptr<OperationPass<FuncOp>> createStencilLoopMappingPass();

OwnedCubin compilePtxToCubin(const std::string &ptx, Location loc,
                             StringRef name);
std::unique_ptr<Pass>
createLaunchFuncToRuntimeCallsPass(StringRef gpuBinaryAnnotation = "");

void registerGPUToCUBINPipeline();
void registerGPUToHSACOPipeline();

} // namespace mlir

22 changes: 6 additions & 16 deletions include/Conversion/LoopsToCUDA/Passes.td
@@ -3,23 +3,13 @@

include "mlir/Pass/PassBase.td"

def LaunchFuncToCUDACallsPass : Pass<"stencil-gpu-to-cuda", "ModuleOp"> {
  let summary = "Convert all kernel launches to CUDA runtime calls";
  let constructor = "mlir::createLaunchFuncToCUDACallsPass()";
}

def StencilIndexOptimizationPass : Pass<"stencil-index-optimization", "LLVM::LLVMFuncOp"> {
  let summary = "Convert 64-bit index computations to 32-bit index computations";
  let constructor = "mlir::createStencilIndexOptimizationPass()";
}

def StencilLoopMappingPass : FunctionPass<"stencil-loop-mapping"> {
  let summary = "Map parallel loops to blocks and threads";
  let constructor = "mlir::createStencilLoopMappingPass()";
def LaunchFuncToRuntimeCallsPass : Pass<"stencil-gpu-to-runtime", "ModuleOp"> {
  let summary = "Convert all kernel launches to GPU runtime calls";
  let constructor = "mlir::createLaunchFuncToRuntimeCallsPass()";
  let options = [
    ListOption<"blockSizes", "block-sizes", "int64_t",
               "Block sizes used for the mapping",
               "llvm::cl::OneOrMore, llvm::cl::MiscFlags::CommaSeparated">,
    Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
           "\"nvvm.cubin\"",
           "Annotation attribute string for GPU binary">,
  ];
}
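Assuming the renamed pass is registered in oec-opt, it can be exercised directly from the command line; a hypothetical invocation (input.mlir is a placeholder) overriding the default annotation might be:

```sh
# Lower gpu.launch_func ops to GPU runtime calls, tagging binaries
# with the nvvm.cubin annotation attribute.
oec-opt --stencil-gpu-to-runtime="gpu-binary-annotation=nvvm.cubin" input.mlir
```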

47 changes: 42 additions & 5 deletions lib/Conversion/LoopsToCUDA/CMakeLists.txt
@@ -1,14 +1,51 @@
add_mlir_dialect_library(GPUtoCUDATransforms
  ConvertLaunchFuncToCUDACalls.cpp
if(CUDA_BACKEND_ENABLED)
  set(NVPTX_LIBS
    NVPTXCodeGen
    NVPTXDesc
    NVPTXInfo
  )
  set(NVVMIR_LIB
    MLIRNVVMIR
    MLIRTargetNVVMIR
  )
endif()
if(ROCM_BACKEND_ENABLED)
  set(AMDGPU_LIBS
    AMDGPUAsmParser
    AMDGPUCodeGen
    AMDGPUDesc
    AMDGPUInfo
  )
  set(ROCDLIR_LIB
    lldCommon
    lldDriver
    lldELF
    MLIRROCDLIR
    MLIRTargetROCDLIR
  )
endif()

add_mlir_dialect_library(GPUToKernelAndRuntimeCalls
  ConvertLaunchFuncToRuntimeCalls.cpp
  ConvertKernelFuncToCubin.cpp
  StencilIndexOptimizationPass.cpp
  StencilLoopMappingPass.cpp
  ConvertKernelFuncToHsaco.cpp

  ADDITIONAL_HEADER_DIRS
  ${PROJECT_SOURCE_DIR}/include/Conversion/LoopsToCUDA

  LINK_COMPONENTS
  Core
  MC
  MCParser
  ${NVPTX_LIBS}
  ${AMDGPU_LIBS}

  LINK_LIBS PUBLIC
  ${NVVMIR_LIB}
  ${ROCDLIR_LIB}

  DEPENDS
  MLIRLoopsToCUDAPassIncGen
)

target_link_libraries(GPUtoCUDATransforms PUBLIC MLIRIR)
target_link_libraries(GPUToKernelAndRuntimeCalls PUBLIC MLIRIR)
39 changes: 24 additions & 15 deletions lib/Conversion/LoopsToCUDA/ConvertKernelFuncToCubin.cpp
@@ -1,5 +1,5 @@
#include "Conversion/LoopsToCUDA/Passes.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
@@ -9,8 +9,11 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Target/NVVMIR.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/Support/TargetSelect.h"

#ifdef CUDA_BACKEND_ENABLED
#include "cuda.h"

using namespace mlir;
@@ -35,8 +38,8 @@ inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
} \
}

OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
                                   StringRef name) {
static OwnedBlob compilePtxToCubin(const std::string &ptx, Location loc,
                                   StringRef name) {
  char jitErrorBuffer[4096] = {0};

  RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -75,7 +78,7 @@ OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
"cuLinkComplete");

char *cubinAsChar = static_cast<char *>(cubinData);
OwnedCubin result =
OwnedBlob result =
std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);

// This will also destroy the cubin data.
@@ -87,21 +90,27 @@ OwnedCubin mlir::compilePtxToCubin(const std::string &ptx, Location loc,
namespace mlir {
void registerGPUToCUBINPipeline() {
  PassPipelineRegistration<>(
      "stencil-gpu-to-cubin", "Lowering of stencil kernels to cubins",
      "stencil-kernel-to-cubin", "Lower kernels to cubin",
      [](OpPassManager &pm) {
        // Initialize LLVM NVPTX backend.
        LLVMInitializeNVPTXTarget();
        LLVMInitializeNVPTXTargetInfo();
        LLVMInitializeNVPTXTargetMC();
        LLVMInitializeNVPTXAsmPrinter();
        // Define the bitwidth
        pm.addPass(createGpuKernelOutliningPass());
        auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
        kernelPm.addPass(createStripDebugInfoPass());
        kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
        kernelPm.addPass(createStencilIndexOptimizationPass());
        kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
        // TODO set appropriate bitwidth
        LowerToLLVMOptions llvmOptions;
        llvmOptions.emitCWrappers = true;
        llvmOptions.useAlignedAlloc = false;
        llvmOptions.useBarePtrCallConv = false;
        llvmOptions.indexBitwidth = kDeriveIndexBitwidthFromDataLayout;
        pm.addPass(createLowerToLLVMPass(llvmOptions));
        kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass(32));
        kernelPm.addPass(createConvertGPUKernelToBlobPass(
            translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
            "sm_35", "+ptx60", "nvvm.cubin"));
        pm.addPass(createLowerToLLVMPass({/* useBarePtrCallConv */ false,
                                          /* emitCWrappers */ true,
                                          /* indexBitwidth */ 32,
                                          /* useAlignedAlloc */ false}));
        pm.addPass(createLaunchFuncToRuntimeCallsPass());
      });
}
} // namespace mlir
#endif
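Once registered, the whole CUDA lowering can be driven as a single pipeline; a hypothetical run (the input file name is a placeholder) might be:

```sh
# Outline GPU kernels, lower them to NVVM and a cubin blob, then lower
# the host code to LLVM with 32-bit indices and GPU runtime calls.
oec-opt --stencil-kernel-to-cubin laplace.mlir
```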
