Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]: cuASR 1.2 #24

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ option(CUASR_TEST "Build cuASR test suite. Use with CUASR_TEST_LEVEL={0|1|2}.
option(CUASR_BENCH "Build cuASR benchmark suite." ON)
option(CUASR_EXAMPLE "Build cuASR examples." ON)

# By default, build fat binaries. TODO add sm_80 here
option(CUASR_CUDA_ARCHS "List of CUDA architectures to compile for." "60 61 70 72 75")

# CUDA native compiler (nvcc) only supports upto C++14 for now
find_package(CUDA REQUIRED)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

# C++ compiler flags for target compile options
set(cuASR_CXX_FLAGS -Wall -Wextra -Wno-unused-parameter -Wno-uninitialized -Wno-strict-aliasing)
Expand All @@ -41,7 +43,9 @@ set(cuASR_CUDA_FLAGS --expt-relaxed-constexpr)
set(cuASR_CUDA_FLAGS_DEBUG -G ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELEASE -O3 ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELWITHDEBINFO -G ${cuASR_CUDA_FLAGS})
set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS})
if (NOT CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "80")
endif()

# the sub-modules update themselves with git, so find git
find_package(Git QUIET)
Expand Down Expand Up @@ -94,6 +98,7 @@ message(STATUS " C++ Compiler : ${CMAKE_CXX_COMPILER}")
message(STATUS " C++ Compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " CUDA Compiler : ${CMAKE_CUDA_COMPILER}")
message(STATUS " CUDA Compiler version: ${CMAKE_CUDA_COMPILER_VERSION}")
message(STATUS " CUDA Arch support : ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " Build tests : ${CUASR_TEST}")
message(STATUS " Test level : ${CUASR_TEST_LEVEL}")
message(STATUS " Build benchmarks : ${CUASR_BENCH}")
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ Normally, BLAS libraries are defined as operations over real numbers,`+` and `x`
cuASR is a template library and therefore header only, but includes an exhaustive list of tests and benchmarks. The build system is based on `CMake`. Basic checkout and build instructions are as follows:

```sh
$ git clone --recurse-submodules https://github.com/hpcgarage/semiring-gemm /path/to/repo
$ git clone --recurse-submodules https://github.com/hpcgarage/cuASR /path/to/repo
$ cd /path/to/repo
$ mkdir build && cd build
$ cmake .. -G Ninja -DCUASR_CUDA_ARCHS="70 75"
$ cmake .. -G Ninja -DCMAKE_CUDA_ARCHITECTURES="70 75 80"
$ ninja
```

Expand All @@ -34,7 +34,7 @@ Notable build flags:

| Build Flag | Usage Description |
|-|-|
| `CUASR_CUDA_ARCHS` | lists the CUDA SM architectures the fat binaries should be built to target. `CUASR_CUDA_ARCHS="60 61 70 72 75"` (all Pascal and Volta GPUs) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you forsee running the tests and benchmarks on.
| `CMAKE_CUDA_ARCHITECTURES` | lists the CUDA SM architectures the fat binaries should be built to target. `CMAKE_CUDA_ARCHITECTURES="80"` (Ampere) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on.
| `CUASR_TEST` | Set to `ON` by default and controls whether tests will be built or not. Set to `OFF` to disable building all tests. |
| `CUASR_BENCH` | Set to `ON` by default and controls whether benchmarks will be built or not. Set to `OFF` to disable building all benchmarks. |
| `CUASR_EXAMPLES` | Set to `ON` by default and controls whether examples will be built or not. Set to `OFF` to disable building all examples. |
Expand Down
47 changes: 42 additions & 5 deletions bench/device/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,60 @@
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS *.cu)
add_executable(cuasr_bench_srgemm_device
# SM50 default configurations
add_executable(cuasr_bench_srgemm_device_sm50_defaults
sm50_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm50_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm50_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# SM80 default configurations
add_executable(cuasr_bench_srgemm_device_sm80_defaults
sm80_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm80_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm80_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# All shmoo benchmarks
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS sm50_simt_*.cu)
add_executable(cuasr_bench_srgemm_device_shmoo
${SIMT_BENCH_SRCS}
)
target_include_directories(
cuasr_bench_srgemm_device
cuasr_bench_srgemm_device_shmoo
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device
target_link_libraries(cuasr_bench_srgemm_device_shmoo
benchmark
benchmark_main
${cuASR_LIB_NAME}
)
if(NOT DEFINED CUASR_BENCH_LEVEL)
set(CUASR_BENCH_LEVEL 0)
endif()
target_compile_definitions(cuasr_bench_srgemm_device
target_compile_definitions(cuasr_bench_srgemm_device_shmoo
PRIVATE CUASR_BENCH_LEVEL=${CUASR_BENCH_LEVEL}
)
174 changes: 174 additions & 0 deletions bench/device/gen_default_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import os
import sys
import argparse

################################################################################
# This file creates all the possible semiring-gemm kernels for all transposes
# using just the default SRGEMM configurations for them.
################################################################################

# [precision char, C++ precision type] pairs; the char prefixes the kernel
# name (e.g. "dsrgemm"), the type is substituted into the template body.
precisions = [
["d", "double"],
["s", "float"],
]

# [column_major_A, column_major_B, column_major_C] flags for each generated
# kernel; True maps to 'n' (column-major) and False to 't' (row-major) when
# the benchmark names and layouts are rendered in main() below.
transposes = [
[False, False, True],
[False, False, False],
[False, True, True],
[False, True, False],
[True, False, True],
[True, False, False],
[True, True, True],
[True, True, False],
]

# [additive op, multiplicative op] pairs, one per semiring to instantiate.
semiring_operators = [
["plus", "multiplies"], # regular GEMM
["minimum", "plus"], # min-plus (tropical)
["maximum", "plus"], # max-plus
["minimum", "maximum"], # min-max
["maximum", "minimum"], # max-min
["minimum", "multiplies"], # min-multiplies
["maximum", "multiplies"], # max-multiplies
["binary_or", "binary_and"] # or-and
]

# Fixed preamble written once at the top of every generated benchmark file
# (copyright banner, generated-file warning, and required includes).
benchfile_header = """\
/***************************************************************************************************
* Copyright (c) 2021, Vijay Thakkar ([email protected]).
**************************************************************************************************/
//////////////////////////////////////////////////////////////////////
// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
//////////////////////////////////////////////////////////////////////

#include "benchmark/benchmark.h"

#include "cuasr/gemm/device/default_srgemm_configuration.h"
#include "cuasr/gemm/device/srgemm.h"
#include "cuasr/functional.h"

#include "harness.h"
"""

# Per-kernel benchmark body; .format()-ed once for every combination of
# (sm_arch, semiring ops, precision, transpose flags) by
# write_benchmark_to_file(). Doubled braces {{ }} are literal C++ braces.
bench_template = """\

///////////////////////////////////////////////////////////////////////////////

static void BM_SM{sm_arch}_device_{add_op}_{mult_op}_{precision_char}srgemm_{transA}{transB}_{transC}(benchmark::State &state) {{
const auto N = static_cast<int>(state.range(0));
using precision = {precision_type};
using OpClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm{sm_arch};

using AddOp = cuasr::{add_op}<precision>;
using MultOp = cuasr::{mult_op}<precision>;

using Srgemm = cuasr::gemm::device::Srgemm< //
AddOp, MultOp, //
precision, cutlass::layout::{trans_typeA}Major, //
precision, cutlass::layout::{trans_typeB}Major, //
precision, cutlass::layout::{trans_typeC}Major, //
precision, OpClass, SmArch>;

// setup bench harness
cuasr::bench::device::BenchHarness<Srgemm> bench({{ N, N, N }});

// benchmark loop
for (auto _ : state) {{
benchmark::DoNotOptimize(bench.run());
cudaDeviceSynchronize();
}}

double flops_per_itr = 2.0 * N * N * N;
state.counters["Flop/s"]
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}}
BENCHMARK(BM_SM{sm_arch}_device_{add_op}_{mult_op}_{precision_char}srgemm_{transA}{transB}_{transC})
->RangeMultiplier(2)->Range(256, 4096);
"""


def write_benchmark_file_header(benchfile):
    """Emit the shared copyright/include preamble into an open benchmark file."""
    benchfile.write(benchfile_header)


def write_benchmark_to_file(
        benchfile,
        sm_arch,
        add_op,
        mult_op,
        precision_char,
        precision_type,
        transA,
        transB,
        transC):
    """Render one SRGEMM benchmark instantiation and append it to benchfile.

    transA/transB/transC are single-char flags: 'n' selects a column-major
    cutlass layout, anything else selects row-major.
    """
    def layout_of(flag):
        # 'n' -> ColumnMajor, 't' (or anything else) -> RowMajor
        return "Column" if flag == "n" else "Row"

    rendered = bench_template.format(
        sm_arch=sm_arch,
        add_op=add_op,
        mult_op=mult_op,
        precision_char=precision_char,
        precision_type=precision_type,
        transA=transA,
        transB=transB,
        transC=transC,
        trans_typeA=layout_of(transA),
        trans_typeB=layout_of(transB),
        trans_typeC=layout_of(transC),
    )
    benchfile.write(rendered)


def main(args):
    """Generate sm{arch}_defaults.cu with one default-config SRGEMM benchmark
    per (semiring, precision, transpose) combination."""
    benchfile_name = "sm{}_defaults.cu".format(args.sm_arch)
    print(benchfile_name)
    file_path = os.path.join(args.output_dir, benchfile_name)

    num_benches = 0
    # open file and gen all default tests
    with open(file_path, "w") as benchfile:
        write_benchmark_file_header(benchfile)

        # one benchmark per semiring x precision x transpose combination
        for add_op, mult_op in semiring_operators:
            for precision_char, precision_type in precisions:
                for column_major_A, column_major_B, column_major_C in transposes:
                    # column-major -> 'n', row-major -> 't'
                    transA = "n" if column_major_A else "t"
                    transB = "n" if column_major_B else "t"
                    transC = "n" if column_major_C else "t"

                    write_benchmark_to_file(
                        benchfile,
                        args.sm_arch,
                        add_op,
                        mult_op,
                        precision_char,
                        precision_type,
                        transA,
                        transB,
                        transC)
                    num_benches += 1

    print("Total bench count per semi-ring = {}".format(
        num_benches // len(semiring_operators)))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--output-dir", type=str, required=False, default=".",
help="Path to the output dir.")
parser.add_argument("-sm", "--sm-arch", type=int, required=False, default=50, choices=[50, 80],
help="SM architecture version number,")
args = parser.parse_args(sys.argv[1:])
main(args)
Loading