Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use googlebench in Benchmarks #116

Merged
merged 5 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


include(FetchContent)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG main
)
FetchContent_MakeAvailable(googlebenchmark)

set(CUTLASS_BENCHMARKS_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common)

add_custom_target(cutlass_benchmarks)
Expand Down Expand Up @@ -54,6 +63,7 @@ function(cutlass_benchmark_add_executable NAME)
cutlass_tools_util_includes
$<$<BOOL:${CUTLASS_ENABLE_CUBLAS}>:nvidia::cublas>
$<$<BOOL:${ADD_CUDA}>:cuda>
benchmark::benchmark
)

if (CUTLASS_ENABLE_SYCL)
Expand All @@ -66,6 +76,7 @@ function(cutlass_benchmark_add_executable NAME)
)
endfunction()


if(SYCL_INTEL_TARGET)
add_subdirectory(pvc)
endif()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int main(int argc, const char** argv)

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

BenchmarkRunner<Gemm> runner;
BenchmarkRunner<Gemm> runner("ampere_gemm_bf16_bf16_fp32_tensor_op_fp32");

runner.run(options, hw_info);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int main(int argc, const char** argv)

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

BenchmarkRunner<Gemm> runner;
BenchmarkRunner<Gemm> runner("ampere_gemm_fp16_fp16_fp32_tensor_op_fp32");

runner.run(options, hw_info);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int main(int argc, const char** argv)

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

BenchmarkRunner<Gemm> runner;
BenchmarkRunner<Gemm> runner("ampere_gemm_tf32_tf32_fp32_tensor_op_fp32");

runner.run(options, hw_info);

Expand Down
127 changes: 46 additions & 81 deletions benchmarks/common/benchmark_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/print_error.hpp"

#include <benchmark/benchmark.h>

template <typename T>
static void fill_matrix(std::vector<T> &M)
{
Expand All @@ -75,7 +77,7 @@ struct Options {
Options():
help(false),
error(false),
m(4096), n(4096), k(4096), l(1), iterations(100),
m(4096), n(4096), k(4096), l(1),
alpha(1.f), beta(0.f)
{ }

Expand All @@ -94,7 +96,6 @@ struct Options {
cmd.get_cmd_line_argument("l", l, 1);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations, 100);
}

/// Prints the usage statement.
Expand Down Expand Up @@ -161,11 +162,19 @@ struct BenchmarkRunner {
ElementOutput epsilon;
ElementOutput nonzero_floor;

BenchmarkRunner() : epsilon(static_cast<ElementOutput>(0.1f)),
nonzero_floor(static_cast<ElementOutput>(0.1f)) {};
BenchmarkRunner(std::string test_name) : epsilon(static_cast<ElementOutput>(0.1f)),
nonzero_floor(static_cast<ElementOutput>(0.1f)), test_name(test_name) {
int argc = 0;
benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
benchmark::Initialize(&argc, nullptr);
};

BenchmarkRunner(ElementOutput epsilon, ElementOutput nonzeroFloor) :
epsilon(epsilon), nonzero_floor(nonzeroFloor) {}
BenchmarkRunner(ElementOutput epsilon, ElementOutput nonzeroFloor, std::string test_name) :
epsilon(epsilon), nonzero_floor(nonzeroFloor), test_name(test_name) {
int argc = 0;
benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
benchmark::Initialize(&argc, nullptr);
}

//
// Methods
Expand Down Expand Up @@ -261,6 +270,7 @@ struct BenchmarkRunner {
}

virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
benchmark::ClearRegisteredBenchmarks();
ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};

initialize(problem_size);
Expand Down Expand Up @@ -293,86 +303,41 @@ struct BenchmarkRunner {

// Verify that the result is correct
bool passed = verify(problem_size, options.alpha, options.beta);
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;

if (passed && options.iterations > 0) {
GPU_Clock timer;
timer.start();
for (int i = 0; i < options.iterations; ++i) {
gemm_op.run();
}

float cute_time = timer.seconds() / options.iterations;
double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l
<< std::endl;
printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000);
if(not passed) {
throw std::runtime_error("Disposition Failed.");
}
}
};

template <class Gemm>
struct PvcBenchmarkRunner : BenchmarkRunner<Gemm> {
using Base = BenchmarkRunner<Gemm>;

using ElementB = typename Base::ElementB;

using ProblemShapeType = typename Base::ProblemShapeType;

void initialize(const ProblemShapeType& problem_size) override {
Base::initialize(problem_size);
}
std::stringstream full_test_name;
full_test_name << test_name << "/";
std::string test_name_suffix = std::to_string(options.m) + "x" +
std::to_string(options.n) + "x" +
std::to_string(options.k) + "x" +
std::to_string(options.l);
full_test_name << test_name_suffix;
benchmark::RegisterBenchmark(full_test_name.str().c_str(), run_benchmark, options, gemm_op)
->UseManualTime();
benchmark::RunSpecifiedBenchmarks();
}

~BenchmarkRunner() {
benchmark::Shutdown();
}

void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override {
ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};

initialize(problem_size);

typename Gemm::GemmKernel::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{Base::block_A.get(), Base::stride_A, Base::block_B.get(), Base::stride_B},
{
{options.alpha, options.beta},
Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D
},
hw_info
};

Gemm gemm_op;

size_t workspace_size = Gemm::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

gemm_op.can_implement(arguments);

gemm_op.initialize(arguments, workspace.get());

// Run the GEMM
gemm_op.run();

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();
#else
cudaDeviceSynchronize();
#endif

// Verify that the result is correct
bool passed = Base::verify(problem_size, options.alpha, options.beta);
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;

if (passed && options.iterations > 0) {
private:
static void run_benchmark(benchmark::State& state, const Options& options, Gemm gemm_op) {
state.counters["runtime_ms"] = 0;
for(auto _ : state) {
GPU_Clock timer;
timer.start();
for (int i = 0; i < options.iterations; ++i) {
gemm_op.run();
}

float cute_time = timer.seconds() / options.iterations;
double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
gemm_op.run();
auto ms_elapsed = timer.milliseconds();
state.counters["runtime_ms"] += ms_elapsed;
state.SetIterationTime(ms_elapsed / 1000);
}
state.counters["runtime_ms"] /= state.iterations();
state.counters["TFlops"] = ((2.0 * options.m * options.n * options.k * options.l) * 1e-12) /
(state.counters["runtime_ms"] / 1000);
}
};

std::string test_name;
};
2 changes: 1 addition & 1 deletion benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ int main(int argc, const char** argv)

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

PvcBenchmarkRunner<Gemm> runner;
BenchmarkRunner<Gemm> runner("pvc_gemm_bf16_bf16_fp32_dpas_fp32");

runner.run(options, hw_info);

Expand Down