
Commit

Merge branch 'main' of https://github.com/nvidia/fuser into multi_matmul_tma
rdspring1 committed Nov 7, 2024
2 parents f8aa777 + ba4f7d4 commit 7ecafd1
Showing 137 changed files with 4,202 additions and 3,697 deletions.
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first.cpp
@@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

@@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
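
All of the benchmark edits in this commit follow the pattern shown above: the FusionExecutorCache* parameter is renamed from fusion_executor_cache to executor_cache and forwarded unchanged to runBenchmarkIterations. A minimal sketch of that pattern, for orientation only — the input setup and byte accounting are illustrative assumptions, and only the identifiers that appear in the diff (FusionExecutorCache, runBenchmarkIterations, NVF_ERROR, DataType) are taken from it:

static void NvFuserScheduler_ExampleBenchmark(
    benchmark::State& benchmark_state,
    FusionExecutorCache* executor_cache, // renamed from fusion_executor_cache
    DataType dtype) {
  // Same dtype guard the batch-norm benchmarks use.
  NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

  // Build ATen inputs matching the fusion's inputs (shapes are illustrative).
  auto options = at::TensorOptions()
                     .dtype(dtype == DataType::Float ? at::kFloat : at::kHalf)
                     .device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({benchmark_state.range(0), 128}, options);
  std::vector<c10::IValue> aten_inputs({at_x});

  // Times the cached fusion across the benchmark iterations.
  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

  // Report bytes moved: one read of the input per iteration (assumed).
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * at_x.numel() *
      int64_t(at_x.element_size()));
}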
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_first_backward.cpp
@@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

@@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last.cpp
@@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

@@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc(
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
4 changes: 2 additions & 2 deletions benchmarks/cpp/batch_norm_channels_last_backward.cpp
@@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);

@@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
24 changes: 12 additions & 12 deletions benchmarks/cpp/bert.cpp
@@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
@@ -135,15 +135,15 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
std::vector<c10::IValue> at_inputs = {t0, t1};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}

static void NvFuserScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
@@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

// Some reason t1 isn't used, ignore it.
bytes -=
@@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
@@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
@@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
@@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
@@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
@@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
@@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {

static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype) {
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
@@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
std::vector<c10::IValue> at_inputs = {t0, t21};

auto bytes =
- runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
4 changes: 2 additions & 2 deletions benchmarks/cpp/broadcast.cpp
@@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {

static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
@@ -74,7 +74,7 @@

std::vector<c10::IValue> aten_inputs({t0, t1});

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
24 changes: 12 additions & 12 deletions benchmarks/cpp/gelu_backward.cpp
@@ -162,8 +162,8 @@ static void NvFuserScheduler_GeluBackward_Compile(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

for (auto _ : benchmark_state) {
- FusionExecutor executor;
- executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
+ KernelExecutor ke;
+ ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
}
}

@@ -187,13 +187,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

- FusionExecutor executor;
- executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
+ KernelExecutor ke;
+ ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

C10_CUDA_CHECK(cudaDeviceSynchronize());

for (auto _ : benchmark_state) {
- outputs = executor.runFusion(
+ outputs = ke.runFusion(
c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
@@ -218,11 +218,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

- FusionExecutor executor;
- executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
+ KernelExecutor ke;
+ ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

runBenchmarkIterations(
- benchmark_state, &executor, inputs, heuristic_params->lparams);
+ benchmark_state, &ke, inputs, heuristic_params->lparams);
}

BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly)
@@ -247,12 +247,12 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
auto heuristic_params = SchedulerEntry::scheduleWith(
&fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

- FusionExecutor executor;
- executor.setExecuteKernelFlag(false);
- executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
+ KernelExecutor ke;
+ ke.setExecuteKernelFlag(false);
+ ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

for (auto _ : benchmark_state) {
- outputs = executor.runFusion(
+ outputs = ke.runFusion(
c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
}
}
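
The rename in this file is FusionExecutor → KernelExecutor (and the local variable executor → ke); the compile-once / run-many flow itself is unchanged. A rough sketch of that flow under the new name — the fusion and its inputs are assumed to be built elsewhere, and only the calls that appear in the diff above (SchedulerEntry::scheduleWith, compileFusion, runFusion, setExecuteKernelFlag) are relied on:

// Sketch only; `fusion` and `inputs` are assumed to be constructed as in the
// benchmark bodies above.
auto heuristic_params = SchedulerEntry::scheduleWith(
    &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));

KernelExecutor ke; // formerly: FusionExecutor executor;
ke.compileFusion(&fusion, inputs, heuristic_params->lparams);

// Optionally skip the actual kernel launch to measure host overhead only,
// as the *_CpuOnly benchmark does:
// ke.setExecuteKernelFlag(false);

// Run the compiled kernel; lparams carries the launch parameters chosen by
// the point-wise scheduler.
auto outputs = ke.runFusion(
    c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);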
4 changes: 2 additions & 2 deletions benchmarks/cpp/gelu_backward_reduction.cpp
@@ -93,7 +93,7 @@ static void setupGeluBackwardReduction(

static void NvFuserScheduler_GeluBackwardReduction(
benchmark::State& benchmark_state,
- FusionExecutorCache* fusion_executor_cache,
+ FusionExecutorCache* executor_cache,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
@@ -112,7 +112,7 @@ static void NvFuserScheduler_GeluBackwardReduction(

std::vector<c10::IValue> aten_inputs = {aten_input_grad, aten_input_x};

- runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+ runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);

// inputs: gradient tensor + input tensor
// outputs: output, output_of_reduction
24 changes: 12 additions & 12 deletions benchmarks/cpp/heuristic_cache.cpp
@@ -26,7 +26,7 @@ using namespace nvfuser;

static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
- std::unique_ptr<FusionExecutorCache>& fec,
+ std::unique_ptr<FusionExecutorCache>& executor_cache,
std::vector<c10::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
@@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime(
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);

- fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+ executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
- auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+ auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);

- return fec->getMostRecentKernelRuntime();
+ return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
@@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
FusionGuard fg(fusion_ptr.get());

// PreAllocate
- std::unique_ptr<FusionExecutorCache> fec;
+ std::unique_ptr<FusionExecutorCache> executor_cache;
std::vector<c10::IValue> aten_inputs;

std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};

auto runtime = getLayerBackwardNormRuntime(
- std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
+ std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
@@ -120,7 +120,7 @@

static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
- std::unique_ptr<FusionExecutorCache>& fec,
+ std::unique_ptr<FusionExecutorCache>& executor_cache,
std::vector<c10::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Expand All @@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime(
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);

- fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+ executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
- auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+ auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);

- return fec->getMostRecentKernelRuntime();
+ return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormForward_HeuristicCache(
Expand All @@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache(
FusionGuard fg(fusion_ptr.get());

// PreAllocate
- std::unique_ptr<FusionExecutorCache> fec;
+ std::unique_ptr<FusionExecutorCache> executor_cache;
std::vector<c10::IValue> aten_inputs;

std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};

auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
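
Here the change is purely a rename of the FusionExecutorCache handle from fec to executor_cache; the cache's usage pattern is unchanged. A condensed sketch of that pattern, with the fusion definition elided and only the calls visible in this diff (runFusionWithInputs, getMostRecentKernelRuntime, createKernelArgumentHolder) assumed:

// Sketch only; the fusion body is omitted.
auto fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// ... define the fusion's inputs, ops, and outputs here ...

auto executor_cache = // formerly named `fec`
    std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));

// The first call schedules and compiles; later calls with matching input
// shapes and dtypes reuse the cached kernels.
std::vector<c10::IValue> aten_inputs = {/* at::Tensor inputs */};
auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);

// Inspect the runtime that served the most recent call, e.g. to probe its
// heuristics as the benchmarks above do.
auto runtime = executor_cache->getMostRecentKernelRuntime();
KernelArgumentHolder args =
    KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);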
(The remaining changed files are not shown here.)
