From 2b8a41b158512edc8d908fdf7f68f0e77af44a96 Mon Sep 17 00:00:00 2001
From: Benjamin Chetioui
Date: Wed, 2 Oct 2024 11:44:55 -0700
Subject: [PATCH] [XLA:GPU] Remove the now obsolete
 `--xla_gpu_enable_triton_hopper` flag.

MMA_V3 has been enabled by default, so at this point the flag only gated
varying the number of CTAs when autotuning matrix multiplications.

This also fixes a bug where the number of CTAs was not being autotuned
when using exhaustive tiling.

PiperOrigin-RevId: 681535035
---
 xla/debug_options_flags.cc                    |  6 --
 xla/service/gpu/autotuning/BUILD              |  1 +
 .../gpu/autotuning/gemm_fusion_autotuner.cc   | 24 +++----
 .../autotuning/gemm_fusion_autotuner_test.cc  | 62 +++++++++++++++++++
 xla/xla.proto                                 |  3 +-
 5 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc
index 149ddc3563a83..c19a48a324d08 100644
--- a/xla/debug_options_flags.cc
+++ b/xla/debug_options_flags.cc
@@ -229,7 +229,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_cudnn_layer_norm(false);
 
   opts.set_xla_gpu_threshold_for_windowed_einsum_mib(100000);
-  opts.set_xla_gpu_enable_triton_hopper(false);
   opts.set_xla_gpu_experimental_enable_fusion_block_level_rewriter(false);
 
   opts.set_xla_gpu_enable_llvm_module_compilation_parallelism(false);
@@ -1787,11 +1786,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "Einsums that have partitioned operand(can be either LHS or RHS) that's "
       "larger than this threshold will be transformed to use windowed einsums."
       "Default is 100000"));
-  flag_list->push_back(tsl::Flag(
-      "xla_gpu_enable_triton_hopper",
-      bool_setter_for(&DebugOptions::set_xla_gpu_enable_triton_hopper),
-      debug_options->xla_gpu_enable_triton_hopper(),
-      "Currently used to enable MMA_V3 for Hopper in Triton"));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_experimental_enable_fusion_block_level_rewriter",
       bool_setter_for(
diff --git a/xla/service/gpu/autotuning/BUILD b/xla/service/gpu/autotuning/BUILD
index 25042e4d5da92..ea34a37200c21 100644
--- a/xla/service/gpu/autotuning/BUILD
+++ b/xla/service/gpu/autotuning/BUILD
@@ -148,6 +148,7 @@ xla_test(
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//xla/tools:hlo_decomposer_lib",
         "//xla/tsl/lib/core:status_test_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
diff --git a/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
index 45d820ca0f327..80ff6a55ed423 100644
--- a/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
+++ b/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
@@ -1190,8 +1190,9 @@ std::vector<TritonGemmConfig>
 GemmFusionAutotunerImpl::GetExhaustiveTritonConfigs() const {
   std::vector<TritonGemmConfig> configs;
   se::CudaComputeCapability cc = GetComputeCapability();
-  bool tune_ctas =
-      debug_options_.xla_gpu_enable_triton_hopper() && cc.IsAtLeastHopper();
+  // Clusters are only supported from Hopper. We don't autotune them by default.
+  bool should_tune_ctas =
+      debug_options_.xla_gpu_exhaustive_tiling_search() && cc.IsAtLeastHopper();
 
   for (int num_stages : kNumStages) {
     // Volta doesn't support num_stages > 2.
@@ -1214,18 +1215,19 @@ GemmFusionAutotunerImpl::GetExhaustiveTritonConfigs() const {
                   split_k > 1) {
                 break;
               }
-              for (int num_ctas : kNumCtas) {
-                // Clusters are only supported on Hopper.
-                // Autotuning this parameter is enabled by a flag.
-                if (!tune_ctas && num_ctas > 1) {
-                  break;
-                }
-                if (num_ctas > num_warps) {
-                  break;
+
+              if (should_tune_ctas) {
+                for (int num_ctas : kNumCtas) {
+                  if (num_ctas <= num_warps) {
+                    configs.push_back(TritonGemmConfig(tile_m, tile_n, tile_k,
+                                                       split_k, num_stages,
+                                                       num_warps, num_ctas));
+                  }
                 }
+              } else {
                 configs.push_back(TritonGemmConfig(tile_m, tile_n, tile_k,
                                                    split_k, num_stages,
-                                                  num_warps, num_ctas));
+                                                  num_warps, /*num_ctas=*/1));
               }
             }
           }
diff --git a/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc b/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
index 01e6e630a9952..a40500cf6b849 100644
--- a/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
+++ b/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include
 #include
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/statusor.h"
@@ -1062,6 +1063,67 @@ ENTRY wais {
 INSTANTIATE_TEST_SUITE_P(GemmFusionAutotunerConfigSweep,
                          GemmFusionAutotunerConfigTest, ::testing::Bool());
 
+TEST_F(StatelessAutotunerTest,
+       ExhaustiveAutotuningTunesNumberOfCtasFromHopper) {
+  const std::string kHloText = R"(
+HloModule test
+
+ENTRY main {
+  lhs = f32[5,1600] parameter(0)
+  rhs = f32[1600,10] parameter(1)
+  ROOT dot = f32[5,10] dot(lhs, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHloText));
+  DebugOptions debug_options_with_exhaustive_autotuning =
+      GetDebugOptionsForTest();
+  debug_options_with_exhaustive_autotuning.set_xla_gpu_exhaustive_tiling_search(
+      true);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> exhaustive_ampere_configs,
+      GetPossibleMatmulAutotuneTritonConfigs(
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
+          se::CudaComputeCapability::Ampere(), GetToolkitVersion(),
+          debug_options_with_exhaustive_autotuning));
+  for (const auto& config : exhaustive_ampere_configs) {
+    // We do not tune the number of CTAs on Ampere...
+    EXPECT_EQ(config.num_ctas, 1);
+  }
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> exhaustive_hopper_configs,
+      GetPossibleMatmulAutotuneTritonConfigs(
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
+          se::CudaComputeCapability::Hopper(), GetToolkitVersion(),
+          debug_options_with_exhaustive_autotuning));
+  absl::flat_hash_set<int> config_num_ctas;
+  for (const auto& config : exhaustive_hopper_configs) {
+    // ... but we do on Hopper...
+    config_num_ctas.insert(config.num_ctas);
+  }
+  EXPECT_GT(config_num_ctas.size(), 1);
+
+  DebugOptions debug_options_without_exhaustive_autotuning =
+      GetDebugOptionsForTest();
+  debug_options_without_exhaustive_autotuning
+      .set_xla_gpu_exhaustive_tiling_search(false);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> hopper_configs,
+      GetPossibleMatmulAutotuneTritonConfigs(
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
+          se::CudaComputeCapability::Hopper(), GetToolkitVersion(),
+          debug_options_without_exhaustive_autotuning));
+  for (const auto& config : hopper_configs) {
+    // ... except if exhaustive autotuning is disabled.
+    EXPECT_EQ(config.num_ctas, 1);
+  }
+}
+
 TEST_F(GemmFusionAutotunerTest, SplitKFLoatNormalization) {
   if (!GetCudaComputeCapability().IsAtLeastHopper()) {
     GTEST_SKIP() << "f8 types are only supported from Hopper onwards.";
diff --git a/xla/xla.proto b/xla/xla.proto
index fc161b0bfca2d..38b1626663775 100644
--- a/xla/xla.proto
+++ b/xla/xla.proto
@@ -782,8 +782,7 @@ message DebugOptions {
   // Threshold to enable windowed einsum (collective matmul) in MB.
   int64 xla_gpu_threshold_for_windowed_einsum_mib = 265;
 
-  // Enables currently disabled features within Triton for Hopper.
-  bool xla_gpu_enable_triton_hopper = 266;
+  reserved 266;  // was xla_gpu_enable_triton_hopper
 
   // Enable NCCL user buffers.
   bool xla_gpu_enable_nccl_user_buffers = 267;