From 9439795470d803fa40c69ca2cae4d9377e8974e0 Mon Sep 17 00:00:00 2001
From: Kyle Lucke
Date: Tue, 17 Sep 2024 12:11:44 -0700
Subject: [PATCH] Add a command line flag to disable XLA GPU passes based on
 binary libraries.

Start using the new flag to optionally disable many cuDNN-specific passes.

PiperOrigin-RevId: 675660345
---
 xla/debug_options_flags.cc        |  8 ++++
 xla/service/gpu/nvptx_compiler.cc | 66 +++++++++++++++++++++----------
 xla/xla.proto                     |  5 ++-
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc
index 68bd37f3ca7d8..f651949a82a14 100644
--- a/xla/debug_options_flags.cc
+++ b/xla/debug_options_flags.cc
@@ -290,6 +290,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   opts.set_xla_gpu_executable_warn_stuck_timeout_seconds(10);
   opts.set_xla_gpu_executable_terminate_timeout_seconds(30);
+  opts.set_xla_gpu_experimental_disable_binary_libraries(false);
 
   return opts;
 }
@@ -1936,6 +1937,13 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
           &DebugOptions::set_xla_gpu_executable_terminate_timeout_seconds),
       debug_options->xla_gpu_executable_terminate_timeout_seconds(),
       "Set timeout for RendezvousSingle termination"));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_experimental_disable_binary_libraries",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_experimental_disable_binary_libraries),
+      debug_options->xla_gpu_experimental_disable_binary_libraries(),
+      "Disable XLA GPU passes that depend on non-open source binary "
+      "libraries"));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/xla/service/gpu/nvptx_compiler.cc b/xla/service/gpu/nvptx_compiler.cc
index ce0c182770814..17833f1f444f2 100644
--- a/xla/service/gpu/nvptx_compiler.cc
+++ b/xla/service/gpu/nvptx_compiler.cc
@@ -207,13 +207,17 @@ absl::Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization(
   pipeline.AddPass<FloatNormalization>(&matmul_bf16_support);
 
   pipeline.AddPass<GpusolverRewriter>();
-  pipeline.AddPass<ConvRewriter>(cuda_compute_capability);
-  pipeline.AddPass<CudnnFusedConvRewriter>(cuda_compute_capability, dnn_version,
-                                           toolkit_version);
-  pipeline.AddPass<ConvPaddingLegalization>();
-  pipeline.AddPass<CudnnPadForConvolutions>(cuda_compute_capability);
-  pipeline.AddPass<CudnnVectorizeConvolutions>(cuda_compute_capability,
-                                               dnn_version);
+  if (!hlo_module->config()
+           .debug_options()
+           .xla_gpu_experimental_disable_binary_libraries()) {
+    pipeline.AddPass<ConvRewriter>(cuda_compute_capability);
+    pipeline.AddPass<CudnnFusedConvRewriter>(cuda_compute_capability,
+                                             dnn_version, toolkit_version);
+    pipeline.AddPass<ConvPaddingLegalization>();
+    pipeline.AddPass<CudnnPadForConvolutions>(cuda_compute_capability);
+    pipeline.AddPass<CudnnVectorizeConvolutions>(cuda_compute_capability,
+                                                 dnn_version);
+  }
   // The conv padding/vectorization passes which we need to get rid of. They
   // also leave behind unnecessary tuple/get-tuple-element pairs that
   // TupleSimplifier fixes.
@@ -228,12 +232,16 @@ absl::Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization(
   pipeline.AddPass<HloPassFix<GpuAlgebraicSimplifier>>(algsimp_options,
                                                        gpu_version);
 
-  // CudnnSimplifyPadding gets rid of some padding introduced by
-  // CudnnPadForConvolutions and used by CudnnVectorizeConvolutions. The
-  // pattern-matches in this pass need to be run after inlining and simplifying
-  // tuples from CudnnVectorizeConvolutions. We also need to run algsimp to
-  // e.g. clean up unnecessary nop `convert`s.
-  pipeline.AddPass<CudnnSimplifyPadding>();
+  if (!hlo_module->config()
+           .debug_options()
+           .xla_gpu_experimental_disable_binary_libraries()) {
+    // CudnnSimplifyPadding gets rid of some padding introduced by
+    // CudnnPadForConvolutions and used by CudnnVectorizeConvolutions. The
+    // pattern-matches in this pass need to be run after inlining and
+    // simplifying tuples from CudnnVectorizeConvolutions. We also need to run
+    // algsimp to e.g. clean up unnecessary nop `convert`s.
+    pipeline.AddPass<CudnnSimplifyPadding>();
+  }
 
   // tf2xla bridge, DepthwiseConvolutionConverter, ConvRewriter, and
   // CudnnSimplifyPadding introduce reshapes and transposes. Run ReshapeMover
@@ -275,7 +283,10 @@ absl::Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
   auto cuda_compute_capability = std::get<se::CudaComputeCapability>(
       gpu_target_config.device_description.gpu_compute_capability());
 
-  if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_fmha()) {
+  if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_fmha() &&
+      !hlo_module->config()
+           .debug_options()
+           .xla_gpu_experimental_disable_binary_libraries()) {
     HloPassPipeline mha_fusion_pipeline(
         "nvptx cudnn multi-headed attention fusion");
     // The LayoutAssignment pass may leave behind kCopy instructions which are
@@ -314,7 +325,10 @@ absl::Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
   }
 
   HloPassPipeline pre_pipeline("nvptx post-layout_assignment part 1");
-  if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_layer_norm()) {
+  if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_layer_norm() &&
+      !hlo_module->config()
+           .debug_options()
+           .xla_gpu_experimental_disable_binary_libraries()) {
     // Rewrite normalization patterns into cuDNN Custom Calls.
     pre_pipeline.AddPass<CudnnNormRewriter>(cuda_compute_capability);
   }
@@ -322,12 +336,17 @@ absl::Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
   pre_pipeline.AddPass<DotDimensionMerger>();
   pre_pipeline.AddPass<DotSparsityRewriter>();
 
-  for (const CublasPaddingRequirement& requirement :
-       CublasPaddingRequirements) {
-    if (cuda_compute_capability.IsAtLeast(requirement.min_compute_capability)) {
-      pre_pipeline.AddPass<CublasPadForGemms>(cuda_compute_capability,
-                                              requirement.data_type,
-                                              requirement.multiple_of);
+  if (!hlo_module->config()
+           .debug_options()
+           .xla_gpu_experimental_disable_binary_libraries()) {
+    for (const CublasPaddingRequirement& requirement :
+         CublasPaddingRequirements) {
+      if (cuda_compute_capability.IsAtLeast(
+              requirement.min_compute_capability)) {
+        pre_pipeline.AddPass<CublasPadForGemms>(cuda_compute_capability,
+                                                requirement.data_type,
+                                                requirement.multiple_of);
+      }
     }
   }
   // Padding a gemm operand that's a constant results in pad(constant). Run
@@ -397,6 +416,11 @@ absl::Status NVPTXCompiler::AddCustomKernelReplacementPasses(
 absl::Status NVPTXCompiler::RunCudnnCompilerPasses(
     HloModule* module, se::StreamExecutor* stream_exec,
     BinaryMap* dnn_compiled_graphs) {
+  if (module->config()
+          .debug_options()
+          .xla_gpu_experimental_disable_binary_libraries()) {
+    return absl::OkStatus();
+  }
   tsl::profiler::ScopedAnnotation annotation([&] {
     return absl::StrFormat("XlaCompileCudnnFusion:#module=%s,program_id=%d#",
                            module->name(), module->unique_id());
diff --git a/xla/xla.proto b/xla/xla.proto
index 00543e3b3313f..d5384f52d52e8 100644
--- a/xla/xla.proto
+++ b/xla/xla.proto
@@ -117,6 +117,9 @@ message DebugOptions {
   // Specifies the behavior of per kernel autotuning cache.
   AutotuneCacheMode xla_gpu_experimental_autotune_cache_mode = 324;
 
+  // Experimentally disables binary libraries in GPU compiler passes.
+  bool xla_gpu_experimental_disable_binary_libraries = 329;
+
   // Gates the experimental feature coupling the Triton Softmax pattern matcher
   // with priority fusion.
   bool xla_gpu_experimental_enable_triton_softmax_priority_fusion = 325;
@@ -971,7 +974,7 @@ message DebugOptions {
   int32 xla_gpu_executable_warn_stuck_timeout_seconds = 327;
   int32 xla_gpu_executable_terminate_timeout_seconds = 328;
 
-  // Next id: 329
+  // Next id: 330
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
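
Usage note (not part of the patch): like other DebugOptions flags, the new option can be set through the XLA_FLAGS environment variable (--xla_gpu_experimental_disable_binary_libraries=true) or programmatically. The snippet below is only a minimal sketch of the programmatic route; the helper function and include paths are assumptions layered on the existing GetDebugOptionsFromFlags()/HloModuleConfig plumbing.

    // Sketch: build an HloModuleConfig that opts out of compiler passes which
    // depend on closed-source binary libraries (cuDNN/cuBLAS rewrites).
    #include "xla/debug_options_flags.h"
    #include "xla/service/hlo_module_config.h"

    xla::HloModuleConfig MakeConfigWithoutBinaryLibraries() {
      // Start from whatever XLA_FLAGS / defaults already specify.
      xla::DebugOptions debug_options = xla::GetDebugOptionsFromFlags();
      debug_options.set_xla_gpu_experimental_disable_binary_libraries(true);

      xla::HloModuleConfig config;
      config.set_debug_options(debug_options);
      return config;
    }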