From a414eb87a7d1c231119ec74eeb8db53cda9b47e9 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Tue, 17 Sep 2024 17:27:53 +0000 Subject: [PATCH] Adding Strictness level to PGLE accuracy checker. Two flags control the behavior now. * `xla_gpu_pgle_profile_file_or_directory_path` unspecified, `xla_gpu_strict_pgle_accuracy_checker` off: this means that PGLE will not be used. * `xla_gpu_pgle_profile_file_or_directory_path` specified, `xla_gpu_strict_pgle_accuracy_checker` off: this means that PGLE will log warnings about accuracy checker failures (e.g. instructions missing from the profile), but compilation will continue with the profile. * `xla_gpu_pgle_profile_file_or_directory_path` specified, `xla_gpu_strict_pgle_accuracy_checker` on: this means that PGLE will error out if the accuracy checker fails. * `xla_gpu_pgle_profile_file_or_directory_path` unspecified, `xla_gpu_strict_pgle_accuracy_checker` on: this is an invalid flag combination; when the latency hiding scheduler is enabled, scheduling returns an InvalidArgument error. --- xla/debug_options_flags.cc | 8 ++--- xla/service/gpu/gpu_hlo_schedule.cc | 29 ++++++++++--------- xla/service/gpu/gpu_hlo_schedule_test.cc | 27 ++++++++++++++++- .../profile_guided_latency_estimator.cc | 20 ++++++++----- xla/xla.proto | 2 +- 5 files changed, 58 insertions(+), 28 deletions(-) diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc index 06daea4fd1727c..e38d269f38adbd 100644 --- a/xla/debug_options_flags.cc +++ b/xla/debug_options_flags.cc @@ -288,7 +288,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_enable_triton_gemm_int4(false); - opts.set_xla_gpu_enable_pgle_accuracy_checker(false); + opts.set_xla_gpu_strict_pgle_accuracy_checker(false); opts.set_xla_gpu_executable_warn_stuck_timeout_seconds(10); opts.set_xla_gpu_executable_terminate_timeout_seconds(30); @@ -1919,9 +1919,9 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "a training. 
The location of the marker (if any) is determined " "by the option value of type DebugOptions::StepMarkerLocation.")); flag_list->push_back(tsl::Flag( - "xla_gpu_enable_pgle_accuracy_checker", - bool_setter_for(&DebugOptions::set_xla_gpu_enable_pgle_accuracy_checker), - debug_options->xla_gpu_enable_pgle_accuracy_checker(), + "xla_gpu_strict_pgle_accuracy_checker", + bool_setter_for(&DebugOptions::set_xla_gpu_strict_pgle_accuracy_checker), + debug_options->xla_gpu_strict_pgle_accuracy_checker(), "Enables strict PGLE checking. If an FDO profile is specified and " "latency hiding scheduler encounters missing instructions in the profile " "compilation will halt.")); diff --git a/xla/service/gpu/gpu_hlo_schedule.cc b/xla/service/gpu/gpu_hlo_schedule.cc index be25cd091e6cbc..673262eba6795b 100644 --- a/xla/service/gpu/gpu_hlo_schedule.cc +++ b/xla/service/gpu/gpu_hlo_schedule.cc @@ -458,15 +458,24 @@ absl::StatusOr ScheduleGpuModule( VLOG(1) << "Fingerprint before LHS for module " << module->name() << "(" << module->unique_id() << ") = " << fingerprint; + const DebugOptions& options = module->config().debug_options(); + const bool enable_latency_hiding_scheduler = - module->config() - .debug_options() - .xla_gpu_enable_latency_hiding_scheduler(); + options.xla_gpu_enable_latency_hiding_scheduler(); if (!enable_latency_hiding_scheduler) { return ScheduleMetadata{memory_limit}; } + VLOG(0) << "Here"; + + if (options.xla_gpu_pgle_profile_file_or_directory_path().empty() && + options.xla_gpu_strict_pgle_accuracy_checker()) { + return absl::InvalidArgumentError( + "xla_gpu_strict_pgle_accuracy_checker is turned on, but no profile " + "path specified in xla_gpu_pgle_profile_file_or_directory_path"); + } + SchedulerConfig config = GetSchedulerConfig(memory_limit); auto gpu_latency_estimator = std::make_unique(pointer_size); @@ -476,9 +485,7 @@ absl::StatusOr ScheduleGpuModule( ReadPGLEProfile(module, fingerprint); const bool enable_analytical_latency_estimator = - 
module->config() - .debug_options() - .xla_gpu_enable_analytical_latency_estimator(); + options.xla_gpu_enable_analytical_latency_estimator(); HloPassPipeline pipeline("latency-hiding-scheduler"); if (profile.has_value()) { auto aggregator = std::make_unique(); @@ -487,11 +494,7 @@ absl::StatusOr ScheduleGpuModule( std::move(aggregator)); LOG(INFO) << "Found profile, using profile guided latency estimator"; VLOG(1) << "Profile:\n" << profile->DebugString(); - if (module->config() - .debug_options() - .xla_gpu_enable_pgle_accuracy_checker()) { - pipeline.AddPass(*pg_latency_estimator); - } + pipeline.AddPass(*pg_latency_estimator); latency_estimator = std::move(pg_latency_estimator); } else if (enable_analytical_latency_estimator) { latency_estimator = std::make_unique( @@ -506,9 +509,7 @@ absl::StatusOr ScheduleGpuModule( } auto async_tracker = [&]() -> std::unique_ptr { - return module->config() - .debug_options() - .xla_gpu_lhs_enable_gpu_async_tracker() + return options.xla_gpu_lhs_enable_gpu_async_tracker() ? 
std::make_unique(config) : std::make_unique(config); }(); diff --git a/xla/service/gpu/gpu_hlo_schedule_test.cc b/xla/service/gpu/gpu_hlo_schedule_test.cc index 0f9c8412bcdfc1..f25f2fb7cf1761 100644 --- a/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -539,7 +539,7 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelFailsWithIncompleteProfile) { HloModuleConfig config(module->config()); DebugOptions dboptions(config.debug_options()); - dboptions.set_xla_gpu_enable_pgle_accuracy_checker(true); + dboptions.set_xla_gpu_strict_pgle_accuracy_checker(true); config.set_debug_options(dboptions); module->set_config(config); @@ -1637,5 +1637,30 @@ TEST_F(GpuHloScheduleTest, AsyncOps) { HloOpcode::kAsyncDone, HloOpcode::kAdd)); } +TEST_F(GpuHloScheduleTest, InvalidPGLEOptions) { + const char* hlo = R"( + HloModule test + ENTRY add { + a = s32[] parameter(0) + b = s32[] parameter(1) + ROOT add = add(a,b) + } + )"; + + HloModuleConfig config; + DebugOptions options; + options.set_xla_gpu_strict_pgle_accuracy_checker(true); + options.set_xla_gpu_enable_latency_hiding_scheduler(true); + config.set_debug_options(options); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo, config)); + + GTEST_FLAG_SET(death_test_style, "threadsafe"); + EXPECT_DEATH( + BuildHloOrdering(module.get()), + "xla_gpu_strict_pgle_accuracy_checker is turned on, but no profile path " + "specified in xla_gpu_pgle_profile_file_or_directory_path"); +} + } // namespace gpu } // namespace xla diff --git a/xla/service/profile_guided_latency_estimator.cc b/xla/service/profile_guided_latency_estimator.cc index d8e20f2445c4a4..e2f21b991b64cf 100644 --- a/xla/service/profile_guided_latency_estimator.cc +++ b/xla/service/profile_guided_latency_estimator.cc @@ -188,16 +188,20 @@ absl::Status ProfileGuidedLatencyEstimator::CheckAccuracy( ProfileStatisticsAggregator::Statistics stats = aggregator_->GetStats(); size_t missing_instructions_count 
= stats.missing_instructions.size(); if (missing_instructions_count > 0) { - LOG(ERROR) << "Found " << stats.found_instructions_count - << " instructions from the profile."; - LOG(ERROR) << "Missing " << missing_instructions_count - << " instructions from the profile."; + LOG(WARNING) << "Found " << stats.found_instructions_count + << " instructions from the profile."; + LOG(WARNING) << "Missing " << missing_instructions_count + << " instructions from the profile."; for (const HloInstruction* instr : stats.missing_instructions) { - LOG(ERROR) << " " << instr->name(); + LOG(WARNING) << " " << instr->name(); + } + if (module.config() + .debug_options() + .xla_gpu_strict_pgle_accuracy_checker()) { + return absl::InvalidArgumentError( + absl::StrCat("Found ", missing_instructions_count, + " missing instructions. Discarding the profile.")); } - return absl::InvalidArgumentError( - absl::StrCat("Found ", missing_instructions_count, - " missing instructions. Discarding the profile.")); } return absl::OkStatus(); } diff --git a/xla/xla.proto b/xla/xla.proto index b6fc5c9bb454d8..b528d9585aed70 100644 --- a/xla/xla.proto +++ b/xla/xla.proto @@ -965,7 +965,7 @@ message DebugOptions { // Enables strict PGLE checking. If an FDO profile is specified and latency // hiding scheduler encounters missing instructions in the profile // compilation will halt. - bool xla_gpu_enable_pgle_accuracy_checker = 326; + bool xla_gpu_strict_pgle_accuracy_checker = 326; // Timeouts for RendezvousSingle stuck warning and termination. int32 xla_gpu_executable_warn_stuck_timeout_seconds = 327;