From a414eb87a7d1c231119ec74eeb8db53cda9b47e9 Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <svaishay@nvidia.com>
Date: Tue, 17 Sep 2024 17:27:53 +0000
Subject: [PATCH] Adding Strictness level to PGLE accuracy checker.

Two flags control the behavior now.

 * `xla_gpu_pgle_profile_file_or_directory_path` unspecified,
   `xla_gpu_strict_pgle_accuracy_checker` off: PGLE is not used.
 * `xla_gpu_pgle_profile_file_or_directory_path` specified,
   `xla_gpu_strict_pgle_accuracy_checker` off: PGLE warns about accuracy
   checker failures, such as instructions missing from the profile, but
   continues compiling with the profile.
 * `xla_gpu_pgle_profile_file_or_directory_path` specified,
   `xla_gpu_strict_pgle_accuracy_checker` on: PGLE errors out if the
   accuracy checker fails.
 * `xla_gpu_pgle_profile_file_or_directory_path` unspecified,
   `xla_gpu_strict_pgle_accuracy_checker` on: this is an invalid flag
   combination and scheduling returns an InvalidArgument error.
---
 xla/debug_options_flags.cc                    |  8 ++---
 xla/service/gpu/gpu_hlo_schedule.cc           | 29 ++++++++++---------
 xla/service/gpu/gpu_hlo_schedule_test.cc      | 27 ++++++++++++++++-
 .../profile_guided_latency_estimator.cc       | 20 ++++++++-----
 xla/xla.proto                                 |  2 +-
 5 files changed, 58 insertions(+), 28 deletions(-)
diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc
index 06daea4fd1727c..e38d269f38adbd 100644
--- a/xla/debug_options_flags.cc
+++ b/xla/debug_options_flags.cc
@@ -288,7 +288,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   opts.set_xla_gpu_enable_triton_gemm_int4(false);
 
-  opts.set_xla_gpu_enable_pgle_accuracy_checker(false);
+  opts.set_xla_gpu_strict_pgle_accuracy_checker(false);
 
   opts.set_xla_gpu_executable_warn_stuck_timeout_seconds(10);
   opts.set_xla_gpu_executable_terminate_timeout_seconds(30);
@@ -1919,9 +1919,9 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "a training. The location of the marker (if any) is determined "
       "by the option value of type DebugOptions::StepMarkerLocation."));
   flag_list->push_back(tsl::Flag(
-      "xla_gpu_enable_pgle_accuracy_checker",
-      bool_setter_for(&DebugOptions::set_xla_gpu_enable_pgle_accuracy_checker),
-      debug_options->xla_gpu_enable_pgle_accuracy_checker(),
+      "xla_gpu_strict_pgle_accuracy_checker",
+      bool_setter_for(&DebugOptions::set_xla_gpu_strict_pgle_accuracy_checker),
+      debug_options->xla_gpu_strict_pgle_accuracy_checker(),
       "Enables strict PGLE checking. If an FDO profile is specified and "
       "latency hiding scheduler encounters missing instructions in the profile "
       "compilation will halt."));
diff --git a/xla/service/gpu/gpu_hlo_schedule.cc b/xla/service/gpu/gpu_hlo_schedule.cc
index be25cd091e6cbc..673262eba6795b 100644
--- a/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/xla/service/gpu/gpu_hlo_schedule.cc
@@ -458,15 +458,24 @@ absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
   VLOG(1) << "Fingerprint before LHS for module " << module->name() << "("
           << module->unique_id() << ") = " << fingerprint;
 
+  const DebugOptions& options = module->config().debug_options();
+
   const bool enable_latency_hiding_scheduler =
-      module->config()
-          .debug_options()
-          .xla_gpu_enable_latency_hiding_scheduler();
+      options.xla_gpu_enable_latency_hiding_scheduler();
 
   if (!enable_latency_hiding_scheduler) {
     return ScheduleMetadata{memory_limit};
   }
 
+  // Strict accuracy checking validates a PGLE profile, so enabling it without
+  // a profile path is an invalid flag combination; reject it up front.
+  if (options.xla_gpu_pgle_profile_file_or_directory_path().empty() &&
+      options.xla_gpu_strict_pgle_accuracy_checker()) {
+    return absl::InvalidArgumentError(
+        "xla_gpu_strict_pgle_accuracy_checker is turned on, but no profile "
+        "path specified in xla_gpu_pgle_profile_file_or_directory_path");
+  }
+
   SchedulerConfig config = GetSchedulerConfig(memory_limit);
   auto gpu_latency_estimator =
       std::make_unique<GpuLatencyEstimator>(pointer_size);
@@ -476,9 +485,7 @@ absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
       ReadPGLEProfile(module, fingerprint);
 
   const bool enable_analytical_latency_estimator =
-      module->config()
-          .debug_options()
-          .xla_gpu_enable_analytical_latency_estimator();
+      options.xla_gpu_enable_analytical_latency_estimator();
   HloPassPipeline pipeline("latency-hiding-scheduler");
   if (profile.has_value()) {
     auto aggregator = std::make_unique<GPUProfileStatisticsAggregator>();
@@ -487,11 +494,7 @@ absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
         std::move(aggregator));
     LOG(INFO) << "Found profile, using profile guided latency estimator";
     VLOG(1) << "Profile:\n" << profile->DebugString();
-    if (module->config()
-            .debug_options()
-            .xla_gpu_enable_pgle_accuracy_checker()) {
-      pipeline.AddPass<PGLEAccuracyChecker>(*pg_latency_estimator);
-    }
+    pipeline.AddPass<PGLEAccuracyChecker>(*pg_latency_estimator);
     latency_estimator = std::move(pg_latency_estimator);
   } else if (enable_analytical_latency_estimator) {
     latency_estimator = std::make_unique<AnalyticalLatencyEstimator>(
@@ -506,9 +509,7 @@ absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
   }
 
   auto async_tracker = [&]() -> std::unique_ptr<AsyncTracker> {
-    return module->config()
-                   .debug_options()
-                   .xla_gpu_lhs_enable_gpu_async_tracker()
+    return options.xla_gpu_lhs_enable_gpu_async_tracker()
                ? std::make_unique<GpuAsyncTracker>(config)
                : std::make_unique<GpuAsyncTrackerBase>(config);
   }();
diff --git a/xla/service/gpu/gpu_hlo_schedule_test.cc b/xla/service/gpu/gpu_hlo_schedule_test.cc
index 0f9c8412bcdfc1..f25f2fb7cf1761 100644
--- a/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -539,7 +539,7 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelFailsWithIncompleteProfile) {
 
   HloModuleConfig config(module->config());
   DebugOptions dboptions(config.debug_options());
-  dboptions.set_xla_gpu_enable_pgle_accuracy_checker(true);
+  dboptions.set_xla_gpu_strict_pgle_accuracy_checker(true);
   config.set_debug_options(dboptions);
   module->set_config(config);
 
@@ -1637,5 +1637,30 @@ TEST_F(GpuHloScheduleTest, AsyncOps) {
                           HloOpcode::kAsyncDone, HloOpcode::kAdd));
 }
 
+TEST_F(GpuHloScheduleTest, InvalidPGLEOptions) {
+  const char* hlo = R"(
+    HloModule test
+    ENTRY add {
+      a = s32[] parameter(0)
+      b = s32[] parameter(1)
+      ROOT add = add(a,b)
+    }
+  )";
+  // Turn on strict checking and the scheduler, but set no profile path.
+  HloModuleConfig config;
+  DebugOptions options;
+  options.set_xla_gpu_strict_pgle_accuracy_checker(true);
+  options.set_xla_gpu_enable_latency_hiding_scheduler(true);
+  config.set_debug_options(options);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(hlo, config));
+  // Expect the process to die with the invalid-flag-combination message.
+  GTEST_FLAG_SET(death_test_style, "threadsafe");
+  EXPECT_DEATH(
+      BuildHloOrdering(module.get()),
+      "xla_gpu_strict_pgle_accuracy_checker is turned on, but no profile path "
+      "specified in xla_gpu_pgle_profile_file_or_directory_path");
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/xla/service/profile_guided_latency_estimator.cc b/xla/service/profile_guided_latency_estimator.cc
index d8e20f2445c4a4..e2f21b991b64cf 100644
--- a/xla/service/profile_guided_latency_estimator.cc
+++ b/xla/service/profile_guided_latency_estimator.cc
@@ -188,16 +188,20 @@ absl::Status ProfileGuidedLatencyEstimator::CheckAccuracy(
   ProfileStatisticsAggregator::Statistics stats = aggregator_->GetStats();
   size_t missing_instructions_count = stats.missing_instructions.size();
   if (missing_instructions_count > 0) {
-    LOG(ERROR) << "Found " << stats.found_instructions_count
-               << " instructions from the profile.";
-    LOG(ERROR) << "Missing " << missing_instructions_count
-               << " instructions from the profile.";
+    LOG(WARNING) << "Found " << stats.found_instructions_count
+                 << " instructions from the profile.";
+    LOG(WARNING) << "Missing " << missing_instructions_count
+                 << " instructions from the profile.";
     for (const HloInstruction* instr : stats.missing_instructions) {
-      LOG(ERROR) << "  " << instr->name();
+      LOG(WARNING) << "  " << instr->name();
+    }
+    if (module.config()
+            .debug_options()
+            .xla_gpu_strict_pgle_accuracy_checker()) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Found ", missing_instructions_count,
+                       " missing instructions. Discarding the profile."));
     }
-    return absl::InvalidArgumentError(
-        absl::StrCat("Found ", missing_instructions_count,
-                     " missing instructions. Discarding the profile."));
   }
   return absl::OkStatus();
 }
diff --git a/xla/xla.proto b/xla/xla.proto
index b6fc5c9bb454d8..b528d9585aed70 100644
--- a/xla/xla.proto
+++ b/xla/xla.proto
@@ -965,7 +965,7 @@ message DebugOptions {
   // Enables strict PGLE checking. If an FDO profile is specified and latency
   // hiding scheduler encounters missing instructions in the profile
   // compilation will halt.
-  bool xla_gpu_enable_pgle_accuracy_checker = 326;
+  bool xla_gpu_strict_pgle_accuracy_checker = 326;
 
   // Timeouts for RendezvousSingle stuck warning and termination.
   int32 xla_gpu_executable_warn_stuck_timeout_seconds = 327;