Cost model now considers the compute latency in addition to its throu…

…ghput. PiperOrigin-RevId: 671505921
openxla · Sep 13, 2024 · 728baa6 · 728baa6
1 parent 32ebd69
commit 728baa6
Show file tree

Hide file tree

Showing 19 changed files with 103 additions and 11 deletions.
diff --git a/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
@@ -267,6 +267,7 @@ int64_t PriorityFusionShapeSize(const Shape& shape) {
 HloCostAnalysis::Options PriorityFusionOptions() {
   return {/*shape_size=*/PriorityFusionShapeSize,
           /*per_second_rates=*/{},
+          /*min_latencies_seconds=*/{},
           /*count_multiple_input_accesses=*/true};
 }
 

diff --git a/xla/service/gpu/fusion_pipeline.cc b/xla/service/gpu/fusion_pipeline.cc
@@ -62,6 +62,7 @@ HloPassPipeline FusionPipeline(
     GpuHloCostAnalysis::Options cost_analysis_options{
         shape_size_bytes_function,
         /*per_second_rates=*/{},
+        /*min_latencies_seconds=*/{},
         /*count_multiple_input_accesses=*/true};
     fusion.AddPass<PriorityFusion>(thread_pool, gpu_device_info,
                                    std::move(cost_analysis_options));

diff --git a/xla/service/gpu/gpu_compiler.cc b/xla/service/gpu/gpu_compiler.cc
@@ -1010,6 +1010,7 @@ absl::Status RunFusionPasses(HloModule* hlo_module,
     GpuHloCostAnalysis::Options cost_analysis_options{
         shape_size_fn,
         /*per_second_rates=*/{},
+        /*min_latencies_seconds=*/{},
         /*count_multiple_input_accesses=*/true};
 
     HloPassPipeline post_fusion_analysis("post_fusion_analysis");

diff --git a/xla/service/gpu/model/analytical_latency_estimator.cc b/xla/service/gpu/model/analytical_latency_estimator.cc
@@ -86,6 +86,7 @@ AnalyticalLatencyEstimator::AnalyticalLatencyEstimator(
   cost_analysis_.emplace(
       GpuHloCostAnalysis::Options{shape_size_function_,
                                   /*per_second_rates=*/{},
+                                  /*min_latencies_seconds=*/{},
                                   /*count_multiple_input_accesses=*/true},
       gpu_info_);
   TF_CHECK_OK(computation->Accept(&cost_analysis_.value()));

diff --git a/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc b/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc
@@ -47,6 +47,7 @@ class GpuCostModelStatsCollectionTest : public HloTestBase {
       TestGpuDeviceInfo::RTXA6000DeviceInfo(),
       GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(),
                                   /*per_second_rates=*/{},
+                                  /*min_latencies_seconds=*/{},
                                   /*count_multiple_input_accesses=*/true}};
 };
 

diff --git a/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc b/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc
@@ -45,6 +45,7 @@ class GpuHloCostAnalysisTest : public HloTestBase {
  public:
   HloCostAnalysis::Options options_{ShapeSizeBytesFunction(),
                                     /*per_second_rates=*/{},
+                                    /*min_latencies_seconds=*/{},
                                     /*count_multiple_input_accesses=*/true};
   GpuHloCostAnalysis analysis_{options_};
   GpuHloCostAnalysisTest() : HloTestBase() {}

diff --git a/xla/service/gpu/model/gpu_indexing_performance_model.h b/xla/service/gpu/model/gpu_indexing_performance_model.h
@@ -66,6 +66,7 @@ class GpuPerformanceModelWithIndexingAnalysis : public GpuPerformanceModelBase {
         cost_analysis_(
             GpuHloCostAnalysis::Options{shape_size_,
                                         /*per_second_rates=*/{},
+                                        /*min_latencies_seconds=*/{},
                                         /*count_multiple_input_accesses=*/true},
             *device_info_),
         mlir_context_(mlir_context) {}

diff --git a/xla/service/gpu/model/gpu_indexing_performance_model_test.cc b/xla/service/gpu/model/gpu_indexing_performance_model_test.cc
@@ -553,6 +553,7 @@ class FlopsPerElementTest : public GpuIndexingPerformanceModelTest {
     GpuHloCostAnalysis cost_analysis(
         GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(),
                                     /*per_second_rates=*/{},
+                                    /*min_latencies_seconds=*/{},
                                     /*count_multiple_input_accesses=*/true},
         device_info_);
 

diff --git a/xla/service/gpu/model/gpu_performance_model_base_test.cc b/xla/service/gpu/model/gpu_performance_model_base_test.cc
@@ -47,6 +47,7 @@ class GpuPerformanceModelBaseTest : public HloTestBase {
 
   GpuHloCostAnalysis::Options options_{ShapeSizeBytesFunction(),
                                        /*per_second_rates=*/{},
+                                       /*min_latencies_seconds=*/{},
                                        /*count_multiple_input_accesses=*/true};
   // The reference times in the test cases below are measured
   // on A6000 by profiling the execution of the HLOs.

diff --git a/xla/service/gpu/model/gpu_performance_model_test.cc b/xla/service/gpu/model/gpu_performance_model_test.cc
@@ -87,6 +87,7 @@ class GpuPerformanceModelTest : public HloTestBase {
   mlir::MLIRContext mlir_context_;
   GpuHloCostAnalysis::Options options_{ShapeSizeBytesFunction(),
                                        /*per_second_rates=*/{},
+                                       /*min_latencies_seconds=*/{},
                                        /*count_multiple_input_accesses=*/true};
   // The reference times in the test cases below are measured
   // on A6000 by profiling the execution of the HLOs.

diff --git a/xla/service/gpu/transforms/fusion_merger.cc b/xla/service/gpu/transforms/fusion_merger.cc
@@ -276,6 +276,7 @@ FusionDecision FusionInstructionMerger::ShouldFuse(HloInstruction* producer) {
     cost_analysis_.emplace(
         GpuHloCostAnalysis::Options{shape_size_function_,
                                     /*per_second_rates=*/{},
+                                    /*min_latencies_seconds=*/{},
                                     /*count_multiple_input_accesses=*/true},
         gpu_device_info_);
     TF_CHECK_OK(computation_->Accept(&cost_analysis_.value()));

diff --git a/xla/service/gpu/transforms/multi_output_fusion.cc b/xla/service/gpu/transforms/multi_output_fusion.cc
@@ -407,6 +407,7 @@ absl::StatusOr<bool> MultiOutputFusion::DoMultiOutputFusion() {
   RecomputeReachability();
   GpuHloCostAnalysis cost_analysis({shape_size_function_,
                                     /*per_second_rates=*/{},
+                                    /*min_latencies_seconds=*/{},
                                     /*count_multiple_input_accesses=*/true},
                                    device_info_);
   TF_RETURN_IF_ERROR(computation_->Accept(&cost_analysis));

diff --git a/xla/service/gpu/transforms/priority_fusion_test.cc b/xla/service/gpu/transforms/priority_fusion_test.cc
@@ -84,6 +84,7 @@ class PriorityFusionTest : public HloTestBase {
       /*thread_pool=*/nullptr, TestGpuDeviceInfo::RTXA6000DeviceInfo(),
       GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(),
                                   /*per_second_rates=*/{},
+                                  /*min_latencies_seconds=*/{},
                                   /*count_multiple_input_accesses=*/true}};
 };
 

diff --git a/xla/service/gpu/transforms/softmax_rewriter_triton.cc b/xla/service/gpu/transforms/softmax_rewriter_triton.cc
@@ -473,6 +473,7 @@ EstimateOptimizedHloRunTimeWithoutSoftMaxRewriterTriton(
   GpuHloCostAnalysis::Options cost_analysis_options{
       shape_size,
       /*per_second_rates=*/{},
+      /*min_latencies_seconds=*/{},
       /*count_multiple_input_accesses=*/true};
   GpuHloCostAnalysis cost_analysis(cost_analysis_options, device_info);
   TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));

diff --git a/xla/service/hlo_cost_analysis.cc b/xla/service/hlo_cost_analysis.cc
@@ -43,9 +43,13 @@ limitations under the License.
 namespace xla {
 
 HloCostAnalysis::HloCostAnalysis(const Options& options) : options_(options) {}
+// TODO(mehrdadk): merge all constructors into HloCostAnalysis(const Options&
+// options)
 HloCostAnalysis::HloCostAnalysis(ShapeSizeFunction shape_size,
-                                 const Properties& per_second_rates)
-    : HloCostAnalysis(Options{shape_size, per_second_rates}) {}
+                                 const Properties& per_second_rates,
+                                 const Properties& min_latencies_seconds)
+    : HloCostAnalysis(
+          Options{shape_size, per_second_rates, min_latencies_seconds}) {}
 
 absl::Status HloCostAnalysis::Preprocess(const HloInstruction* hlo) {
   // Set current instruction cost values to reasonable default values. Each
@@ -82,7 +86,9 @@ absl::Status HloCostAnalysis::Postprocess(const HloInstruction* hlo) {
       }
       float per_second_rate = options_.per_second_rate(key);
       if (per_second_rate != 0) {
-        optimal_seconds = std::max(optimal_seconds, val / per_second_rate);
+        float time_for_key =
+            std::max(val / per_second_rate, options_.min_latency_seconds(key));
+        optimal_seconds = std::max(optimal_seconds, time_for_key);
       }
     });
     current_properties_[kOptimalSecondsKey] = optimal_seconds;

diff --git a/xla/service/hlo_cost_analysis.h b/xla/service/hlo_cost_analysis.h
@@ -405,6 +405,11 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
     // property is bytes accessed, this is the number of bytes that can be
     // processed per second. Is empty if no rates have been set.
     Properties per_second_rates = {};
+    // The minimum amount of time (in seconds) required to process per each
+    // property. Hardware design choices (e.g., clock speeds, memory access
+    // latencies) impose a lower bound on the duration of any operation, even
+    // the simplest ones.
+    Properties min_latencies_seconds;
     // Operations like broadcast with reused inputs are not handled
     // efficiently on some platforms. Depending on the goal of the analysis
     // we may need to count or ignore them.
@@ -414,31 +419,44 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
     void set_flops_per_second(float value) {
       per_second_rates[kFlopsKey] = value;
     }
+    void set_flops_min_latency_second(float value) {
+      min_latencies_seconds[kFlopsKey] = value;
+    }
     void set_transcendentals_per_second(float value) {
       per_second_rates[kTranscendentalsKey] = value;
     }
     void set_bytes_per_second(float value) {
       per_second_rates[kBytesAccessedKey] = value;
     }
+    void set_bytes_min_latency_second(float value) {
+      min_latencies_seconds[kBytesAccessedKey] = value;
+    }
 
     // Returns the specified per-second rate used by cost analysis.
     float per_second_rate(absl::string_view key) const {
       return per_second_rates[key];
     }
 
+    float min_latency_seconds(absl::string_view key) const {
+      return min_latencies_seconds[key];
+    }
+
     std::string ToString() const {
       return absl::StrFormat(
           "HloCostAnalysis::Options{\n"
           " per_second_rates: %s\n"
+          " min_latency_seconds: %s\n"
           " count_multiple_input_accesses: %d\n"
           "}",
-          per_second_rates.ToString(), count_multiple_input_accesses);
+          per_second_rates.ToString(), min_latencies_seconds.ToString(),
+          count_multiple_input_accesses);
     }
   };
 
   explicit HloCostAnalysis(const Options& options);
   explicit HloCostAnalysis(ShapeSizeFunction shape_size,
-                           const Properties& per_second_rates = {});
+                           const Properties& per_second_rates = {},
+                           const Properties& min_latency_seconds = {});
 
   // For all element-wise instruction we call HandleElementwiseOp. If necessary,
   // override HandleElementwiseOp instead.
@@ -594,6 +612,10 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   float per_second_rate(absl::string_view key) const {
     return options_.per_second_rate(key);
   }
+  // Returns the specified minimum latency used by cost analysis.
+  float min_latency_seconds(absl::string_view key) const {
+    return options_.min_latency_seconds(key);
+  }
 
   // Return the key that is used to index into Properties for the specified
   // input/output at the shape index.

diff --git a/xla/service/hlo_cost_analysis_test.cc b/xla/service/hlo_cost_analysis_test.cc
@@ -701,6 +701,32 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) {
   EXPECT_EQ(conv_analysis.flop_count(), matmul_analysis.flop_count());
 }
 
+// No instruction can finish faster than the clock cycle
+TEST_F(HloCostAnalysisTest, LatencyBoundedOptimalTime) {
+  absl::string_view hlo_string = R"(
+  HloModule module, is_scheduled=true
+
+  ENTRY Entry {
+    param0 = f32[1,1] parameter(0)
+    param1 = f32[1,1] parameter(1)
+    ROOT add = f32[1,1] add(param0, param1)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+
+  const HloInstruction* add = module->entry_computation()->root_instruction();
+  HloCostAnalysis::Options options{ShapeSize};
+  const float clock_cycle_seconds = 10.0f;
+  options.set_flops_per_second(1024);
+  options.set_bytes_per_second(1024);
+  options.set_transcendentals_per_second(1024);
+  options.set_flops_min_latency_second(clock_cycle_seconds);
+  HloCostAnalysis cost_analysis(options);
+  ASSERT_IS_OK(add->Accept(&cost_analysis));
+  EXPECT_EQ(cost_analysis.optimal_seconds(), clock_cycle_seconds);
+}
+
 using FusionCostAnalysis = HloTestBase;
 
 TEST_F(FusionCostAnalysis, LoopFusionDynUpdateSlice) {

diff --git a/xla/service/memory_space_assignment/cost_analysis.cc b/xla/service/memory_space_assignment/cost_analysis.cc
@@ -76,8 +76,10 @@ float HloCostAnalysisCosts::BytesPerSecond() {
 
 float HloCostAnalysisCosts::ComputeSeconds(const HloInstruction& instruction) {
   return std::max(
-      static_cast<float>(hlo_cost_analysis_.flop_count(instruction)) /
-          hlo_cost_analysis_.per_second_rate(HloCostAnalysis::kFlopsKey),
+      std::max(
+          hlo_cost_analysis_.min_latency_seconds(HloCostAnalysis::kFlopsKey),
+          static_cast<float>(hlo_cost_analysis_.flop_count(instruction)) /
+              hlo_cost_analysis_.per_second_rate(HloCostAnalysis::kFlopsKey)),
       static_cast<float>(hlo_cost_analysis_.transcendental_count(instruction)) /
           hlo_cost_analysis_.per_second_rate(
               HloCostAnalysis::kTranscendentalsKey));

diff --git a/xla/service/memory_space_assignment/cost_analysis_test.cc b/xla/service/memory_space_assignment/cost_analysis_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/memory_space_assignment/cost_analysis.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <memory>
 
@@ -56,6 +57,7 @@ class MemorySpaceAssignmentCostAnalysisTest : public HloTestBase {
     options.set_flops_per_second(8);
     options.set_bytes_per_second(32);
     options.set_transcendentals_per_second(16);
+    options.set_flops_min_latency_second(1);
     hlo_cost_analysis_ = std::make_unique<HloCostAnalysis>(options);
     TF_RETURN_IF_ERROR(
         module->entry_computation()->Accept(hlo_cost_analysis_.get()));
@@ -90,8 +92,9 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, NoPipelineOverhead) {
   TF_ASSERT_OK(Initialize(module.get()));
 
   const HloInstruction* add = module->entry_computation()->root_instruction();
-  const float expected_compute_elapsed =
-      /*num_flops=*/8 / /*flops_per_second=*/8.0;
+  const float expected_compute_elapsed = std::max(
+      /*num_flops=*/8.0f / /*flops_per_second=*/8.0f,
+      hlo_cost_analysis_->min_latency_seconds(HloCostAnalysis::kFlopsKey));
   LOG(INFO) << "Expected compute elapsed = " << expected_compute_elapsed;
   EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add),
             expected_compute_elapsed);
@@ -161,8 +164,9 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, PipelineOverhead) {
                  /*pipeline_overhead_window_size_mib=*/(64.0 / 1024 / 1024)));
 
   const HloInstruction* add = module->entry_computation()->root_instruction();
-  const float expected_compute_elapsed =
-      /*num_flops=*/8 / /*flops_per_second=*/8.0;
+  const float expected_compute_elapsed = std::max(
+      /*num_flops=*/8.0f / /*flops_per_second=*/8.0f,
+      hlo_cost_analysis_->min_latency_seconds(HloCostAnalysis::kFlopsKey));
   LOG(INFO) << "Expected compute elapsed = " << expected_compute_elapsed;
   EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add),
             expected_compute_elapsed);
@@ -230,5 +234,23 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, PipelineOverhead) {
             expected_compute_elapsed);
 }
 
+TEST_F(MemorySpaceAssignmentCostAnalysisTest, LatencyBoundCompute) {
+  absl::string_view hlo_string = R"(
+  HloModule module, is_scheduled=true
+
+  ENTRY Entry {
+    param0 = f32[2,2] parameter(0)
+    param1 = f32[2,2] parameter(1)
+    ROOT add = f32[2,2] add(param0, param1)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK(Initialize(module.get()));
+
+  const HloInstruction* add = module->entry_computation()->root_instruction();
+  EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add), 1.0f);
+}
+
 }  // namespace
 }  // namespace xla