diff --git a/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc index a0d98975bf4405..ade53fb6396d7a 100644 --- a/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc +++ b/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc @@ -267,6 +267,7 @@ int64_t PriorityFusionShapeSize(const Shape& shape) { HloCostAnalysis::Options PriorityFusionOptions() { return {/*shape_size=*/PriorityFusionShapeSize, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; } diff --git a/xla/service/gpu/fusion_pipeline.cc b/xla/service/gpu/fusion_pipeline.cc index 4fc4af0a4cfafa..20e58ad4b96f89 100644 --- a/xla/service/gpu/fusion_pipeline.cc +++ b/xla/service/gpu/fusion_pipeline.cc @@ -62,6 +62,7 @@ HloPassPipeline FusionPipeline( GpuHloCostAnalysis::Options cost_analysis_options{ shape_size_bytes_function, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; fusion.AddPass(thread_pool, gpu_device_info, std::move(cost_analysis_options)); diff --git a/xla/service/gpu/gpu_compiler.cc b/xla/service/gpu/gpu_compiler.cc index 0e2d5e477ba1d7..f848a25042f754 100644 --- a/xla/service/gpu/gpu_compiler.cc +++ b/xla/service/gpu/gpu_compiler.cc @@ -1010,6 +1010,7 @@ absl::Status RunFusionPasses(HloModule* hlo_module, GpuHloCostAnalysis::Options cost_analysis_options{ shape_size_fn, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; HloPassPipeline post_fusion_analysis("post_fusion_analysis"); diff --git a/xla/service/gpu/model/analytical_latency_estimator.cc b/xla/service/gpu/model/analytical_latency_estimator.cc index 48772902066467..5bc9d0d7ea4d33 100644 --- a/xla/service/gpu/model/analytical_latency_estimator.cc +++ b/xla/service/gpu/model/analytical_latency_estimator.cc @@ -86,6 +86,7 @@ AnalyticalLatencyEstimator::AnalyticalLatencyEstimator( cost_analysis_.emplace( 
GpuHloCostAnalysis::Options{shape_size_function_, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}, gpu_info_); TF_CHECK_OK(computation->Accept(&cost_analysis_.value())); diff --git a/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc b/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc index 9e8c78c6aa44fa..2f8fe7b18b3233 100644 --- a/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc +++ b/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc @@ -47,6 +47,7 @@ class GpuCostModelStatsCollectionTest : public HloTestBase { TestGpuDeviceInfo::RTXA6000DeviceInfo(), GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}}; }; diff --git a/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc b/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc index 0571d080aabd1e..6115b812c912fc 100644 --- a/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc +++ b/xla/service/gpu/model/gpu_hlo_cost_analysis_test.cc @@ -45,6 +45,7 @@ class GpuHloCostAnalysisTest : public HloTestBase { public: HloCostAnalysis::Options options_{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; GpuHloCostAnalysis analysis_{options_}; GpuHloCostAnalysisTest() : HloTestBase() {} diff --git a/xla/service/gpu/model/gpu_indexing_performance_model.h b/xla/service/gpu/model/gpu_indexing_performance_model.h index df614e51761737..499b75ea61bffe 100644 --- a/xla/service/gpu/model/gpu_indexing_performance_model.h +++ b/xla/service/gpu/model/gpu_indexing_performance_model.h @@ -66,6 +66,7 @@ class GpuPerformanceModelWithIndexingAnalysis : public GpuPerformanceModelBase { cost_analysis_( GpuHloCostAnalysis::Options{shape_size_, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}, *device_info_), 
mlir_context_(mlir_context) {} diff --git a/xla/service/gpu/model/gpu_indexing_performance_model_test.cc b/xla/service/gpu/model/gpu_indexing_performance_model_test.cc index d9a2ab7b1e2bf4..b6b020f28682e8 100644 --- a/xla/service/gpu/model/gpu_indexing_performance_model_test.cc +++ b/xla/service/gpu/model/gpu_indexing_performance_model_test.cc @@ -553,6 +553,7 @@ class FlopsPerElementTest : public GpuIndexingPerformanceModelTest { GpuHloCostAnalysis cost_analysis( GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}, device_info_); diff --git a/xla/service/gpu/model/gpu_performance_model_base_test.cc b/xla/service/gpu/model/gpu_performance_model_base_test.cc index 0ece419e1b009f..f96e9ee3767744 100644 --- a/xla/service/gpu/model/gpu_performance_model_base_test.cc +++ b/xla/service/gpu/model/gpu_performance_model_base_test.cc @@ -47,6 +47,7 @@ class GpuPerformanceModelBaseTest : public HloTestBase { GpuHloCostAnalysis::Options options_{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; // The reference times in the test cases below are measured // on A6000 by profiling the execution of the HLOs. diff --git a/xla/service/gpu/model/gpu_performance_model_test.cc b/xla/service/gpu/model/gpu_performance_model_test.cc index 66768b0129fd46..dc80d18a4a3a37 100644 --- a/xla/service/gpu/model/gpu_performance_model_test.cc +++ b/xla/service/gpu/model/gpu_performance_model_test.cc @@ -87,6 +87,7 @@ class GpuPerformanceModelTest : public HloTestBase { mlir::MLIRContext mlir_context_; GpuHloCostAnalysis::Options options_{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; // The reference times in the test cases below are measured // on A6000 by profiling the execution of the HLOs. 
diff --git a/xla/service/gpu/transforms/fusion_merger.cc b/xla/service/gpu/transforms/fusion_merger.cc index 5a09bf5359c86e..37986219faae16 100644 --- a/xla/service/gpu/transforms/fusion_merger.cc +++ b/xla/service/gpu/transforms/fusion_merger.cc @@ -276,6 +276,7 @@ FusionDecision FusionInstructionMerger::ShouldFuse(HloInstruction* producer) { cost_analysis_.emplace( GpuHloCostAnalysis::Options{shape_size_function_, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}, gpu_device_info_); TF_CHECK_OK(computation_->Accept(&cost_analysis_.value())); diff --git a/xla/service/gpu/transforms/multi_output_fusion.cc b/xla/service/gpu/transforms/multi_output_fusion.cc index 35bfe8eb092038..04456d8131ac76 100644 --- a/xla/service/gpu/transforms/multi_output_fusion.cc +++ b/xla/service/gpu/transforms/multi_output_fusion.cc @@ -407,6 +407,7 @@ absl::StatusOr MultiOutputFusion::DoMultiOutputFusion() { RecomputeReachability(); GpuHloCostAnalysis cost_analysis({shape_size_function_, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}, device_info_); TF_RETURN_IF_ERROR(computation_->Accept(&cost_analysis)); diff --git a/xla/service/gpu/transforms/priority_fusion_test.cc b/xla/service/gpu/transforms/priority_fusion_test.cc index 58c23381b4d8db..2909b2212876f4 100644 --- a/xla/service/gpu/transforms/priority_fusion_test.cc +++ b/xla/service/gpu/transforms/priority_fusion_test.cc @@ -84,6 +84,7 @@ class PriorityFusionTest : public HloTestBase { /*thread_pool=*/nullptr, TestGpuDeviceInfo::RTXA6000DeviceInfo(), GpuHloCostAnalysis::Options{ShapeSizeBytesFunction(), /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}}; }; diff --git a/xla/service/gpu/transforms/softmax_rewriter_triton.cc b/xla/service/gpu/transforms/softmax_rewriter_triton.cc index fa7dd119145321..af75843c1d6eae 100644 --- a/xla/service/gpu/transforms/softmax_rewriter_triton.cc +++ 
b/xla/service/gpu/transforms/softmax_rewriter_triton.cc @@ -473,6 +473,7 @@ EstimateOptimizedHloRunTimeWithoutSoftMaxRewriterTriton( GpuHloCostAnalysis::Options cost_analysis_options{ shape_size, /*per_second_rates=*/{}, + /*min_latencies_seconds=*/{}, /*count_multiple_input_accesses=*/true}; GpuHloCostAnalysis cost_analysis(cost_analysis_options, device_info); TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis)); diff --git a/xla/service/hlo_cost_analysis.cc b/xla/service/hlo_cost_analysis.cc index 16597b4769d04f..c49476e8f927e3 100644 --- a/xla/service/hlo_cost_analysis.cc +++ b/xla/service/hlo_cost_analysis.cc @@ -43,9 +43,13 @@ limitations under the License. namespace xla { HloCostAnalysis::HloCostAnalysis(const Options& options) : options_(options) {} +// TODO(mehrdadk): merge all constructors into HloCostAnalysis(const Options& +// options) HloCostAnalysis::HloCostAnalysis(ShapeSizeFunction shape_size, - const Properties& per_second_rates) - : HloCostAnalysis(Options{shape_size, per_second_rates}) {} + const Properties& per_second_rates, + const Properties& min_latencies_seconds) + : HloCostAnalysis( + Options{shape_size, per_second_rates, min_latencies_seconds}) {} absl::Status HloCostAnalysis::Preprocess(const HloInstruction* hlo) { // Set current instruction cost values to reasonable default values. 
Each @@ -82,7 +86,9 @@ absl::Status HloCostAnalysis::Postprocess(const HloInstruction* hlo) { } float per_second_rate = options_.per_second_rate(key); if (per_second_rate != 0) { - optimal_seconds = std::max(optimal_seconds, val / per_second_rate); + float time_for_key = + std::max(val / per_second_rate, options_.min_latency_seconds(key)); + optimal_seconds = std::max(optimal_seconds, time_for_key); } }); current_properties_[kOptimalSecondsKey] = optimal_seconds; diff --git a/xla/service/hlo_cost_analysis.h b/xla/service/hlo_cost_analysis.h index ade247ea1fccac..a1e700491ce9e8 100644 --- a/xla/service/hlo_cost_analysis.h +++ b/xla/service/hlo_cost_analysis.h @@ -405,6 +405,11 @@ class HloCostAnalysis : public ConstDfsHloVisitor { // property is bytes accessed, this is the number of bytes that can be // processed per second. Is empty if no rates have been set. Properties per_second_rates = {}; + // The minimum amount of time (in seconds) required to process each + // property. Hardware design choices (e.g., clock speeds, memory access + // latencies) impose a lower bound on the duration of any operation, even + // the simplest ones. + Properties min_latencies_seconds; // Operations like broadcast with reused inputs are not handled // efficiently on some platforms. Depending on the goal of the analysis // we may need to count or ignore them. @@ -414,31 +419,44 @@ class HloCostAnalysis : public ConstDfsHloVisitor { void set_flops_per_second(float value) { per_second_rates[kFlopsKey] = value; } + void set_flops_min_latency_second(float value) { + min_latencies_seconds[kFlopsKey] = value; + } void set_transcendentals_per_second(float value) { per_second_rates[kTranscendentalsKey] = value; } void set_bytes_per_second(float value) { per_second_rates[kBytesAccessedKey] = value; } + void set_bytes_min_latency_second(float value) { + min_latencies_seconds[kBytesAccessedKey] = value; + } // Returns the specified per-second rate used by cost analysis. 
float per_second_rate(absl::string_view key) const { return per_second_rates[key]; } + float min_latency_seconds(absl::string_view key) const { + return min_latencies_seconds[key]; + } + std::string ToString() const { return absl::StrFormat( "HloCostAnalysis::Options{\n" " per_second_rates: %s\n" + " min_latency_seconds: %s\n" " count_multiple_input_accesses: %d\n" "}", - per_second_rates.ToString(), count_multiple_input_accesses); + per_second_rates.ToString(), min_latencies_seconds.ToString(), + count_multiple_input_accesses); } }; explicit HloCostAnalysis(const Options& options); explicit HloCostAnalysis(ShapeSizeFunction shape_size, - const Properties& per_second_rates = {}); + const Properties& per_second_rates = {}, + const Properties& min_latency_seconds = {}); // For all element-wise instruction we call HandleElementwiseOp. If necessary, // override HandleElementwiseOp instead. @@ -594,6 +612,10 @@ class HloCostAnalysis : public ConstDfsHloVisitor { float per_second_rate(absl::string_view key) const { return options_.per_second_rate(key); } + // Returns the specified minimum latency used by cost analysis. + float min_latency_seconds(absl::string_view key) const { + return options_.min_latency_seconds(key); + } // Return the key that is used to index into Properties for the specified // input/output at the shape index. 
diff --git a/xla/service/hlo_cost_analysis_test.cc b/xla/service/hlo_cost_analysis_test.cc index 74c6e158f834de..4bac5768a2d54e 100644 --- a/xla/service/hlo_cost_analysis_test.cc +++ b/xla/service/hlo_cost_analysis_test.cc @@ -701,6 +701,32 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { EXPECT_EQ(conv_analysis.flop_count(), matmul_analysis.flop_count()); } +// No instruction can finish faster than the clock cycle +TEST_F(HloCostAnalysisTest, LatencyBoundedOptimalTime) { + absl::string_view hlo_string = R"( + HloModule module, is_scheduled=true + + ENTRY Entry { + param0 = f32[1,1] parameter(0) + param1 = f32[1,1] parameter(1) + ROOT add = f32[1,1] add(param0, param1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + + const HloInstruction* add = module->entry_computation()->root_instruction(); + HloCostAnalysis::Options options{ShapeSize}; + const float clock_cycle_seconds = 10.0f; + options.set_flops_per_second(1024); + options.set_bytes_per_second(1024); + options.set_transcendentals_per_second(1024); + options.set_flops_min_latency_second(clock_cycle_seconds); + HloCostAnalysis cost_analysis(options); + ASSERT_IS_OK(add->Accept(&cost_analysis)); + EXPECT_EQ(cost_analysis.optimal_seconds(), clock_cycle_seconds); +} + using FusionCostAnalysis = HloTestBase; TEST_F(FusionCostAnalysis, LoopFusionDynUpdateSlice) { diff --git a/xla/service/memory_space_assignment/cost_analysis.cc b/xla/service/memory_space_assignment/cost_analysis.cc index 3fe9a5b4bae473..d82ee863f868aa 100644 --- a/xla/service/memory_space_assignment/cost_analysis.cc +++ b/xla/service/memory_space_assignment/cost_analysis.cc @@ -76,8 +76,10 @@ float HloCostAnalysisCosts::BytesPerSecond() { float HloCostAnalysisCosts::ComputeSeconds(const HloInstruction& instruction) { return std::max( - static_cast<float>(hlo_cost_analysis_.flop_count(instruction)) / - hlo_cost_analysis_.per_second_rate(HloCostAnalysis::kFlopsKey), + std::max( + 
hlo_cost_analysis_.min_latency_seconds(HloCostAnalysis::kFlopsKey), + static_cast<float>(hlo_cost_analysis_.flop_count(instruction)) / + hlo_cost_analysis_.per_second_rate(HloCostAnalysis::kFlopsKey)), static_cast<float>(hlo_cost_analysis_.transcendental_count(instruction)) / hlo_cost_analysis_.per_second_rate( HloCostAnalysis::kTranscendentalsKey)); diff --git a/xla/service/memory_space_assignment/cost_analysis_test.cc b/xla/service/memory_space_assignment/cost_analysis_test.cc index 39d4dbbded7bd2..06001beef0ef65 100644 --- a/xla/service/memory_space_assignment/cost_analysis_test.cc +++ b/xla/service/memory_space_assignment/cost_analysis_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "xla/service/memory_space_assignment/cost_analysis.h" +#include #include #include @@ -56,6 +57,7 @@ class MemorySpaceAssignmentCostAnalysisTest : public HloTestBase { options.set_flops_per_second(8); options.set_bytes_per_second(32); options.set_transcendentals_per_second(16); + options.set_flops_min_latency_second(1); hlo_cost_analysis_ = std::make_unique(options); TF_RETURN_IF_ERROR( module->entry_computation()->Accept(hlo_cost_analysis_.get())); @@ -90,8 +92,9 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, NoPipelineOverhead) { TF_ASSERT_OK(Initialize(module.get())); const HloInstruction* add = module->entry_computation()->root_instruction(); - const float expected_compute_elapsed = - /*num_flops=*/8 / /*flops_per_second=*/8.0; + const float expected_compute_elapsed = std::max( + /*num_flops=*/8.0f / /*flops_per_second=*/8.0f, + hlo_cost_analysis_->min_latency_seconds(HloCostAnalysis::kFlopsKey)); LOG(INFO) << "Expected compute elapsed = " << expected_compute_elapsed; EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add), expected_compute_elapsed); @@ -161,8 +164,9 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, PipelineOverhead) { /*pipeline_overhead_window_size_mib=*/(64.0 / 1024 / 1024))); const HloInstruction* add = 
module->entry_computation()->root_instruction(); - const float expected_compute_elapsed = - /*num_flops=*/8 / /*flops_per_second=*/8.0; + const float expected_compute_elapsed = std::max( + /*num_flops=*/8.0f / /*flops_per_second=*/8.0f, + hlo_cost_analysis_->min_latency_seconds(HloCostAnalysis::kFlopsKey)); LOG(INFO) << "Expected compute elapsed = " << expected_compute_elapsed; EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add), expected_compute_elapsed); @@ -230,5 +234,23 @@ TEST_F(MemorySpaceAssignmentCostAnalysisTest, PipelineOverhead) { expected_compute_elapsed); } +TEST_F(MemorySpaceAssignmentCostAnalysisTest, LatencyBoundCompute) { + absl::string_view hlo_string = R"( + HloModule module, is_scheduled=true + + ENTRY Entry { + param0 = f32[2,2] parameter(0) + param1 = f32[2,2] parameter(1) + ROOT add = f32[2,2] add(param0, param1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK(Initialize(module.get())); + + const HloInstruction* add = module->entry_computation()->root_instruction(); + EXPECT_EQ(cost_analysis_->GetInstructionElapsedDueToCompute(*add), 1.0f); +} + } // namespace } // namespace xla