From 3662bc34decbd525f1d3405be5e0ad5dcd910c68 Mon Sep 17 00:00:00 2001 From: Henning Becker Date: Mon, 16 Sep 2024 01:06:19 -0700 Subject: [PATCH] Remove if_cuda_is_configured and if_rocm_is_configured from command_buffer_cmd and custom_call_thunk PiperOrigin-RevId: 675043057 --- xla/service/gpu/BUILD | 212 +++++++++--------- xla/service/gpu/fusions/BUILD | 17 ++ xla/service/gpu/fusions/ir/tests/BUILD | 1 + xla/service/gpu/fusions/legacy/BUILD | 14 ++ xla/service/gpu/fusions/mlir/BUILD | 6 + xla/service/gpu/fusions/tests/BUILD | 1 + xla/service/gpu/fusions/tools/BUILD | 4 + xla/service/gpu/fusions/transforms/BUILD | 1 + .../gpu/fusions/transforms/tests/BUILD | 1 + xla/service/gpu/fusions/triton/BUILD | 3 + xla/service/gpu/gpu_executable.cc | 4 - xla/service/gpu/model/BUILD | 13 ++ xla/service/gpu/runtime/BUILD | 110 ++++----- xla/service/gpu/runtime/command_buffer_cmd.cc | 83 +++---- .../gpu/runtime/command_buffer_thunk_test.cc | 62 ++--- xla/service/gpu/runtime/convolution_thunk.cc | 13 +- xla/service/gpu/runtime/custom_call_thunk.cc | 11 +- xla/service/gpu/runtime/custom_call_thunk.h | 10 +- .../gpu/runtime/dynamic_slice_thunk_test.cc | 33 ++- .../gpu/runtime/nccl_collective_thunk.cc | 4 +- xla/service/gpu/tests/BUILD | 1 + xla/service/gpu/transforms/BUILD | 16 +- 22 files changed, 329 insertions(+), 291 deletions(-) diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD index ad89a46c852e0..2a21b9702a003 100644 --- a/xla/service/gpu/BUILD +++ b/xla/service/gpu/BUILD @@ -278,6 +278,7 @@ cc_library( name = "ir_emitter_context", srcs = ["ir_emitter_context.cc"], hdrs = ["ir_emitter_context.h"], + tags = ["gpu"], deps = [ ":execution_stream_assignment", ":gpu_constants", @@ -308,6 +309,7 @@ cc_library( ]) + if_rocm_hipblaslt([ "TF_HIPBLASLT=1", ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":cublas_cudnn", @@ -350,12 +352,14 @@ cc_library( "//xla/service/gpu/kernels:custom_kernel", "//xla/service/gpu/kernels:topk_custom_kernel", "//xla/service/gpu/model:tiled_hlo_instruction_or_computation", + "//xla/service/gpu/runtime:cholesky_thunk", "//xla/service/gpu/runtime:command_buffer_cmd", "//xla/service/gpu/runtime:command_buffer_cmd_emitter", "//xla/service/gpu/runtime:command_buffer_thunk", "//xla/service/gpu/runtime:conditional_thunk", "//xla/service/gpu/runtime:convolution_thunk", "//xla/service/gpu/runtime:copy_thunk", + "//xla/service/gpu/runtime:cub_sort_thunk", "//xla/service/gpu/runtime:cudnn_thunk", "//xla/service/gpu/runtime:custom_call_thunk", "//xla/service/gpu/runtime:fft_thunk", @@ -379,6 +383,7 @@ cc_library( "//xla/service/gpu/runtime:send_recv_thunk", "//xla/service/gpu/runtime:sequential_thunk", "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/runtime:triangular_solve_thunk", "//xla/service/gpu/runtime:wait_for_streams_thunk", "//xla/service/gpu/runtime:while_thunk", "//xla/service/llvm_ir:buffer_assignment_util", @@ -421,11 +426,7 @@ cc_library( "@tsl//tsl/platform:human_readable_json", "@tsl//tsl/platform:statusor", "@tsl//tsl/protobuf:dnn_proto_cc", - ] + if_gpu_is_configured([ - "//xla/service/gpu/runtime:cholesky_thunk", - "//xla/service/gpu/runtime:cub_sort_thunk", - "//xla/service/gpu/runtime:triangular_solve_thunk", - ]) + if_rocm_is_configured([ + ] + if_rocm_is_configured([ "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -442,7 +443,7 @@ cc_library( "ir_emitter.h", "ir_emitter_nested.h", ], - copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":hlo_to_ir_bindings", @@ -486,11 +487,9 @@ cc_library( cc_library( name = "triton_call", - srcs = if_gpu_is_configured(["triton_call.cc"]), + srcs = ["triton_call.cc"], hdrs = ["triton_call.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], deps = [ "@llvm-project//mlir:AsmParser", "@llvm-project//mlir:IR", @@ -556,6 +555,7 @@ cc_library( local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":buffer_allocations", @@ -1252,6 +1252,7 @@ cc_library( "compile_module_to_llvm_ir.h", ], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":executable_proto_cc", ":execution_stream_assignment", @@ -1303,6 +1304,7 @@ cc_library( name = "fusion_pipeline", srcs = ["fusion_pipeline.cc"], hdrs = ["fusion_pipeline.h"], + tags = ["gpu"], deps = [ "//xla:xla_proto_cc", "//xla/service:cpu_gpu_shape_verifier", @@ -1350,14 +1352,14 @@ cc_library( cc_library( name = "gpu_compiler", - srcs = if_gpu_is_configured([ + srcs = [ "gpu_compiler.cc", - ]), - hdrs = if_gpu_is_configured([ + ], + hdrs = [ "gpu_compiler.h", - ]), - deps = if_gpu_is_configured([ - # go/keep-sorted start prefix_order=":,, + ], + tags = ["gpu"], + deps = [ ":buffer_sharing", ":compile_module_to_llvm_ir", ":conv_layout_normalization", @@ -1384,83 +1386,17 @@ cc_library( ":reduction_utils", ":runtime_intrinsics", ":stream_executor_util", - "@com_google_absl//absl/base", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", - "@llvm-project//llvm:AsmParser", - "@llvm-project//llvm:BitReader", - "@llvm-project//llvm:BitWriter", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TransformUtils", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", + "//xla:autotune_results_proto_cc", + "//xla:debug_options_flags", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", "//xla/hlo/ir:hlo", "//xla/hlo/ir:hlo_module_group", "//xla/pjrt/distributed:key_value_store_interface", - "//xla/service/gpu/autotuning:autotuner_util", - "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", - "//xla/service/gpu/fusions/triton:triton_support", - "//xla/service/gpu/model:gpu_cost_model_stats_collection", - "//xla/service/gpu/model:gpu_hlo_cost_analysis", - "//xla/service/gpu/runtime:thunk", - "//xla/service/gpu/transforms:algebraic_simplifier", - "//xla/service/gpu/transforms:algorithm_checker", - "//xla/service/gpu/transforms:all_gather_dynamic_slice_simplifier", - "//xla/service/gpu/transforms:all_gather_optimizer", - "//xla/service/gpu/transforms:all_reduce_blueconnect", - "//xla/service/gpu/transforms:all_reduce_splitter", - "//xla/service/gpu/transforms:async_collective_annotator", - "//xla/service/gpu/transforms:async_wrapper", - "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", - "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", - "//xla/service/gpu/transforms:command_buffer_scheduling", - "//xla/service/gpu/transforms:conv_rewriter", - "//xla/service/gpu/transforms:convert_async_collectives_to_sync", - "//xla/service/gpu/transforms:cudnn_custom_call_converter", - "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", - "//xla/service/gpu/transforms:dot_dimension_sorter", - "//xla/service/gpu/transforms:dot_operand_converter", - "//xla/service/gpu/transforms:double_buffer_loop_unrolling", - "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", - "//xla/service/gpu/transforms:fusion_wrapper", - "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", - "//xla/service/gpu/transforms:gemm_fusion", - "//xla/service/gpu/transforms:gemm_rewriter", - "//xla/service/gpu/transforms:gemv_rewriter", - "//xla/service/gpu/transforms:layout_assignment", - "//xla/service/gpu/transforms:move_copy_to_users", - "//xla/service/gpu/transforms:pipelined_p2p_rewriter", - "//xla/service/gpu/transforms:reduce_scatter_creator", - "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", - "//xla/service/gpu/transforms:reduction_dimension_grouper", - "//xla/service/gpu/transforms:reduction_layout_normalizer", - "//xla/service/gpu/transforms:reduction_splitter", - "//xla/service/gpu/transforms:rename_fusions", - "//xla/service/gpu/transforms:sanitize_constant_names", - "//xla/service/gpu/transforms:scatter_expander", - "//xla/service/gpu/transforms:scatter_slice_simplifier", - "//xla/service/gpu/transforms:softmax_rewriter_triton", - "//xla/service/gpu/transforms:stream_attribute_annotator", - "//xla/service/gpu/transforms:stream_attribute_async_wrapper", - "//xla/service/gpu/transforms:topk_specializer", - "//xla/service/gpu/transforms:topk_splitter", - "//xla/service/gpu/transforms:transpose_dimension_grouper", - "//xla/service/gpu/transforms:tree_reduction_rewriter", - "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", - "//xla/service/gpu/transforms:windowed_einsum_handler", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/spmd:collective_permute_motion", "//xla/service:algebraic_simplifier", "//xla/service:all_gather_broadcast_reorder", "//xla/service:all_gather_combiner", @@ -1561,24 +1497,90 @@ cc_library( "//xla/service:while_loop_simplifier", "//xla/service:while_loop_trip_count_annotator", "//xla/service:zero_sized_hlo_elimination", + "//xla/service/gpu/autotuning:autotuner_util", + "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", + "//xla/service/gpu/fusions/triton:triton_support", + "//xla/service/gpu/model:gpu_cost_model_stats_collection", + "//xla/service/gpu/model:gpu_hlo_cost_analysis", + "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/transforms:algebraic_simplifier", + "//xla/service/gpu/transforms:algorithm_checker", + "//xla/service/gpu/transforms:all_gather_dynamic_slice_simplifier", + "//xla/service/gpu/transforms:all_gather_optimizer", + "//xla/service/gpu/transforms:all_reduce_blueconnect", + "//xla/service/gpu/transforms:all_reduce_splitter", + "//xla/service/gpu/transforms:async_collective_annotator", + "//xla/service/gpu/transforms:async_wrapper", + "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", + "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", + "//xla/service/gpu/transforms:command_buffer_scheduling", + "//xla/service/gpu/transforms:conv_rewriter", + "//xla/service/gpu/transforms:convert_async_collectives_to_sync", + "//xla/service/gpu/transforms:cudnn_custom_call_converter", + "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", + "//xla/service/gpu/transforms:dot_dimension_sorter", + "//xla/service/gpu/transforms:dot_operand_converter", + "//xla/service/gpu/transforms:double_buffer_loop_unrolling", + "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", + "//xla/service/gpu/transforms:fusion_wrapper", + "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", + "//xla/service/gpu/transforms:gemm_fusion", + "//xla/service/gpu/transforms:gemm_rewriter", + "//xla/service/gpu/transforms:gemv_rewriter", + "//xla/service/gpu/transforms:layout_assignment", + "//xla/service/gpu/transforms:move_copy_to_users", + "//xla/service/gpu/transforms:pipelined_p2p_rewriter", + "//xla/service/gpu/transforms:reduce_scatter_creator", + "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", + "//xla/service/gpu/transforms:reduction_dimension_grouper", + "//xla/service/gpu/transforms:reduction_layout_normalizer", + "//xla/service/gpu/transforms:reduction_splitter", + "//xla/service/gpu/transforms:rename_fusions", + "//xla/service/gpu/transforms:sanitize_constant_names", + "//xla/service/gpu/transforms:scatter_expander", + "//xla/service/gpu/transforms:scatter_slice_simplifier", + "//xla/service/gpu/transforms:softmax_rewriter_triton", + "//xla/service/gpu/transforms:stream_attribute_annotator", + "//xla/service/gpu/transforms:stream_attribute_async_wrapper", + "//xla/service/gpu/transforms:topk_specializer", + "//xla/service/gpu/transforms:topk_splitter", + "//xla/service/gpu/transforms:transpose_dimension_grouper", + "//xla/service/gpu/transforms:tree_reduction_rewriter", + "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", + "//xla/service/gpu/transforms:windowed_einsum_handler", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/spmd:collective_permute_motion", "//xla/stream_executor", - "//xla/stream_executor/gpu:gpu_driver_header", - "//xla/stream_executor/integrations:device_mem_allocator", "//xla/stream_executor:device_description", "//xla/stream_executor:device_description_proto_cc", "//xla/stream_executor:dnn", "//xla/stream_executor:platform_manager", "//xla/stream_executor:semantic_version", + "//xla/stream_executor/gpu:gpu_driver_header", + "//xla/stream_executor/integrations:device_mem_allocator", "//xla/translate/hlo_to_mhlo:hlo_utils", "//xla/translate/mhlo_to_hlo:location_exporter", - "//xla:autotune_results_proto_cc", - "//xla:debug_options_flags", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", + "@llvm-project//llvm:AsmParser", + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@tsl//tsl/lib/monitoring:counter", "@tsl//tsl/platform:blocking_counter", "@tsl//tsl/platform:casts", @@ -1592,8 +1594,7 @@ cc_library( "@tsl//tsl/platform:statusor", "@tsl//tsl/profiler/lib:scoped_annotation", "@tsl//tsl/profiler/lib:traceme", - # go/keep-sorted end - ]) + xla_internal(["service:export_hlo"]) + if_google([ + ] + xla_internal(["service:export_hlo"]) + if_google([ "//xla/hlo/experimental/auto_sharding", ]), ) @@ -2072,6 +2073,7 @@ cc_library( name = "gpu_hlo_schedule", srcs = ["gpu_hlo_schedule.cc"], hdrs = ["gpu_hlo_schedule.h"], + tags = ["gpu"], deps = [ ":backend_configs_cc", ":gpu_latency_hiding_scheduler", @@ -2625,8 +2627,9 @@ cc_library( cc_library( name = "make_batch_pointers", - srcs = if_gpu_is_configured(["make_batch_pointers.cc"]), - hdrs = if_gpu_is_configured(["make_batch_pointers.h"]), + srcs = ["make_batch_pointers.cc"], + hdrs = ["make_batch_pointers.h"], + tags = ["gpu"], deps = [ "//xla:types", "//xla:util", @@ -2943,6 +2946,7 @@ cc_library( xla_cc_test( name = "gpu_latency_hiding_scheduler_test", srcs = ["gpu_latency_hiding_scheduler_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_schedule", ":gpu_latency_hiding_scheduler", diff --git a/xla/service/gpu/fusions/BUILD b/xla/service/gpu/fusions/BUILD index 2a4f04c9f67ee..b77191263eda3 100644 --- a/xla/service/gpu/fusions/BUILD +++ b/xla/service/gpu/fusions/BUILD @@ -12,6 +12,7 @@ cc_library( name = "in_place_dynamic_update_slice_mlir", srcs = ["in_place_dynamic_update_slice_mlir.cc"], hdrs = ["in_place_dynamic_update_slice_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -37,6 +38,7 @@ cc_library( name = "copy", srcs = ["copy.cc"], hdrs = ["copy.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:shape_util", @@ -59,6 +61,7 @@ cc_library( srcs = ["custom.cc"], hdrs = ["custom.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:shape_util", @@ -165,6 +168,7 @@ cc_library( name = "fusion_emitter", srcs = ["fusion_emitter.cc"], hdrs = ["fusion_emitter.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ "//xla:shape_util", @@ -201,6 +205,7 @@ cc_library( name = "fusions", srcs = ["fusions.cc"], hdrs = ["fusions.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":concatenate_mlir", @@ -242,6 +247,7 @@ cc_library( name = "loop_mlir", srcs = ["loop_mlir.cc"], hdrs = ["loop_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:status_macros", @@ -271,6 +277,7 @@ cc_library( name = "scatter_mlir", srcs = ["scatter_mlir.cc"], hdrs = ["scatter_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -303,6 +310,7 @@ cc_library( name = "transpose_mlir", srcs = ["transpose_mlir.cc"], hdrs = ["transpose_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:permutation_util", @@ -337,6 +345,7 @@ cc_library( name = "triton", srcs = ["triton.cc"], hdrs = ["triton.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":fusion_emitter", @@ -377,6 +386,7 @@ xla_cc_test( name = "triton_test", srcs = ["triton_test.cc"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", ":fusions", @@ -400,6 +410,7 @@ cc_library( srcs = ["cudnn.cc"], hdrs = ["cudnn.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla/hlo/ir:hlo", @@ -459,6 +470,7 @@ cc_library( name = "thunk_util", srcs = ["thunk_util.cc"], hdrs = ["thunk_util.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ "//xla:literal", @@ -478,6 +490,7 @@ cc_library( name = "reduction_base", srcs = ["reduction_base.cc"], hdrs = ["reduction_base.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":fusion_emitter", @@ -513,6 +526,7 @@ cc_library( name = "reduction_mlir", srcs = ["reduction_mlir.cc"], hdrs = ["reduction_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", ":reduction_base", @@ -550,6 +564,7 @@ cc_library( xla_cc_test( name = "reduction_base_test", srcs = ["reduction_base_test.cc"], + tags = ["gpu"], deps = [ ":reduction_base", "//xla/service/gpu:gpu_device_info_for_tests", @@ -564,6 +579,7 @@ cc_library( name = "concatenate_mlir", srcs = ["concatenate_mlir.cc"], hdrs = ["concatenate_mlir.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/service/gpu:gpu_fusible", @@ -588,6 +604,7 @@ cc_library( name = "input_slices_mlir", srcs = ["input_slices_mlir.cc"], hdrs = ["input_slices_mlir.h"], + tags = ["gpu"], deps = [ "//xla:util", "//xla:xla_data_proto_cc", diff --git a/xla/service/gpu/fusions/ir/tests/BUILD b/xla/service/gpu/fusions/ir/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/ir/tests/BUILD +++ b/xla/service/gpu/fusions/ir/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/legacy/BUILD b/xla/service/gpu/fusions/legacy/BUILD index 4ddfb29634e3b..7be07567d8fae 100644 --- a/xla/service/gpu/fusions/legacy/BUILD +++ b/xla/service/gpu/fusions/legacy/BUILD @@ -10,6 +10,7 @@ cc_library( name = "in_place_dynamic_update_slice", srcs = ["in_place_dynamic_update_slice.cc"], hdrs = ["in_place_dynamic_update_slice.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/service/gpu:hlo_fusion_analysis", @@ -33,6 +34,7 @@ cc_library( xla_cc_test( name = "in_place_dynamic_update_slice_test", srcs = ["in_place_dynamic_update_slice_test.cc"], + tags = ["gpu"], deps = [ ":in_place_dynamic_update_slice", "//xla/service/gpu:gpu_device_info_for_tests", @@ -53,6 +55,7 @@ cc_library( name = "loop", srcs = ["loop.cc"], hdrs = ["loop.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -83,6 +86,7 @@ cc_library( xla_cc_test( name = "loop_test", srcs = ["loop_test.cc"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/service/gpu:gpu_device_info_for_tests", @@ -105,6 +109,7 @@ cc_library( name = "scatter", srcs = ["scatter.cc"], hdrs = ["scatter.h"], + tags = ["gpu"], deps = [ ":loop", "//xla:shape_util", @@ -135,6 +140,7 @@ cc_library( xla_cc_test( name = "scatter_test", srcs = ["scatter_test.cc"], + tags = ["gpu"], deps = [ ":scatter", "//xla/service/gpu:gpu_device_info_for_tests", @@ -183,6 +189,7 @@ cc_library( name = "reduction", srcs = ["reduction.cc"], hdrs = ["reduction.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:shape_util", @@ -235,6 +242,7 @@ cc_library( xla_cc_test( name = "reduction_test", srcs = ["reduction_test.cc"], + tags = ["gpu"], deps = [ ":reduction", "//xla/hlo/ir:hlo", @@ -258,6 +266,7 @@ cc_library( name = "concatenate", srcs = ["concatenate.cc"], hdrs = ["concatenate.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla/hlo/ir:hlo", @@ -285,6 +294,7 @@ cc_library( xla_cc_test( name = "concatenate_test", srcs = ["concatenate_test.cc"], + tags = ["gpu"], deps = [ ":concatenate", "//xla/service/gpu:gpu_device_info_for_tests", @@ -304,6 +314,7 @@ cc_library( name = "transpose", srcs = ["transpose.cc"], hdrs = ["transpose.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:permutation_util", @@ -338,6 +349,7 @@ cc_library( xla_cc_test( name = "transpose_test", srcs = ["transpose_test.cc"], + tags = ["gpu"], deps = [ ":transpose", "//xla:status_macros", @@ -359,6 +371,7 @@ cc_library( name = "input_slices", srcs = ["input_slices.cc"], hdrs = ["input_slices.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -390,6 +403,7 @@ cc_library( xla_cc_test( name = "input_slices_test", srcs = ["input_slices_test.cc"], + tags = ["gpu"], deps = [ ":input_slices", "//xla/service/gpu:gpu_device_info_for_tests", diff --git a/xla/service/gpu/fusions/mlir/BUILD b/xla/service/gpu/fusions/mlir/BUILD index 21b1787438c4b..23ae695600e66 100644 --- a/xla/service/gpu/fusions/mlir/BUILD +++ b/xla/service/gpu/fusions/mlir/BUILD @@ -17,6 +17,7 @@ cc_library( name = "computation_partitioner", srcs = ["computation_partitioner.cc"], hdrs = ["computation_partitioner.h"], + tags = ["gpu"], deps = [ ":type_util", "//xla:shape_util", @@ -45,6 +46,7 @@ cc_library( xla_cc_test( name = "computation_partitioner_test", srcs = ["computation_partitioner_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", "//xla/hlo/ir:hlo", @@ -62,6 +64,7 @@ cc_library( name = "elemental_hlo_to_mlir", srcs = ["elemental_hlo_to_mlir.cc"], hdrs = ["elemental_hlo_to_mlir.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":type_util", @@ -110,6 +113,7 @@ cc_library( xla_cc_test( name = "elemental_hlo_to_mlir_test", srcs = ["elemental_hlo_to_mlir_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -149,6 +153,7 @@ cc_library( name = "mlir_fusion_emitter", srcs = ["mlir_fusion_emitter.cc"], hdrs = ["mlir_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -227,6 +232,7 @@ cc_library( xla_cc_test( name = "mlir_fusion_emitter_test", srcs = ["mlir_fusion_emitter_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":mlir_fusion_emitter", diff --git a/xla/service/gpu/fusions/tests/BUILD b/xla/service/gpu/fusions/tests/BUILD index d3e3b665e75d3..71bec2f5d4472 100644 --- a/xla/service/gpu/fusions/tests/BUILD +++ b/xla/service/gpu/fusions/tests/BUILD @@ -10,6 +10,7 @@ lit_test_suite( srcs = glob(["**/*.hlo"]), cfg = "//xla:lit.cfg.py", default_tags = ["requires-gpu-sm80-only"], + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:fusion_to_mlir", "//xla/service/gpu/fusions/tools:mlir_fusions_opt", diff --git a/xla/service/gpu/fusions/tools/BUILD b/xla/service/gpu/fusions/tools/BUILD index 5b6c18f409287..7d1abdcb0456b 100644 --- a/xla/service/gpu/fusions/tools/BUILD +++ b/xla/service/gpu/fusions/tools/BUILD @@ -8,6 +8,7 @@ package( xla_cc_binary( name = "mlir_fusions_opt", srcs = ["mlir_fusions_opt.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ "//xla/mlir_hlo", @@ -41,6 +42,7 @@ cc_library( testonly = 1, srcs = ["test_lib.cc"], hdrs = ["test_lib.h"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/hlo/ir:hlo", @@ -75,6 +77,7 @@ xla_cc_binary( name = "fusion_to_mlir", testonly = 1, srcs = ["fusion_to_mlir.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", @@ -90,6 +93,7 @@ xla_cc_binary( name = "test_correctness", testonly = 1, srcs = ["test_correctness.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", diff --git a/xla/service/gpu/fusions/transforms/BUILD b/xla/service/gpu/fusions/transforms/BUILD index ee8a424c5e93e..1abb55e541195 100644 --- a/xla/service/gpu/fusions/transforms/BUILD +++ b/xla/service/gpu/fusions/transforms/BUILD @@ -51,6 +51,7 @@ cc_library( "vectorize_loads_stores.cc", ], hdrs = ["passes.h"], + tags = ["gpu"], deps = [ ":passes_inc_gen", "//xla:shape_util", diff --git a/xla/service/gpu/fusions/transforms/tests/BUILD b/xla/service/gpu/fusions/transforms/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/transforms/tests/BUILD +++ b/xla/service/gpu/fusions/transforms/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/triton/BUILD b/xla/service/gpu/fusions/triton/BUILD index f2eebf5c7dcf6..fed1df75bfd40 100644 --- a/xla/service/gpu/fusions/triton/BUILD +++ b/xla/service/gpu/fusions/triton/BUILD @@ -33,6 +33,7 @@ cc_library( "compilation_pipeline_rocm.cc", ]), hdrs = ["triton_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":passes", "//xla:autotuning_proto_cc", @@ -289,6 +290,7 @@ cc_library( testonly = True, srcs = ["triton_test_utils.cc"], hdrs = ["triton_test_utils.h"], + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla:shape_util", @@ -325,6 +327,7 @@ cc_library( xla_cc_test( name = "triton_fusion_emitter_mem_utils_test", srcs = if_cuda_is_configured(["triton_fusion_emitter_mem_utils_test.cc"]), + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla/hlo/ir:hlo", diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc index e3d939e873a22..59a20bd4c3d1e 100644 --- a/xla/service/gpu/gpu_executable.cc +++ b/xla/service/gpu/gpu_executable.cc @@ -72,9 +72,7 @@ limitations under the License. #include "xla/stream_executor/device_memory_allocator.h" #include "xla/stream_executor/event_based_timer.h" #include "xla/stream_executor/gpu/scoped_activate_context.h" -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/gpu_executor.h" -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/module_spec.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/rocm/rocm_platform_id.h" @@ -805,12 +803,10 @@ absl::StatusOr GpuExecutable::ExecuteAsyncOnStreamImpl( se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator(); se::StreamExecutor* executor = run_options->stream()->parent(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // GpuExecutable always bound to a single GpuContext during its execution, so // we activate it once to skip expensive context activations later. se::gpu::GpuExecutor* gpu_executor = se::gpu::ExtractGpuExecutor(executor); se::gpu::ScopedActivateContext activation(gpu_executor); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Force synchronous execution if the allocator requires it. const bool block_host_until_done = diff --git a/xla/service/gpu/model/BUILD b/xla/service/gpu/model/BUILD index 5825878005729..e988e3607558f 100644 --- a/xla/service/gpu/model/BUILD +++ b/xla/service/gpu/model/BUILD @@ -25,6 +25,7 @@ cc_library( name = "analytical_latency_estimator", srcs = ["analytical_latency_estimator.cc"], hdrs = ["analytical_latency_estimator.h"], + tags = ["gpu"], deps = [ ":gpu_collective_performance_model", ":gpu_hlo_cost_analysis", @@ -102,6 +103,7 @@ cc_library( name = "gpu_cost_model_stats_collection", srcs = ["gpu_cost_model_stats_collection.cc"], hdrs = ["gpu_cost_model_stats_collection.h"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model", @@ -121,6 +123,7 @@ cc_library( xla_cc_test( name = "gpu_cost_model_stats_collection_test", srcs = ["gpu_cost_model_stats_collection_test.cc"], + tags = ["gpu"], deps = [ ":gpu_cost_model_stats_collection", ":gpu_hlo_cost_analysis", @@ -191,6 +194,7 @@ cc_library( name = "gpu_performance_model_base", srcs = ["gpu_performance_model_base.cc"], hdrs = ["gpu_performance_model_base.h"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -219,6 +223,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_base_test", srcs = ["gpu_performance_model_base_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model_base", @@ -241,6 +246,7 @@ cc_library( name = "gpu_performance_model", srcs = ["gpu_performance_model.cc"], hdrs = ["gpu_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":gpu_hlo_cost_analysis", @@ -263,6 +269,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_test", srcs = ["gpu_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -294,6 +301,7 @@ cc_library( srcs = ["gpu_collective_performance_model.cc"], hdrs = ["gpu_collective_performance_model.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -332,6 +340,7 @@ cc_library( xla_cc_test( name = "gpu_collective_performance_model_test", srcs = ["gpu_collective_performance_model_test.cc"], + tags = ["gpu"], deps = [ "//xla/service/gpu:backend_configs_cc", "//xla/tests:hlo_test_base", @@ -344,6 +353,7 @@ cc_library( name = "gpu_indexing_performance_model", srcs = ["gpu_indexing_performance_model.cc"], hdrs = ["gpu_indexing_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -384,6 +394,7 @@ cc_library( xla_cc_test( name = "gpu_indexing_performance_model_test", srcs = ["gpu_indexing_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -802,6 +813,7 @@ cc_library( name = "coalescing_analysis", srcs = ["coalescing_analysis.cc"], hdrs = ["coalescing_analysis.h"], + tags = ["gpu"], deps = [ ":affine_map_evaluator", ":indexing_analysis", @@ -825,6 +837,7 @@ cc_library( xla_cc_test( name = "coalescing_analysis_test", srcs = ["coalescing_analysis_test.cc"], + tags = ["gpu"], deps = [ ":coalescing_analysis", "//xla:shape_util", diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD index ba0ab788880b5..b8261fab685f1 100644 --- a/xla/service/gpu/runtime/BUILD +++ b/xla/service/gpu/runtime/BUILD @@ -50,9 +50,7 @@ cc_library( name = "command_buffer_cmd", srcs = ["command_buffer_cmd.cc"], hdrs = ["command_buffer_cmd.h"], - local_defines = if_cuda_is_configured([ - "GOOGLE_CUDA=1", - ]), + tags = ["gpu"], deps = [ ":annotation", ":custom_call_thunk", @@ -76,7 +74,6 @@ cc_library( "//xla/service:computation_placer", "//xla/service:custom_call_status_internal", "//xla/service:custom_call_status_public_headers", - "//xla/service:executable", "//xla/service:global_device_id", "//xla/service/gpu:buffer_allocations", "//xla/service/gpu:launch_dimensions", @@ -86,8 +83,8 @@ cc_library( "//xla/stream_executor", "//xla/stream_executor:command_buffer", "//xla/stream_executor:dnn", - "//xla/stream_executor:lazy_op_runner", "//xla/stream_executor:trace_command_buffer_factory", + "//xla/stream_executor/cuda:cuda_platform_id", "//xla/stream_executor/gpu:gpu_stream_header", "//xla/stream_executor/gpu:gpu_types_header", "//xla/tsl/concurrency:ref_count", @@ -114,6 +111,7 @@ cc_library( name = "command_buffer_cmd_emitter", srcs = ["command_buffer_cmd_emitter.cc"], hdrs = ["command_buffer_cmd_emitter.h"], + tags = ["gpu"], deps = [ ":command_buffer_cmd", ":conditional_thunk", @@ -144,9 +142,8 @@ cc_library( xla_test( name = "command_buffer_cmd_test", - srcs = if_gpu_is_configured(["command_buffer_cmd_test.cc"]), + srcs = ["command_buffer_cmd_test.cc"], backends = ["gpu"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]), deps = [ ":command_buffer_cmd", ":thunk", @@ -351,7 +348,7 @@ cc_library( xla_test( name = "dynamic_slice_thunk_test", - srcs = if_gpu_is_configured(["dynamic_slice_thunk_test.cc"]), + srcs = ["dynamic_slice_thunk_test.cc"], backend_tags = { "gpu_a100": if_google(["config-cuda-only"]), "gpu_v100": if_google(["config-cuda-only"]), @@ -361,7 +358,6 @@ xla_test( "gpu_v100", "gpu_amd_any", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]), deps = [ ":custom_call_thunk", ":dynamic_slice_thunk", @@ -375,7 +371,6 @@ xla_test( "//xla/service:executable", "//xla/service:platform_util", "//xla/service/gpu:buffer_allocations", - "//xla/service/gpu:launch_dimensions", "//xla/service/gpu:matmul_utils", "//xla/stream_executor", "//xla/stream_executor:blas", @@ -383,35 +378,33 @@ xla_test( "//xla/stream_executor:platform", "//xla/stream_executor:platform_manager", "//xla/stream_executor:stream_executor_memory_allocator", - "//xla/stream_executor/gpu:gpu_test_kernels", + "//xla/stream_executor/cuda:cuda_platform_id", "//xla/stream_executor/gpu:gpu_types_header", "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@tsl//tsl/platform:statusor", "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - ]), + ], ) cc_library( name = "cholesky_thunk", - srcs = if_gpu_is_configured(["cholesky_thunk.cc"]), - hdrs = if_gpu_is_configured(["cholesky_thunk.h"]), - deps = if_gpu_is_configured([ - "//xla/service/gpu:buffer_allocations", - "//xla/service/gpu:cusolver_context", - "//xla/service/gpu:make_batch_pointers", - "//xla/service/gpu/runtime:thunk", + srcs = ["cholesky_thunk.cc"], + hdrs = ["cholesky_thunk.h"], + tags = ["gpu"], + deps = [ "//xla:types", "//xla:util", "//xla:xla_data_proto_cc", - "//xla/service:buffer_assignment", "//xla/hlo/ir:hlo", - "@tsl//tsl/platform:logging", + "//xla/service:buffer_assignment", + "//xla/service/gpu:buffer_allocations", + "//xla/service/gpu:cusolver_context", + "//xla/service/gpu:make_batch_pointers", + "//xla/service/gpu/runtime:thunk", "//xla/stream_executor", "//xla/stream_executor:blas", "//xla/stream_executor:device_memory", @@ -419,14 +412,16 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/strings:str_format", "@tsl//tsl/platform:errors", + "@tsl//tsl/platform:logging", "@tsl//tsl/platform:status", - ]), + ], ) cc_library( name = "command_buffer_thunk", srcs = ["command_buffer_thunk.cc"], hdrs = ["command_buffer_thunk.h"], + tags = ["gpu"], deps = [ ":annotation", ":command_buffer_cmd", @@ -463,7 +458,6 @@ xla_test( "gpu_v100", "gpu_amd_any", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]), deps = [ ":command_buffer_cmd", ":command_buffer_thunk", @@ -479,13 +473,16 @@ xla_test( "//xla/service/gpu:buffer_allocations", "//xla/service/gpu:launch_dimensions", "//xla/service/gpu:matmul_utils", + "//xla/service/gpu/kernels:custom_kernel", "//xla/stream_executor", "//xla/stream_executor:blas", "//xla/stream_executor:command_buffer", "//xla/stream_executor:kernel_spec", "//xla/stream_executor:platform", "//xla/stream_executor:platform_manager", + "//xla/stream_executor:semantic_version", "//xla/stream_executor:stream_executor_memory_allocator", + "//xla/stream_executor/cuda:cuda_platform_id", "//xla/stream_executor/gpu:gpu_test_kernels", "//xla/stream_executor/gpu:gpu_test_kernels_fatbin", "//xla/stream_executor/gpu:gpu_types_header", @@ -497,9 +494,7 @@ xla_test( "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", "@tsl//tsl/profiler/lib:profiler_lock", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - ]), + ], ) cc_library( @@ -530,17 +525,17 @@ cc_library( name = "convolution_thunk", srcs = ["convolution_thunk.cc"], hdrs = ["convolution_thunk.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), deps = [ ":thunk", "//xla:util", "//xla/service:buffer_assignment", "//xla/service/gpu:gpu_conv_runner", + "//xla/service/gpu:stream_executor_util", "//xla/stream_executor", "//xla/stream_executor:dnn", + "//xla/stream_executor:lazy_op_runner", "//xla/stream_executor:scratch_allocator", + "//xla/stream_executor/rocm:rocm_platform_id", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:inlined_vector", @@ -549,10 +544,8 @@ cc_library( "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:span", "@tsl//tsl/platform:errors", - ] + if_rocm_is_configured([ - # keep sorted - "//xla/service/gpu:stream_executor_util", - ]), + "@tsl//tsl/platform:statusor", + ], ) cc_library( @@ -578,12 +571,10 @@ cc_library( cc_library( name = "cub_sort_thunk", - srcs = if_gpu_is_configured(["cub_sort_thunk.cc"]), - hdrs = if_gpu_is_configured(["cub_sort_thunk.h"]), - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), - deps = if_gpu_is_configured([ + srcs = ["cub_sort_thunk.cc"], + hdrs = ["cub_sort_thunk.h"], + tags = ["gpu"], + deps = ([ "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", @@ -606,9 +597,7 @@ cc_library( name = "custom_call_thunk", srcs = ["custom_call_thunk.cc"], hdrs = ["custom_call_thunk.h"], - local_defines = if_cuda_is_configured([ - "GOOGLE_CUDA=1", - ]), + tags = ["gpu"], deps = [ ":thunk", "//xla:executable_run_options", @@ -880,9 +869,7 @@ cc_library( name = "nccl_collective_thunk", srcs = ["nccl_collective_thunk.cc"], hdrs = ["nccl_collective_thunk.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW=1", - ]), + local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), deps = [ ":nccl_api", ":nccl_clique", @@ -904,7 +891,6 @@ cc_library( "//xla/stream_executor", "//xla/stream_executor:event", "//xla/stream_executor/gpu:gpu_driver_header", - "//xla/stream_executor/gpu:gpu_stream", "//xla/stream_executor/gpu:gpu_types_header", "//xla/translate/mhlo_to_hlo:attribute_exporter", "@com_google_absl//absl/algorithm:container", @@ -912,6 +898,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", @@ -920,11 +907,7 @@ cc_library( "@llvm-project//mlir:IR", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ] + if_cuda_is_configured([ - "@local_config_nccl//:nccl", - ]) + if_rocm_is_configured([ - "@local_config_rocm//rocm:rccl", - ]), + ], ) cc_library( @@ -1059,9 +1042,6 @@ cc_library( name = "sequential_thunk", srcs = ["sequential_thunk.cc"], hdrs = ["sequential_thunk.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), deps = [ ":annotation", "//xla/service/gpu/runtime:thunk", @@ -1136,6 +1116,7 @@ cc_library( name = "for_all_thunks", srcs = ["for_all_thunks.cc"], hdrs = ["for_all_thunks.h"], + tags = ["gpu"], deps = [ ":command_buffer_thunk", ":conditional_thunk", @@ -1151,6 +1132,7 @@ cc_library( xla_cc_test( name = "for_all_thunks_test", srcs = ["for_all_thunks_test.cc"], + tags = ["gpu"], deps = [ ":command_buffer_cmd", ":command_buffer_thunk", @@ -1168,11 +1150,11 @@ xla_cc_test( cc_library( name = "triangular_solve_thunk", - srcs = if_gpu_is_configured(["triangular_solve_thunk.cc"]), - hdrs = if_gpu_is_configured(["triangular_solve_thunk.h"]), - deps = if_gpu_is_configured([ - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings:str_format", + srcs = ["triangular_solve_thunk.cc"], + hdrs = ["triangular_solve_thunk.h"], + tags = ["gpu"], + deps = [ + "//xla:status_macros", "//xla:types", "//xla:util", "//xla:xla_data_proto_cc", @@ -1185,10 +1167,12 @@ cc_library( "//xla/stream_executor:blas", "//xla/stream_executor:device_memory", "//xla/stream_executor/gpu:gpu_asm_opts", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:str_format", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:logging", "@tsl//tsl/platform:status", - ]) + ["//xla:status_macros"], + ], ) cc_library( diff --git a/xla/service/gpu/runtime/command_buffer_cmd.cc b/xla/service/gpu/runtime/command_buffer_cmd.cc index 0a1a46980ed72..fa41af403d48b 100644 --- a/xla/service/gpu/runtime/command_buffer_cmd.cc +++ b/xla/service/gpu/runtime/command_buffer_cmd.cc @@ -45,6 +45,8 @@ limitations under the License. #include "xla/service/buffer_assignment.h" #include "xla/service/collective_ops_utils.h" #include "xla/service/computation_placer.h" +#include "xla/service/custom_call_status.h" +#include "xla/service/custom_call_status_internal.h" #include "xla/service/global_device_id.h" #include "xla/service/gpu/buffer_allocations.h" #include "xla/service/gpu/kernels/custom_kernel.h" @@ -59,13 +61,14 @@ limitations under the License. #include "xla/service/gpu/runtime/nccl_collective_thunk.h" #include "xla/service/gpu/runtime/thunk.h" #include "xla/service/gpu/stream_executor_util.h" -#include "xla/service/service_executable_run_options.h" #include "xla/stream_executor/command_buffer.h" +#include "xla/stream_executor/cuda/cuda_platform_id.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/dnn.h" +#include "xla/stream_executor/gpu/gpu_stream.h" +#include "xla/stream_executor/gpu/gpu_types.h" #include "xla/stream_executor/kernel.h" #include "xla/stream_executor/launch_dim.h" -#include "xla/stream_executor/lazy_op_runner.h" #include "xla/stream_executor/stream.h" #include "xla/stream_executor/stream_executor.h" #include "xla/stream_executor/trace_command_buffer_factory.h" @@ -78,13 +81,6 @@ limitations under the License. #include "tsl/platform/statusor.h" #include "tsl/profiler/lib/scoped_annotation.h" -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#include "xla/service/custom_call_status.h" -#include "xla/service/custom_call_status_internal.h" -#include "xla/stream_executor/gpu/gpu_stream.h" -#include "xla/stream_executor/gpu/gpu_types.h" -#endif - namespace xla::gpu { namespace { @@ -555,20 +551,20 @@ CommandBufferCmd::BufferUsageVector ComputationIdCmd::buffers() { absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params, StateManager& state) { -#if defined(GOOGLE_CUDA) - { - absl::MutexLock lock(&mutex_); - if (memset_kernels_.contains(params.executor)) return absl::OkStatus(); - } + if (params.executor->GetPlatform()->id() == se::cuda::kCudaPlatformId) { + { + absl::MutexLock lock(&mutex_); + if (memset_kernels_.contains(params.executor)) return absl::OkStatus(); + } - TF_ASSIGN_OR_RETURN(std::unique_ptr kernel, - CreateKernel("memset32", 3, kMemset32Kernel, - /*cubin_data=*/{}, params.executor, - /*shared_mem_bytes=*/0)); + TF_ASSIGN_OR_RETURN(std::unique_ptr kernel, + CreateKernel("memset32", 3, kMemset32Kernel, + /*cubin_data=*/{}, params.executor, + /*shared_mem_bytes=*/0)); - absl::MutexLock lock(&mutex_); - memset_kernels_.emplace(params.executor, std::move(kernel)); -#endif // GOOGLE_CUDA + absl::MutexLock lock(&mutex_); + memset_kernels_.emplace(params.executor, std::move(kernel)); + } return absl::OkStatus(); } @@ -595,24 +591,29 @@ absl::Status ComputationIdCmd::Record( << "; execution_scope_id=" << execution_scope_id.value(); VLOG(5) << " Id: " << dest_ << " (" << dst.opaque() << ")"; -#if defined(GOOGLE_CUDA) - se::Kernel* memset_kernel = [&] { + const bool has_memset_kernel = [&] { absl::MutexLock lock(&mutex_); - return memset_kernels_[execute_params.stream->parent()].get(); + return memset_kernels_.contains(execute_params.stream->parent()); }(); - if (memset_kernel == nullptr) { - return absl::InternalError( - "Memset kernel not loaded on a command buffer executor"); - } + if (has_memset_kernel) { + se::Kernel* memset_kernel = [&] { + absl::MutexLock lock(&mutex_); + return memset_kernels_[execute_params.stream->parent()].get(); + }(); - auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst); - return command_buffer->Launch(execution_scope_id, se::ThreadDim(1), - se::BlockDim(1), *memset_kernel, *args); -#else - return command_buffer->Memset(execution_scope_id, &dst, value, - /*num_elements=*/1); -#endif // GOOGLE_CUDA + if (memset_kernel == nullptr) { + return absl::InternalError( + "Memset kernel not loaded on a command buffer executor"); + } + + auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst); + return command_buffer->Launch(execution_scope_id, se::ThreadDim(1), + se::BlockDim(1), *memset_kernel, *args); + } else { + return command_buffer->Memset(execution_scope_id, &dst, value, + /*num_elements=*/1); + } } //===----------------------------------------------------------------------===// @@ -1439,7 +1440,6 @@ absl::Status CustomCallCmd::RecordLegacyCustomCall( } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_ASSIGN_OR_RETURN( auto nested_cmd, se::TraceCommandBufferFactory::Create( @@ -1460,11 +1460,6 @@ absl::Status CustomCallCmd::RecordLegacyCustomCall( return command_buffer->AddNestedCommandBuffer(execution_scope_id, *nested_cmd); -#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - return Unavailable( - "Custom calls on GPU are not supported in this configuration. Please " - "build with --config=cuda or --config=rocm"); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } absl::Status CustomCallCmd::RecordXlaFfiCall( @@ -1519,7 +1514,6 @@ absl::Status CustomCallCmd::RecordXlaFfiCall( builder.AddAttributes(attrs.Build()); ffi::CallFrame call_frame = builder.Build(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_ASSIGN_OR_RETURN( auto nested_cmd, se::TraceCommandBufferFactory::Create( @@ -1537,11 +1531,6 @@ absl::Status CustomCallCmd::RecordXlaFfiCall( return command_buffer->AddNestedCommandBuffer(execution_scope_id, *nested_cmd); -#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - return Unavailable( - "Custom calls on GPU are not supported in this configuration. Please " - "build with --config=cuda or --config=rocm"); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } CommandBufferCmd::BufferUsageVector CustomCallCmd::buffers() { diff --git a/xla/service/gpu/runtime/command_buffer_thunk_test.cc b/xla/service/gpu/runtime/command_buffer_thunk_test.cc index bce9d1927d05e..6d9b4514fb174 100644 --- a/xla/service/gpu/runtime/command_buffer_thunk_test.cc +++ b/xla/service/gpu/runtime/command_buffer_thunk_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -29,6 +30,7 @@ limitations under the License. #include "absl/types/span.h" #include "xla/service/buffer_assignment.h" #include "xla/service/gpu/buffer_allocations.h" +#include "xla/service/gpu/kernels/custom_kernel.h" #include "xla/service/gpu/launch_dimensions.h" #include "xla/service/gpu/matmul_utils.h" #include "xla/service/gpu/runtime/command_buffer_cmd.h" @@ -40,6 +42,7 @@ limitations under the License. #include "xla/shape_util.h" #include "xla/stream_executor/blas.h" #include "xla/stream_executor/command_buffer.h" +#include "xla/stream_executor/cuda/cuda_platform_id.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/device_memory_allocator.h" #include "xla/stream_executor/gpu/gpu_test_kernels.h" @@ -47,8 +50,10 @@ limitations under the License. #include "xla/stream_executor/gpu/gpu_types.h" // IWYU pragma: keep #include "xla/stream_executor/kernel.h" #include "xla/stream_executor/kernel_spec.h" +#include "xla/stream_executor/launch_dim.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/platform_manager.h" +#include "xla/stream_executor/semantic_version.h" #include "xla/stream_executor/stream_executor.h" #include "xla/stream_executor/stream_executor_memory_allocator.h" #include "xla/tsl/lib/core/status_test_util.h" @@ -58,10 +63,6 @@ limitations under the License. #include "tsl/platform/test.h" #include "tsl/profiler/lib/profiler_lock.h" -#ifdef GOOGLE_CUDA -#include "third_party/gpus/cuda/include/cuda.h" -#endif - namespace xla::gpu { using MemoryAccess = CommandBufferCmd::MemoryAccess; @@ -101,14 +102,17 @@ KernelArgsPacking CreateDefaultArgsPacking() { } // Some of the tests rely on CUDA 12.3+ features. -bool IsAtLeastCuda12300() { -#if defined(TENSORFLOW_USE_ROCM) - return false; -#endif -#if CUDA_VERSION >= 12030 +static bool IsAtLeastCuda12300( + const stream_executor::StreamExecutor* executor) { + if (executor->GetPlatform()->id() != stream_executor::cuda::kCudaPlatformId) { + return false; + } + if (std::min({executor->GetDeviceDescription().runtime_version(), + executor->GetDeviceDescription().driver_version()}) < + stream_executor::SemanticVersion{12, 3, 0}) { + return false; + } return true; -#endif - return false; } // Give a short aliases to execution threads. @@ -592,12 +596,12 @@ TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) { } TEST(CommandBufferThunkTest, GemmCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph tracing is not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); int64_t lhs_length = sizeof(float) * 2 * 4; @@ -708,12 +712,12 @@ TEST(CommandBufferThunkTest, GemmCmd) { } TEST(CommandBufferThunkTest, CublasLtCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph tracing is not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream1, executor->CreateStream()); TF_ASSERT_OK_AND_ASSIGN(auto stream2, executor->CreateStream()); @@ -967,12 +971,12 @@ TEST(CommandBufferThunkTest, MultipleLaunchCmd) { } TEST(CommandBufferThunkTest, IfCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph conditionals are not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); int64_t length = 4; @@ -1055,12 +1059,12 @@ TEST(CommandBufferThunkTest, IfCmd) { } TEST(CommandBufferThunkTest, IfElseCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph conditionals are not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); int64_t length = 4; @@ -1148,12 +1152,12 @@ TEST(CommandBufferThunkTest, IfElseCmd) { } TEST(CommandBufferThunkTest, CaseCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph conditionals are not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); int64_t length = 4; @@ -1237,12 +1241,12 @@ TEST(CommandBufferThunkTest, CaseCmd) { } TEST(CommandBufferThunkTest, ForCmd) { - if (!IsAtLeastCuda12300()) { + se::StreamExecutor* executor = GpuExecutor(); + + if (!IsAtLeastCuda12300(executor)) { GTEST_SKIP() << "CUDA graph conditionals are not supported"; } - se::StreamExecutor* executor = GpuExecutor(); - TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); int64_t length = 4; diff --git a/xla/service/gpu/runtime/convolution_thunk.cc b/xla/service/gpu/runtime/convolution_thunk.cc index 8577fe9f55106..35b0e34e404d3 100644 --- a/xla/service/gpu/runtime/convolution_thunk.cc +++ b/xla/service/gpu/runtime/convolution_thunk.cc @@ -27,17 +27,18 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/types/span.h" #include "xla/service/buffer_assignment.h" -#if TENSORFLOW_USE_ROCM -#include "xla/service/gpu/stream_executor_util.h" -#endif // TENSORFLOW_USE_ROCM #include "xla/service/gpu/gpu_conv_runner.h" #include "xla/service/gpu/runtime/thunk.h" +#include "xla/service/gpu/stream_executor_util.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/dnn.h" +#include "xla/stream_executor/lazy_op_runner.h" +#include "xla/stream_executor/rocm/rocm_platform_id.h" #include "xla/stream_executor/scratch_allocator.h" #include "xla/stream_executor/stream_executor.h" #include "xla/util.h" #include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" namespace xla { namespace gpu { @@ -87,8 +88,9 @@ absl::Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { RunConvOptions opts; opts.runner_cache = &GetOrCreateRunner(params.stream, &runner_created); -#if TENSORFLOW_USE_ROCM - if (runner_created) { + if (params.stream->parent()->GetPlatform()->id() == + se::rocm::kROCmPlatformId && + runner_created) { TF_ASSIGN_OR_RETURN( GpuConvParams conv_params, GetGpuConvParams(config_, operand_se_buffers, result_se_buffers)); @@ -116,7 +118,6 @@ absl::Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { conv_params.output_buf, config_.conv_desc, &scratch_allocator, &profile_results); } -#endif // TENSORFLOW_USE_ROCM TF_RETURN_IF_ERROR(RunGpuConv(config_, absl::MakeSpan(operand_se_buffers), absl::MakeSpan(result_se_buffers), scratch, diff --git a/xla/service/gpu/runtime/custom_call_thunk.cc b/xla/service/gpu/runtime/custom_call_thunk.cc index 7cf44c109cb26..f34f202a613d4 100644 --- a/xla/service/gpu/runtime/custom_call_thunk.cc +++ b/xla/service/gpu/runtime/custom_call_thunk.cc @@ -39,14 +39,11 @@ limitations under the License. #include "xla/service/gpu/runtime/thunk.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/device_memory_allocator.h" +#include "xla/stream_executor/gpu/gpu_stream.h" #include "xla/stream_executor/stream.h" #include "xla/util.h" #include "tsl/platform/errors.h" -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#include "xla/stream_executor/gpu/gpu_stream.h" -#endif - namespace xla { namespace gpu { @@ -137,7 +134,6 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) { } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM auto gpu_stream = se::gpu::AsGpuStreamValue(params.stream); XlaCustomCallStatus custom_call_status; call_target_(gpu_stream, buffers.data(), opaque_.data(), opaque_.size(), @@ -148,11 +144,6 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) { } else { return absl::OkStatus(); } -#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - return Unavailable( - "Custom calls on GPU are not supported in this configuration. Please " - "build with --config=cuda or --config=rocm"); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } absl::Status CustomCallThunk::ExecuteFfiHandler( diff --git a/xla/service/gpu/runtime/custom_call_thunk.h b/xla/service/gpu/runtime/custom_call_thunk.h index c65676381f9c8..41a8048570344 100644 --- a/xla/service/gpu/runtime/custom_call_thunk.h +++ b/xla/service/gpu/runtime/custom_call_thunk.h @@ -39,11 +39,8 @@ limitations under the License. #include "xla/service/gpu/runtime/thunk.h" #include "xla/shape.h" #include "xla/stream_executor/device_memory_allocator.h" -#include "xla/stream_executor/stream.h" - -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/gpu_types.h" -#endif +#include "xla/stream_executor/stream.h" namespace xla { namespace gpu { @@ -61,12 +58,7 @@ namespace gpu { // compiler is allowed to create. class CustomCallThunk : public Thunk { public: -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM using Stream = stream_executor::gpu::GpuStreamHandle; -#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - using Stream = void*; -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - using CustomCallTarget = std::function; diff --git a/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc b/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc index 75c700b75b4e3..e072d9badb27e 100644 --- a/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc +++ b/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc @@ -17,15 +17,15 @@ limitations under the License. #include #include -#include #include #include #include #include -#include "absl/algorithm/container.h" +#include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/ascii.h" +#include "absl/strings/string_view.h" #include "xla/ffi/ffi.h" #include "xla/ffi/ffi_api.h" #include "xla/service/buffer_assignment.h" @@ -39,6 +39,7 @@ limitations under the License. #include "xla/shape_util.h" #include "xla/stream_executor/blas.h" #include "xla/stream_executor/command_buffer.h" +#include "xla/stream_executor/cuda/cuda_platform_id.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/device_memory_allocator.h" #include "xla/stream_executor/gpu/gpu_types.h" // IWYU pragma: keep @@ -52,12 +53,6 @@ limitations under the License. #include "tsl/platform/statusor.h" #include "tsl/platform/test.h" -#if GOOGLE_CUDA -#define PLATFORM "CUDA" -#elif TENSORFLOW_USE_ROCM -#define PLATFORM "ROCM" -#endif - namespace xla::gpu { namespace { @@ -398,7 +393,9 @@ XLA_FFI_DEFINE_HANDLER(kMemcpy, Memcpy, .Arg() // src .Ret() // dst ); -XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", PLATFORM, +XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", "CUDA", + kMemcpy); +XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", "ROCM", kMemcpy); TEST(DynamicSliceThunkTest, SlicedMemcpy) { @@ -447,7 +444,11 @@ TEST(DynamicSliceThunkTest, SlicedMemcpy) { // Preparing custom call thunk: setting up call target and operands + results // buffers. - auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + absl::string_view platform_name = + executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA" + : "ROCM"; + auto registration = + xla::ffi::FindHandler("__xla_test$$memcpy", platform_name); ASSERT_TRUE(registration.ok()); std::vector> operands{ @@ -607,7 +608,11 @@ TEST(DynamicSliceThunkTest, SlicedOutputMemcpy) { // Preparing custom call thunk: setting up call target and operands + results // buffers. - auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + const absl::string_view platform_name = + executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA" + : "ROCM"; + auto registration = + xla::ffi::FindHandler("__xla_test$$memcpy", platform_name); ASSERT_TRUE(registration.ok()); std::vector> operands{ @@ -1248,7 +1253,11 @@ TEST(DynamicSliceThunkTest, SlicedMemcpyOOB) { // Preparing custom call thunk: setting up call target and operands + results // buffers. - auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + absl::string_view platform_name = + executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA" + : "ROCM"; + auto registration = + xla::ffi::FindHandler("__xla_test$$memcpy", platform_name); ASSERT_TRUE(registration.ok()); std::vector> operands{ diff --git a/xla/service/gpu/runtime/nccl_collective_thunk.cc b/xla/service/gpu/runtime/nccl_collective_thunk.cc index 8e075c8d01c73..db772eccb2d47 100644 --- a/xla/service/gpu/runtime/nccl_collective_thunk.cc +++ b/xla/service/gpu/runtime/nccl_collective_thunk.cc @@ -28,13 +28,13 @@ limitations under the License. #include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/str_format.h" #include "absl/synchronization/mutex.h" #include "absl/time/time.h" -#include "mlir/IR/Value.h" #include "xla/debug_options_flags.h" #include "xla/hlo/ir/hlo_instructions.h" #include "xla/layout_util.h" @@ -43,13 +43,13 @@ limitations under the License. #include "xla/service/computation_placer.h" #include "xla/service/global_device_id.h" #include "xla/service/gpu/buffer_allocations.h" -#include "xla/service/gpu/ir_emission_utils.h" #include "xla/service/gpu/runtime/nccl_api.h" #include "xla/service/gpu/runtime/nccl_clique.h" #include "xla/service/gpu/runtime/nccl_clique_key.h" #include "xla/service/gpu/runtime/thunk.h" #include "xla/service/rendezvous.h" #include "xla/shape.h" +#include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/event.h" #include "xla/stream_executor/stream.h" #include "xla/util.h" diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD index 490f5f4fce44f..a2d5f8857b070 100644 --- a/xla/service/gpu/tests/BUILD +++ b/xla/service/gpu/tests/BUILD @@ -701,6 +701,7 @@ lit_test_suite( # cc_binary( # name = "xla-opt", # srcs = ["xla-opt.cc"], +# tags = ["gpu"], # deps = [ # "//xla/service/gpu/fusions/transforms:passes", # "//xla/service/gpu/fusions/triton:passes", diff --git a/xla/service/gpu/transforms/BUILD b/xla/service/gpu/transforms/BUILD index 842dffa0028a6..12718715497de 100644 --- a/xla/service/gpu/transforms/BUILD +++ b/xla/service/gpu/transforms/BUILD @@ -1443,6 +1443,7 @@ cc_library( name = "fusion_merger", srcs = ["fusion_merger.cc"], hdrs = ["fusion_merger.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -1471,6 +1472,7 @@ xla_cc_test( name = "fusion_merger_test", srcs = ["fusion_merger_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -1994,6 +1996,7 @@ cc_library( name = "multi_output_fusion", srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2025,6 +2028,7 @@ xla_cc_test( name = "multi_output_fusion_test", srcs = ["multi_output_fusion_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -2086,6 +2090,7 @@ cc_library( name = "priority_fusion", srcs = ["priority_fusion.cc"], hdrs = ["priority_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2137,6 +2142,7 @@ xla_cc_test( name = "priority_fusion_test", srcs = ["priority_fusion_test.cc"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":priority_fusion", "//xla:shape_util", @@ -2522,6 +2528,7 @@ cc_library( name = "softmax_rewriter_triton", srcs = ["softmax_rewriter_triton.cc"], hdrs = ["softmax_rewriter_triton.h"], + tags = ["gpu"], deps = [ ":reduction_dimension_grouper", ":reduction_splitter", @@ -2568,6 +2575,7 @@ cc_library( xla_cc_test( name = "softmax_rewriter_triton_test", srcs = ["softmax_rewriter_triton_test.cc"], + tags = ["gpu"], deps = [ ":softmax_rewriter_triton", "//xla:shape_util", @@ -2593,11 +2601,9 @@ xla_cc_test( cc_library( name = "sort_rewriter", - srcs = if_gpu_is_configured( - ["sort_rewriter.cc"], - ["sort_rewriter_stub.cc"], - ), + srcs = ["sort_rewriter.cc"], hdrs = ["sort_rewriter.h"], + tags = ["gpu"], deps = [ "//xla:comparison_util", "//xla:shape_util", @@ -2620,7 +2626,7 @@ cc_library( xla_test( name = "sort_rewriter_test", - srcs = if_cuda_is_configured(["sort_rewriter_test.cc"]), + srcs = ["sort_rewriter_test.cc"], backends = ["gpu"], tags = ["no_oss"], deps = [