From 3662bc34decbd525f1d3405be5e0ad5dcd910c68 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Mon, 16 Sep 2024 01:06:19 -0700
Subject: [PATCH] Remove if_cuda_is_configured and if_rocm_is_configured from
 command_buffer_cmd and custom_call_thunk

PiperOrigin-RevId: 675043057
---
 xla/service/gpu/BUILD                         | 212 +++++++++---------
 xla/service/gpu/fusions/BUILD                 |  17 ++
 xla/service/gpu/fusions/ir/tests/BUILD        |   1 +
 xla/service/gpu/fusions/legacy/BUILD          |  14 ++
 xla/service/gpu/fusions/mlir/BUILD            |   6 +
 xla/service/gpu/fusions/tests/BUILD           |   1 +
 xla/service/gpu/fusions/tools/BUILD           |   4 +
 xla/service/gpu/fusions/transforms/BUILD      |   1 +
 .../gpu/fusions/transforms/tests/BUILD        |   1 +
 xla/service/gpu/fusions/triton/BUILD          |   3 +
 xla/service/gpu/gpu_executable.cc             |   4 -
 xla/service/gpu/model/BUILD                   |  13 ++
 xla/service/gpu/runtime/BUILD                 | 110 ++++-----
 xla/service/gpu/runtime/command_buffer_cmd.cc |  83 +++----
 .../gpu/runtime/command_buffer_thunk_test.cc  |  62 ++---
 xla/service/gpu/runtime/convolution_thunk.cc  |  13 +-
 xla/service/gpu/runtime/custom_call_thunk.cc  |  11 +-
 xla/service/gpu/runtime/custom_call_thunk.h   |  10 +-
 .../gpu/runtime/dynamic_slice_thunk_test.cc   |  33 ++-
 .../gpu/runtime/nccl_collective_thunk.cc      |   4 +-
 xla/service/gpu/tests/BUILD                   |   1 +
 xla/service/gpu/transforms/BUILD              |  16 +-
 22 files changed, 329 insertions(+), 291 deletions(-)

diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
index ad89a46c852e0..2a21b9702a003 100644
--- a/xla/service/gpu/BUILD
+++ b/xla/service/gpu/BUILD
@@ -278,6 +278,7 @@ cc_library(
     name = "ir_emitter_context",
     srcs = ["ir_emitter_context.cc"],
     hdrs = ["ir_emitter_context.h"],
+    tags = ["gpu"],
     deps = [
         ":execution_stream_assignment",
         ":gpu_constants",
@@ -308,6 +309,7 @@ cc_library(
     ]) + if_rocm_hipblaslt([
         "TF_HIPBLASLT=1",
     ]),
+    tags = ["gpu"],
     deps = [
         ":backend_configs_cc",
         ":cublas_cudnn",
@@ -350,12 +352,14 @@ cc_library(
         "//xla/service/gpu/kernels:custom_kernel",
         "//xla/service/gpu/kernels:topk_custom_kernel",
         "//xla/service/gpu/model:tiled_hlo_instruction_or_computation",
+        "//xla/service/gpu/runtime:cholesky_thunk",
         "//xla/service/gpu/runtime:command_buffer_cmd",
         "//xla/service/gpu/runtime:command_buffer_cmd_emitter",
         "//xla/service/gpu/runtime:command_buffer_thunk",
         "//xla/service/gpu/runtime:conditional_thunk",
         "//xla/service/gpu/runtime:convolution_thunk",
         "//xla/service/gpu/runtime:copy_thunk",
+        "//xla/service/gpu/runtime:cub_sort_thunk",
         "//xla/service/gpu/runtime:cudnn_thunk",
         "//xla/service/gpu/runtime:custom_call_thunk",
         "//xla/service/gpu/runtime:fft_thunk",
@@ -379,6 +383,7 @@ cc_library(
         "//xla/service/gpu/runtime:send_recv_thunk",
         "//xla/service/gpu/runtime:sequential_thunk",
         "//xla/service/gpu/runtime:thunk",
+        "//xla/service/gpu/runtime:triangular_solve_thunk",
         "//xla/service/gpu/runtime:wait_for_streams_thunk",
         "//xla/service/gpu/runtime:while_thunk",
         "//xla/service/llvm_ir:buffer_assignment_util",
@@ -421,11 +426,7 @@ cc_library(
         "@tsl//tsl/platform:human_readable_json",
         "@tsl//tsl/platform:statusor",
         "@tsl//tsl/protobuf:dnn_proto_cc",
-    ] + if_gpu_is_configured([
-        "//xla/service/gpu/runtime:cholesky_thunk",
-        "//xla/service/gpu/runtime:cub_sort_thunk",
-        "//xla/service/gpu/runtime:triangular_solve_thunk",
-    ]) + if_rocm_is_configured([
+    ] + if_rocm_is_configured([
         "@local_config_rocm//rocm:rocm_headers",
     ]),
 )
@@ -442,7 +443,7 @@ cc_library(
         "ir_emitter.h",
         "ir_emitter_nested.h",
     ],
-    copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":backend_configs_cc",
         ":hlo_to_ir_bindings",
@@ -486,11 +487,9 @@ cc_library(
 
 cc_library(
     name = "triton_call",
-    srcs = if_gpu_is_configured(["triton_call.cc"]),
+    srcs = ["triton_call.cc"],
     hdrs = ["triton_call.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
+    tags = ["gpu"],
     deps = [
         "@llvm-project//mlir:AsmParser",
         "@llvm-project//mlir:IR",
@@ -556,6 +555,7 @@ cc_library(
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
         "TENSORFLOW_USE_ROCM=1",
     ]),
+    tags = ["gpu"],
     deps = [
         ":backend_configs_cc",
         ":buffer_allocations",
@@ -1252,6 +1252,7 @@ cc_library(
         "compile_module_to_llvm_ir.h",
     ],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":executable_proto_cc",
         ":execution_stream_assignment",
@@ -1303,6 +1304,7 @@ cc_library(
     name = "fusion_pipeline",
     srcs = ["fusion_pipeline.cc"],
     hdrs = ["fusion_pipeline.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:xla_proto_cc",
         "//xla/service:cpu_gpu_shape_verifier",
@@ -1350,14 +1352,14 @@ cc_library(
 
 cc_library(
     name = "gpu_compiler",
-    srcs = if_gpu_is_configured([
+    srcs = [
         "gpu_compiler.cc",
-    ]),
-    hdrs = if_gpu_is_configured([
+    ],
+    hdrs = [
         "gpu_compiler.h",
-    ]),
-    deps = if_gpu_is_configured([
-        # go/keep-sorted start prefix_order=":,,
+    ],
+    tags = ["gpu"],
+    deps = [
         ":buffer_sharing",
         ":compile_module_to_llvm_ir",
         ":conv_layout_normalization",
@@ -1384,83 +1386,17 @@ cc_library(
         ":reduction_utils",
         ":runtime_intrinsics",
         ":stream_executor_util",
-        "@com_google_absl//absl/base",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:span",
-        "@com_google_absl//absl/types:variant",
-        "@llvm-project//llvm:AsmParser",
-        "@llvm-project//llvm:BitReader",
-        "@llvm-project//llvm:BitWriter",
-        "@llvm-project//llvm:Core",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//llvm:TransformUtils",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Support",
+        "//xla:autotune_results_proto_cc",
+        "//xla:debug_options_flags",
+        "//xla:shape_util",
+        "//xla:status_macros",
+        "//xla:types",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/ir:hlo_module_group",
         "//xla/pjrt/distributed:key_value_store_interface",
-        "//xla/service/gpu/autotuning:autotuner_util",
-        "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner",
-        "//xla/service/gpu/fusions/triton:triton_support",
-        "//xla/service/gpu/model:gpu_cost_model_stats_collection",
-        "//xla/service/gpu/model:gpu_hlo_cost_analysis",
-        "//xla/service/gpu/runtime:thunk",
-        "//xla/service/gpu/transforms:algebraic_simplifier",
-        "//xla/service/gpu/transforms:algorithm_checker",
-        "//xla/service/gpu/transforms:all_gather_dynamic_slice_simplifier",
-        "//xla/service/gpu/transforms:all_gather_optimizer",
-        "//xla/service/gpu/transforms:all_reduce_blueconnect",
-        "//xla/service/gpu/transforms:all_reduce_splitter",
-        "//xla/service/gpu/transforms:async_collective_annotator",
-        "//xla/service/gpu/transforms:async_wrapper",
-        "//xla/service/gpu/transforms:collective_permute_cycle_decomposer",
-        "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator",
-        "//xla/service/gpu/transforms:command_buffer_scheduling",
-        "//xla/service/gpu/transforms:conv_rewriter",
-        "//xla/service/gpu/transforms:convert_async_collectives_to_sync",
-        "//xla/service/gpu/transforms:cudnn_custom_call_converter",
-        "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter",
-        "//xla/service/gpu/transforms:dot_dimension_sorter",
-        "//xla/service/gpu/transforms:dot_operand_converter",
-        "//xla/service/gpu/transforms:double_buffer_loop_unrolling",
-        "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter",
-        "//xla/service/gpu/transforms:fusion_wrapper",
-        "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter",
-        "//xla/service/gpu/transforms:gemm_fusion",
-        "//xla/service/gpu/transforms:gemm_rewriter",
-        "//xla/service/gpu/transforms:gemv_rewriter",
-        "//xla/service/gpu/transforms:layout_assignment",
-        "//xla/service/gpu/transforms:move_copy_to_users",
-        "//xla/service/gpu/transforms:pipelined_p2p_rewriter",
-        "//xla/service/gpu/transforms:reduce_scatter_creator",
-        "//xla/service/gpu/transforms:reduction_degenerate_dim_remover",
-        "//xla/service/gpu/transforms:reduction_dimension_grouper",
-        "//xla/service/gpu/transforms:reduction_layout_normalizer",
-        "//xla/service/gpu/transforms:reduction_splitter",
-        "//xla/service/gpu/transforms:rename_fusions",
-        "//xla/service/gpu/transforms:sanitize_constant_names",
-        "//xla/service/gpu/transforms:scatter_expander",
-        "//xla/service/gpu/transforms:scatter_slice_simplifier",
-        "//xla/service/gpu/transforms:softmax_rewriter_triton",
-        "//xla/service/gpu/transforms:stream_attribute_annotator",
-        "//xla/service/gpu/transforms:stream_attribute_async_wrapper",
-        "//xla/service/gpu/transforms:topk_specializer",
-        "//xla/service/gpu/transforms:topk_splitter",
-        "//xla/service/gpu/transforms:transpose_dimension_grouper",
-        "//xla/service/gpu/transforms:tree_reduction_rewriter",
-        "//xla/service/gpu/transforms:triton_fusion_numerics_verifier",
-        "//xla/service/gpu/transforms:windowed_einsum_handler",
-        "//xla/service/llvm_ir:llvm_util",
-        "//xla/service/spmd:collective_permute_motion",
         "//xla/service:algebraic_simplifier",
         "//xla/service:all_gather_broadcast_reorder",
         "//xla/service:all_gather_combiner",
@@ -1561,24 +1497,90 @@ cc_library(
         "//xla/service:while_loop_simplifier",
         "//xla/service:while_loop_trip_count_annotator",
         "//xla/service:zero_sized_hlo_elimination",
+        "//xla/service/gpu/autotuning:autotuner_util",
+        "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner",
+        "//xla/service/gpu/fusions/triton:triton_support",
+        "//xla/service/gpu/model:gpu_cost_model_stats_collection",
+        "//xla/service/gpu/model:gpu_hlo_cost_analysis",
+        "//xla/service/gpu/runtime:thunk",
+        "//xla/service/gpu/transforms:algebraic_simplifier",
+        "//xla/service/gpu/transforms:algorithm_checker",
+        "//xla/service/gpu/transforms:all_gather_dynamic_slice_simplifier",
+        "//xla/service/gpu/transforms:all_gather_optimizer",
+        "//xla/service/gpu/transforms:all_reduce_blueconnect",
+        "//xla/service/gpu/transforms:all_reduce_splitter",
+        "//xla/service/gpu/transforms:async_collective_annotator",
+        "//xla/service/gpu/transforms:async_wrapper",
+        "//xla/service/gpu/transforms:collective_permute_cycle_decomposer",
+        "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator",
+        "//xla/service/gpu/transforms:command_buffer_scheduling",
+        "//xla/service/gpu/transforms:conv_rewriter",
+        "//xla/service/gpu/transforms:convert_async_collectives_to_sync",
+        "//xla/service/gpu/transforms:cudnn_custom_call_converter",
+        "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter",
+        "//xla/service/gpu/transforms:dot_dimension_sorter",
+        "//xla/service/gpu/transforms:dot_operand_converter",
+        "//xla/service/gpu/transforms:double_buffer_loop_unrolling",
+        "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter",
+        "//xla/service/gpu/transforms:fusion_wrapper",
+        "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter",
+        "//xla/service/gpu/transforms:gemm_fusion",
+        "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/service/gpu/transforms:gemv_rewriter",
+        "//xla/service/gpu/transforms:layout_assignment",
+        "//xla/service/gpu/transforms:move_copy_to_users",
+        "//xla/service/gpu/transforms:pipelined_p2p_rewriter",
+        "//xla/service/gpu/transforms:reduce_scatter_creator",
+        "//xla/service/gpu/transforms:reduction_degenerate_dim_remover",
+        "//xla/service/gpu/transforms:reduction_dimension_grouper",
+        "//xla/service/gpu/transforms:reduction_layout_normalizer",
+        "//xla/service/gpu/transforms:reduction_splitter",
+        "//xla/service/gpu/transforms:rename_fusions",
+        "//xla/service/gpu/transforms:sanitize_constant_names",
+        "//xla/service/gpu/transforms:scatter_expander",
+        "//xla/service/gpu/transforms:scatter_slice_simplifier",
+        "//xla/service/gpu/transforms:softmax_rewriter_triton",
+        "//xla/service/gpu/transforms:stream_attribute_annotator",
+        "//xla/service/gpu/transforms:stream_attribute_async_wrapper",
+        "//xla/service/gpu/transforms:topk_specializer",
+        "//xla/service/gpu/transforms:topk_splitter",
+        "//xla/service/gpu/transforms:transpose_dimension_grouper",
+        "//xla/service/gpu/transforms:tree_reduction_rewriter",
+        "//xla/service/gpu/transforms:triton_fusion_numerics_verifier",
+        "//xla/service/gpu/transforms:windowed_einsum_handler",
+        "//xla/service/llvm_ir:llvm_util",
+        "//xla/service/spmd:collective_permute_motion",
         "//xla/stream_executor",
-        "//xla/stream_executor/gpu:gpu_driver_header",
-        "//xla/stream_executor/integrations:device_mem_allocator",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:semantic_version",
+        "//xla/stream_executor/gpu:gpu_driver_header",
+        "//xla/stream_executor/integrations:device_mem_allocator",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
         "//xla/translate/mhlo_to_hlo:location_exporter",
-        "//xla:autotune_results_proto_cc",
-        "//xla:debug_options_flags",
-        "//xla:shape_util",
-        "//xla:status_macros",
-        "//xla:types",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+        "@com_google_absl//absl/types:variant",
+        "@llvm-project//llvm:AsmParser",
+        "@llvm-project//llvm:BitReader",
+        "@llvm-project//llvm:BitWriter",
+        "@llvm-project//llvm:Core",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:TransformUtils",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
         "@tsl//tsl/lib/monitoring:counter",
         "@tsl//tsl/platform:blocking_counter",
         "@tsl//tsl/platform:casts",
@@ -1592,8 +1594,7 @@ cc_library(
         "@tsl//tsl/platform:statusor",
         "@tsl//tsl/profiler/lib:scoped_annotation",
         "@tsl//tsl/profiler/lib:traceme",
-        # go/keep-sorted end
-    ]) + xla_internal(["service:export_hlo"]) + if_google([
+    ] + xla_internal(["service:export_hlo"]) + if_google([
         "//xla/hlo/experimental/auto_sharding",
     ]),
 )
@@ -2072,6 +2073,7 @@ cc_library(
     name = "gpu_hlo_schedule",
     srcs = ["gpu_hlo_schedule.cc"],
     hdrs = ["gpu_hlo_schedule.h"],
+    tags = ["gpu"],
     deps = [
         ":backend_configs_cc",
         ":gpu_latency_hiding_scheduler",
@@ -2625,8 +2627,9 @@ cc_library(
 
 cc_library(
     name = "make_batch_pointers",
-    srcs = if_gpu_is_configured(["make_batch_pointers.cc"]),
-    hdrs = if_gpu_is_configured(["make_batch_pointers.h"]),
+    srcs = ["make_batch_pointers.cc"],
+    hdrs = ["make_batch_pointers.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:types",
         "//xla:util",
@@ -2943,6 +2946,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_latency_hiding_scheduler_test",
     srcs = ["gpu_latency_hiding_scheduler_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":gpu_hlo_schedule",
         ":gpu_latency_hiding_scheduler",
diff --git a/xla/service/gpu/fusions/BUILD b/xla/service/gpu/fusions/BUILD
index 2a4f04c9f67ee..b77191263eda3 100644
--- a/xla/service/gpu/fusions/BUILD
+++ b/xla/service/gpu/fusions/BUILD
@@ -12,6 +12,7 @@ cc_library(
     name = "in_place_dynamic_update_slice_mlir",
     srcs = ["in_place_dynamic_update_slice_mlir.cc"],
     hdrs = ["in_place_dynamic_update_slice_mlir.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -37,6 +38,7 @@ cc_library(
     name = "copy",
     srcs = ["copy.cc"],
     hdrs = ["copy.h"],
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         "//xla:shape_util",
@@ -59,6 +61,7 @@ cc_library(
     srcs = ["custom.cc"],
     hdrs = ["custom.h"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         "//xla:shape_util",
@@ -165,6 +168,7 @@ cc_library(
     name = "fusion_emitter",
     srcs = ["fusion_emitter.cc"],
     hdrs = ["fusion_emitter.h"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu:__subpackages__"],
     deps = [
         "//xla:shape_util",
@@ -201,6 +205,7 @@ cc_library(
     name = "fusions",
     srcs = ["fusions.cc"],
     hdrs = ["fusions.h"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu:__subpackages__"],
     deps = [
         ":concatenate_mlir",
@@ -242,6 +247,7 @@ cc_library(
     name = "loop_mlir",
     srcs = ["loop_mlir.cc"],
     hdrs = ["loop_mlir.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:status_macros",
@@ -271,6 +277,7 @@ cc_library(
     name = "scatter_mlir",
     srcs = ["scatter_mlir.cc"],
     hdrs = ["scatter_mlir.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -303,6 +310,7 @@ cc_library(
     name = "transpose_mlir",
     srcs = ["transpose_mlir.cc"],
     hdrs = ["transpose_mlir.h"],
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         "//xla:permutation_util",
@@ -337,6 +345,7 @@ cc_library(
     name = "triton",
     srcs = ["triton.cc"],
     hdrs = ["triton.h"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu:__subpackages__"],
     deps = [
         ":fusion_emitter",
@@ -377,6 +386,7 @@ xla_cc_test(
     name = "triton_test",
     srcs = ["triton_test.cc"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         ":fusions",
@@ -400,6 +410,7 @@ cc_library(
     srcs = ["cudnn.cc"],
     hdrs = ["cudnn.h"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         "//xla/hlo/ir:hlo",
@@ -459,6 +470,7 @@ cc_library(
     name = "thunk_util",
     srcs = ["thunk_util.cc"],
     hdrs = ["thunk_util.h"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu:__subpackages__"],
     deps = [
         "//xla:literal",
@@ -478,6 +490,7 @@ cc_library(
     name = "reduction_base",
     srcs = ["reduction_base.cc"],
     hdrs = ["reduction_base.h"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu/fusions:__subpackages__"],
     deps = [
         ":fusion_emitter",
@@ -513,6 +526,7 @@ cc_library(
     name = "reduction_mlir",
     srcs = ["reduction_mlir.cc"],
     hdrs = ["reduction_mlir.h"],
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         ":reduction_base",
@@ -550,6 +564,7 @@ cc_library(
 xla_cc_test(
     name = "reduction_base_test",
     srcs = ["reduction_base_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":reduction_base",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -564,6 +579,7 @@ cc_library(
     name = "concatenate_mlir",
     srcs = ["concatenate_mlir.cc"],
     hdrs = ["concatenate_mlir.h"],
+    tags = ["gpu"],
     deps = [
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:gpu_fusible",
@@ -588,6 +604,7 @@ cc_library(
     name = "input_slices_mlir",
     srcs = ["input_slices_mlir.cc"],
     hdrs = ["input_slices_mlir.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:util",
         "//xla:xla_data_proto_cc",
diff --git a/xla/service/gpu/fusions/ir/tests/BUILD b/xla/service/gpu/fusions/ir/tests/BUILD
index 381d5a3220b1d..6a1e332a2e9aa 100644
--- a/xla/service/gpu/fusions/ir/tests/BUILD
+++ b/xla/service/gpu/fusions/ir/tests/BUILD
@@ -9,6 +9,7 @@ lit_test_suite(
     name = "tests",
     srcs = glob(["*.mlir"]),
     cfg = "//xla:lit.cfg.py",
+    tags = ["gpu"],
     tools = [
         "//xla/service/gpu/fusions/tools:mlir_fusions_opt",
         "@llvm-project//llvm:FileCheck",
diff --git a/xla/service/gpu/fusions/legacy/BUILD b/xla/service/gpu/fusions/legacy/BUILD
index 4ddfb29634e3b..7be07567d8fae 100644
--- a/xla/service/gpu/fusions/legacy/BUILD
+++ b/xla/service/gpu/fusions/legacy/BUILD
@@ -10,6 +10,7 @@ cc_library(
     name = "in_place_dynamic_update_slice",
     srcs = ["in_place_dynamic_update_slice.cc"],
     hdrs = ["in_place_dynamic_update_slice.h"],
+    tags = ["gpu"],
     deps = [
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:hlo_fusion_analysis",
@@ -33,6 +34,7 @@ cc_library(
 xla_cc_test(
     name = "in_place_dynamic_update_slice_test",
     srcs = ["in_place_dynamic_update_slice_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":in_place_dynamic_update_slice",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -53,6 +55,7 @@ cc_library(
     name = "loop",
     srcs = ["loop.cc"],
     hdrs = ["loop.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:util",
@@ -83,6 +86,7 @@ cc_library(
 xla_cc_test(
     name = "loop_test",
     srcs = ["loop_test.cc"],
+    tags = ["gpu"],
     deps = [
         "//xla:status_macros",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -105,6 +109,7 @@ cc_library(
     name = "scatter",
     srcs = ["scatter.cc"],
     hdrs = ["scatter.h"],
+    tags = ["gpu"],
     deps = [
         ":loop",
         "//xla:shape_util",
@@ -135,6 +140,7 @@ cc_library(
 xla_cc_test(
     name = "scatter_test",
     srcs = ["scatter_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":scatter",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -183,6 +189,7 @@ cc_library(
     name = "reduction",
     srcs = ["reduction.cc"],
     hdrs = ["reduction.h"],
+    tags = ["gpu"],
     deps = [
         ":tiling_util",
         "//xla:shape_util",
@@ -235,6 +242,7 @@ cc_library(
 xla_cc_test(
     name = "reduction_test",
     srcs = ["reduction_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":reduction",
         "//xla/hlo/ir:hlo",
@@ -258,6 +266,7 @@ cc_library(
     name = "concatenate",
     srcs = ["concatenate.cc"],
     hdrs = ["concatenate.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
@@ -285,6 +294,7 @@ cc_library(
 xla_cc_test(
     name = "concatenate_test",
     srcs = ["concatenate_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":concatenate",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -304,6 +314,7 @@ cc_library(
     name = "transpose",
     srcs = ["transpose.cc"],
     hdrs = ["transpose.h"],
+    tags = ["gpu"],
     deps = [
         ":tiling_util",
         "//xla:permutation_util",
@@ -338,6 +349,7 @@ cc_library(
 xla_cc_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":transpose",
         "//xla:status_macros",
@@ -359,6 +371,7 @@ cc_library(
     name = "input_slices",
     srcs = ["input_slices.cc"],
     hdrs = ["input_slices.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:util",
@@ -390,6 +403,7 @@ cc_library(
 xla_cc_test(
     name = "input_slices_test",
     srcs = ["input_slices_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":input_slices",
         "//xla/service/gpu:gpu_device_info_for_tests",
diff --git a/xla/service/gpu/fusions/mlir/BUILD b/xla/service/gpu/fusions/mlir/BUILD
index 21b1787438c4b..23ae695600e66 100644
--- a/xla/service/gpu/fusions/mlir/BUILD
+++ b/xla/service/gpu/fusions/mlir/BUILD
@@ -17,6 +17,7 @@ cc_library(
     name = "computation_partitioner",
     srcs = ["computation_partitioner.cc"],
     hdrs = ["computation_partitioner.h"],
+    tags = ["gpu"],
     deps = [
         ":type_util",
         "//xla:shape_util",
@@ -45,6 +46,7 @@ cc_library(
 xla_cc_test(
     name = "computation_partitioner_test",
     srcs = ["computation_partitioner_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":computation_partitioner",
         "//xla/hlo/ir:hlo",
@@ -62,6 +64,7 @@ cc_library(
     name = "elemental_hlo_to_mlir",
     srcs = ["elemental_hlo_to_mlir.cc"],
     hdrs = ["elemental_hlo_to_mlir.h"],
+    tags = ["gpu"],
     deps = [
         ":computation_partitioner",
         ":type_util",
@@ -110,6 +113,7 @@ cc_library(
 xla_cc_test(
     name = "elemental_hlo_to_mlir_test",
     srcs = ["elemental_hlo_to_mlir_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":computation_partitioner",
         ":elemental_hlo_to_mlir",
@@ -149,6 +153,7 @@ cc_library(
     name = "mlir_fusion_emitter",
     srcs = ["mlir_fusion_emitter.cc"],
     hdrs = ["mlir_fusion_emitter.h"],
+    tags = ["gpu"],
     deps = [
         ":computation_partitioner",
         ":elemental_hlo_to_mlir",
@@ -227,6 +232,7 @@ cc_library(
 xla_cc_test(
     name = "mlir_fusion_emitter_test",
     srcs = ["mlir_fusion_emitter_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":computation_partitioner",
         ":mlir_fusion_emitter",
diff --git a/xla/service/gpu/fusions/tests/BUILD b/xla/service/gpu/fusions/tests/BUILD
index d3e3b665e75d3..71bec2f5d4472 100644
--- a/xla/service/gpu/fusions/tests/BUILD
+++ b/xla/service/gpu/fusions/tests/BUILD
@@ -10,6 +10,7 @@ lit_test_suite(
     srcs = glob(["**/*.hlo"]),
     cfg = "//xla:lit.cfg.py",
     default_tags = ["requires-gpu-sm80-only"],
+    tags = ["gpu"],
     tools = [
         "//xla/service/gpu/fusions/tools:fusion_to_mlir",
         "//xla/service/gpu/fusions/tools:mlir_fusions_opt",
diff --git a/xla/service/gpu/fusions/tools/BUILD b/xla/service/gpu/fusions/tools/BUILD
index 5b6c18f409287..7d1abdcb0456b 100644
--- a/xla/service/gpu/fusions/tools/BUILD
+++ b/xla/service/gpu/fusions/tools/BUILD
@@ -8,6 +8,7 @@ package(
 xla_cc_binary(
     name = "mlir_fusions_opt",
     srcs = ["mlir_fusions_opt.cc"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu/fusions:__subpackages__"],
     deps = [
         "//xla/mlir_hlo",
@@ -41,6 +42,7 @@ cc_library(
     testonly = 1,
     srcs = ["test_lib.cc"],
     hdrs = ["test_lib.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:status_macros",
         "//xla/hlo/ir:hlo",
@@ -75,6 +77,7 @@ xla_cc_binary(
     name = "fusion_to_mlir",
     testonly = 1,
     srcs = ["fusion_to_mlir.cc"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu/fusions:__subpackages__"],
     deps = [
         ":test_lib",
@@ -90,6 +93,7 @@ xla_cc_binary(
     name = "test_correctness",
     testonly = 1,
     srcs = ["test_correctness.cc"],
+    tags = ["gpu"],
     visibility = ["//xla/service/gpu/fusions:__subpackages__"],
     deps = [
         ":test_lib",
diff --git a/xla/service/gpu/fusions/transforms/BUILD b/xla/service/gpu/fusions/transforms/BUILD
index ee8a424c5e93e..1abb55e541195 100644
--- a/xla/service/gpu/fusions/transforms/BUILD
+++ b/xla/service/gpu/fusions/transforms/BUILD
@@ -51,6 +51,7 @@ cc_library(
         "vectorize_loads_stores.cc",
     ],
     hdrs = ["passes.h"],
+    tags = ["gpu"],
     deps = [
         ":passes_inc_gen",
         "//xla:shape_util",
diff --git a/xla/service/gpu/fusions/transforms/tests/BUILD b/xla/service/gpu/fusions/transforms/tests/BUILD
index 381d5a3220b1d..6a1e332a2e9aa 100644
--- a/xla/service/gpu/fusions/transforms/tests/BUILD
+++ b/xla/service/gpu/fusions/transforms/tests/BUILD
@@ -9,6 +9,7 @@ lit_test_suite(
     name = "tests",
     srcs = glob(["*.mlir"]),
     cfg = "//xla:lit.cfg.py",
+    tags = ["gpu"],
     tools = [
         "//xla/service/gpu/fusions/tools:mlir_fusions_opt",
         "@llvm-project//llvm:FileCheck",
diff --git a/xla/service/gpu/fusions/triton/BUILD b/xla/service/gpu/fusions/triton/BUILD
index f2eebf5c7dcf6..fed1df75bfd40 100644
--- a/xla/service/gpu/fusions/triton/BUILD
+++ b/xla/service/gpu/fusions/triton/BUILD
@@ -33,6 +33,7 @@ cc_library(
         "compilation_pipeline_rocm.cc",
     ]),
     hdrs = ["triton_fusion_emitter.h"],
+    tags = ["gpu"],
     deps = [
         ":passes",
         "//xla:autotuning_proto_cc",
@@ -289,6 +290,7 @@ cc_library(
     testonly = True,
     srcs = ["triton_test_utils.cc"],
     hdrs = ["triton_test_utils.h"],
+    tags = ["gpu"],
     deps = [
         ":triton_fusion_emitter",
         "//xla:shape_util",
@@ -325,6 +327,7 @@ cc_library(
 xla_cc_test(
     name = "triton_fusion_emitter_mem_utils_test",
     srcs = if_cuda_is_configured(["triton_fusion_emitter_mem_utils_test.cc"]),
+    tags = ["gpu"],
     deps = [
         ":triton_fusion_emitter",
         "//xla/hlo/ir:hlo",
diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc
index e3d939e873a22..59a20bd4c3d1e 100644
--- a/xla/service/gpu/gpu_executable.cc
+++ b/xla/service/gpu/gpu_executable.cc
@@ -72,9 +72,7 @@ limitations under the License.
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/event_based_timer.h"
 #include "xla/stream_executor/gpu/scoped_activate_context.h"
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/stream_executor/gpu/gpu_executor.h"
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
@@ -805,12 +803,10 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
   se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
   se::StreamExecutor* executor = run_options->stream()->parent();
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   // GpuExecutable always bound to a single GpuContext during its execution, so
   // we activate it once to skip expensive context activations later.
   se::gpu::GpuExecutor* gpu_executor = se::gpu::ExtractGpuExecutor(executor);
   se::gpu::ScopedActivateContext activation(gpu_executor);
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
   // Force synchronous execution if the allocator requires it.
   const bool block_host_until_done =
diff --git a/xla/service/gpu/model/BUILD b/xla/service/gpu/model/BUILD
index 5825878005729..e988e3607558f 100644
--- a/xla/service/gpu/model/BUILD
+++ b/xla/service/gpu/model/BUILD
@@ -25,6 +25,7 @@ cc_library(
     name = "analytical_latency_estimator",
     srcs = ["analytical_latency_estimator.cc"],
     hdrs = ["analytical_latency_estimator.h"],
+    tags = ["gpu"],
     deps = [
         ":gpu_collective_performance_model",
         ":gpu_hlo_cost_analysis",
@@ -102,6 +103,7 @@ cc_library(
     name = "gpu_cost_model_stats_collection",
     srcs = ["gpu_cost_model_stats_collection.cc"],
     hdrs = ["gpu_cost_model_stats_collection.h"],
+    tags = ["gpu"],
     deps = [
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
@@ -121,6 +123,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_cost_model_stats_collection_test",
     srcs = ["gpu_cost_model_stats_collection_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":gpu_cost_model_stats_collection",
         ":gpu_hlo_cost_analysis",
@@ -191,6 +194,7 @@ cc_library(
     name = "gpu_performance_model_base",
     srcs = ["gpu_performance_model_base.cc"],
     hdrs = ["gpu_performance_model_base.h"],
+    tags = ["gpu"],
     deps = [
         ":fusion_analysis_cache",
         ":gpu_hlo_cost_analysis",
@@ -219,6 +223,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_performance_model_base_test",
     srcs = ["gpu_performance_model_base_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model_base",
@@ -241,6 +246,7 @@ cc_library(
     name = "gpu_performance_model",
     srcs = ["gpu_performance_model.cc"],
     hdrs = ["gpu_performance_model.h"],
+    tags = ["gpu"],
     deps = [
         ":coalescing_analysis",
         ":gpu_hlo_cost_analysis",
@@ -263,6 +269,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_performance_model_test",
     srcs = ["gpu_performance_model_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":fusion_analysis_cache",
         ":gpu_hlo_cost_analysis",
@@ -294,6 +301,7 @@ cc_library(
     srcs = ["gpu_collective_performance_model.cc"],
     hdrs = ["gpu_collective_performance_model.h"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":coalescing_analysis",
         ":fusion_analysis_cache",
@@ -332,6 +340,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_collective_performance_model_test",
     srcs = ["gpu_collective_performance_model_test.cc"],
+    tags = ["gpu"],
     deps = [
         "//xla/service/gpu:backend_configs_cc",
         "//xla/tests:hlo_test_base",
@@ -344,6 +353,7 @@ cc_library(
     name = "gpu_indexing_performance_model",
     srcs = ["gpu_indexing_performance_model.cc"],
     hdrs = ["gpu_indexing_performance_model.h"],
+    tags = ["gpu"],
     deps = [
         ":coalescing_analysis",
         ":fusion_analysis_cache",
@@ -384,6 +394,7 @@ cc_library(
 xla_cc_test(
     name = "gpu_indexing_performance_model_test",
     srcs = ["gpu_indexing_performance_model_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":fusion_analysis_cache",
         ":gpu_hlo_cost_analysis",
@@ -802,6 +813,7 @@ cc_library(
     name = "coalescing_analysis",
     srcs = ["coalescing_analysis.cc"],
     hdrs = ["coalescing_analysis.h"],
+    tags = ["gpu"],
     deps = [
         ":affine_map_evaluator",
         ":indexing_analysis",
@@ -825,6 +837,7 @@ cc_library(
 xla_cc_test(
     name = "coalescing_analysis_test",
     srcs = ["coalescing_analysis_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":coalescing_analysis",
         "//xla:shape_util",
diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD
index ba0ab788880b5..b8261fab685f1 100644
--- a/xla/service/gpu/runtime/BUILD
+++ b/xla/service/gpu/runtime/BUILD
@@ -50,9 +50,7 @@ cc_library(
     name = "command_buffer_cmd",
     srcs = ["command_buffer_cmd.cc"],
     hdrs = ["command_buffer_cmd.h"],
-    local_defines = if_cuda_is_configured([
-        "GOOGLE_CUDA=1",
-    ]),
+    tags = ["gpu"],
     deps = [
         ":annotation",
         ":custom_call_thunk",
@@ -76,7 +74,6 @@ cc_library(
         "//xla/service:computation_placer",
         "//xla/service:custom_call_status_internal",
         "//xla/service:custom_call_status_public_headers",
-        "//xla/service:executable",
         "//xla/service:global_device_id",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:launch_dimensions",
@@ -86,8 +83,8 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:command_buffer",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:lazy_op_runner",
         "//xla/stream_executor:trace_command_buffer_factory",
+        "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_types_header",
         "//xla/tsl/concurrency:ref_count",
@@ -114,6 +111,7 @@ cc_library(
     name = "command_buffer_cmd_emitter",
     srcs = ["command_buffer_cmd_emitter.cc"],
     hdrs = ["command_buffer_cmd_emitter.h"],
+    tags = ["gpu"],
     deps = [
         ":command_buffer_cmd",
         ":conditional_thunk",
@@ -144,9 +142,8 @@ cc_library(
 
 xla_test(
     name = "command_buffer_cmd_test",
-    srcs = if_gpu_is_configured(["command_buffer_cmd_test.cc"]),
+    srcs = ["command_buffer_cmd_test.cc"],
     backends = ["gpu"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     deps = [
         ":command_buffer_cmd",
         ":thunk",
@@ -351,7 +348,7 @@ cc_library(
 
 xla_test(
     name = "dynamic_slice_thunk_test",
-    srcs = if_gpu_is_configured(["dynamic_slice_thunk_test.cc"]),
+    srcs = ["dynamic_slice_thunk_test.cc"],
     backend_tags = {
         "gpu_a100": if_google(["config-cuda-only"]),
         "gpu_v100": if_google(["config-cuda-only"]),
@@ -361,7 +358,6 @@ xla_test(
         "gpu_v100",
         "gpu_amd_any",
     ],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     deps = [
         ":custom_call_thunk",
         ":dynamic_slice_thunk",
@@ -375,7 +371,6 @@ xla_test(
         "//xla/service:executable",
         "//xla/service:platform_util",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:matmul_utils",
         "//xla/stream_executor",
         "//xla/stream_executor:blas",
@@ -383,35 +378,33 @@ xla_test(
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_memory_allocator",
-        "//xla/stream_executor/gpu:gpu_test_kernels",
+        "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:gpu_types_header",
         "//xla/tsl/lib/core:status_test_util",
-        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@tsl//tsl/platform:statusor",
         "@tsl//tsl/platform:test",
         "@tsl//tsl/platform:test_main",
-    ] + if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-    ]),
+    ],
 )
 
 cc_library(
     name = "cholesky_thunk",
-    srcs = if_gpu_is_configured(["cholesky_thunk.cc"]),
-    hdrs = if_gpu_is_configured(["cholesky_thunk.h"]),
-    deps = if_gpu_is_configured([
-        "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:cusolver_context",
-        "//xla/service/gpu:make_batch_pointers",
-        "//xla/service/gpu/runtime:thunk",
+    srcs = ["cholesky_thunk.cc"],
+    hdrs = ["cholesky_thunk.h"],
+    tags = ["gpu"],
+    deps = [
         "//xla:types",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/service:buffer_assignment",
         "//xla/hlo/ir:hlo",
-        "@tsl//tsl/platform:logging",
+        "//xla/service:buffer_assignment",
+        "//xla/service/gpu:buffer_allocations",
+        "//xla/service/gpu:cusolver_context",
+        "//xla/service/gpu:make_batch_pointers",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:blas",
         "//xla/stream_executor:device_memory",
@@ -419,14 +412,16 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
         "@tsl//tsl/platform:errors",
+        "@tsl//tsl/platform:logging",
         "@tsl//tsl/platform:status",
-    ]),
+    ],
 )
 
 cc_library(
     name = "command_buffer_thunk",
     srcs = ["command_buffer_thunk.cc"],
     hdrs = ["command_buffer_thunk.h"],
+    tags = ["gpu"],
     deps = [
         ":annotation",
         ":command_buffer_cmd",
@@ -463,7 +458,6 @@ xla_test(
         "gpu_v100",
         "gpu_amd_any",
     ],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     deps = [
         ":command_buffer_cmd",
         ":command_buffer_thunk",
@@ -479,13 +473,16 @@ xla_test(
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu/kernels:custom_kernel",
         "//xla/stream_executor",
         "//xla/stream_executor:blas",
         "//xla/stream_executor:command_buffer",
         "//xla/stream_executor:kernel_spec",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
+        "//xla/stream_executor:semantic_version",
         "//xla/stream_executor:stream_executor_memory_allocator",
+        "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:gpu_test_kernels",
         "//xla/stream_executor/gpu:gpu_test_kernels_fatbin",
         "//xla/stream_executor/gpu:gpu_types_header",
@@ -497,9 +494,7 @@ xla_test(
         "@tsl//tsl/platform:test",
         "@tsl//tsl/platform:test_main",
         "@tsl//tsl/profiler/lib:profiler_lock",
-    ] + if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-    ]),
+    ],
 )
 
 cc_library(
@@ -530,17 +525,17 @@ cc_library(
     name = "convolution_thunk",
     srcs = ["convolution_thunk.cc"],
     hdrs = ["convolution_thunk.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
     deps = [
         ":thunk",
         "//xla:util",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:gpu_conv_runner",
+        "//xla/service/gpu:stream_executor_util",
         "//xla/stream_executor",
         "//xla/stream_executor:dnn",
+        "//xla/stream_executor:lazy_op_runner",
         "//xla/stream_executor:scratch_allocator",
+        "//xla/stream_executor/rocm:rocm_platform_id",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
@@ -549,10 +544,8 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@tsl//tsl/platform:errors",
-    ] + if_rocm_is_configured([
-        # keep sorted
-        "//xla/service/gpu:stream_executor_util",
-    ]),
+        "@tsl//tsl/platform:statusor",
+    ],
 )
 
 cc_library(
@@ -578,12 +571,10 @@ cc_library(
 
 cc_library(
     name = "cub_sort_thunk",
-    srcs = if_gpu_is_configured(["cub_sort_thunk.cc"]),
-    hdrs = if_gpu_is_configured(["cub_sort_thunk.h"]),
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
-    deps = if_gpu_is_configured([
+    srcs = ["cub_sort_thunk.cc"],
+    hdrs = ["cub_sort_thunk.h"],
+    tags = ["gpu"],
+    deps = ([
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
@@ -606,9 +597,7 @@ cc_library(
     name = "custom_call_thunk",
     srcs = ["custom_call_thunk.cc"],
     hdrs = ["custom_call_thunk.h"],
-    local_defines = if_cuda_is_configured([
-        "GOOGLE_CUDA=1",
-    ]),
+    tags = ["gpu"],
     deps = [
         ":thunk",
         "//xla:executable_run_options",
@@ -880,9 +869,7 @@ cc_library(
     name = "nccl_collective_thunk",
     srcs = ["nccl_collective_thunk.cc"],
     hdrs = ["nccl_collective_thunk.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW=1",
-    ]),
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     deps = [
         ":nccl_api",
         ":nccl_clique",
@@ -904,7 +891,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:event",
         "//xla/stream_executor/gpu:gpu_driver_header",
-        "//xla/stream_executor/gpu:gpu_stream",
         "//xla/stream_executor/gpu:gpu_types_header",
         "//xla/translate/mhlo_to_hlo:attribute_exporter",
         "@com_google_absl//absl/algorithm:container",
@@ -912,6 +898,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
@@ -920,11 +907,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@tsl//tsl/platform:errors",
         "@tsl//tsl/platform:statusor",
-    ] + if_cuda_is_configured([
-        "@local_config_nccl//:nccl",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rccl",
-    ]),
+    ],
 )
 
 cc_library(
@@ -1059,9 +1042,6 @@ cc_library(
     name = "sequential_thunk",
     srcs = ["sequential_thunk.cc"],
     hdrs = ["sequential_thunk.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
     deps = [
         ":annotation",
         "//xla/service/gpu/runtime:thunk",
@@ -1136,6 +1116,7 @@ cc_library(
     name = "for_all_thunks",
     srcs = ["for_all_thunks.cc"],
     hdrs = ["for_all_thunks.h"],
+    tags = ["gpu"],
     deps = [
         ":command_buffer_thunk",
         ":conditional_thunk",
@@ -1151,6 +1132,7 @@ cc_library(
 xla_cc_test(
     name = "for_all_thunks_test",
     srcs = ["for_all_thunks_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":command_buffer_cmd",
         ":command_buffer_thunk",
@@ -1168,11 +1150,11 @@ xla_cc_test(
 
 cc_library(
     name = "triangular_solve_thunk",
-    srcs = if_gpu_is_configured(["triangular_solve_thunk.cc"]),
-    hdrs = if_gpu_is_configured(["triangular_solve_thunk.h"]),
-    deps = if_gpu_is_configured([
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings:str_format",
+    srcs = ["triangular_solve_thunk.cc"],
+    hdrs = ["triangular_solve_thunk.h"],
+    tags = ["gpu"],
+    deps = [
+        "//xla:status_macros",
         "//xla:types",
         "//xla:util",
         "//xla:xla_data_proto_cc",
@@ -1185,10 +1167,12 @@ cc_library(
         "//xla/stream_executor:blas",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor/gpu:gpu_asm_opts",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
         "@tsl//tsl/platform:errors",
         "@tsl//tsl/platform:logging",
         "@tsl//tsl/platform:status",
-    ]) + ["//xla:status_macros"],
+    ],
 )
 
 cc_library(
diff --git a/xla/service/gpu/runtime/command_buffer_cmd.cc b/xla/service/gpu/runtime/command_buffer_cmd.cc
index 0a1a46980ed72..fa41af403d48b 100644
--- a/xla/service/gpu/runtime/command_buffer_cmd.cc
+++ b/xla/service/gpu/runtime/command_buffer_cmd.cc
@@ -45,6 +45,8 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/computation_placer.h"
+#include "xla/service/custom_call_status.h"
+#include "xla/service/custom_call_status_internal.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/kernels/custom_kernel.h"
@@ -59,13 +61,14 @@ limitations under the License.
 #include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/stream_executor_util.h"
-#include "xla/service/service_executable_run_options.h"
 #include "xla/stream_executor/command_buffer.h"
+#include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/dnn.h"
+#include "xla/stream_executor/gpu/gpu_stream.h"
+#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
-#include "xla/stream_executor/lazy_op_runner.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/trace_command_buffer_factory.h"
@@ -78,13 +81,6 @@ limitations under the License.
 #include "tsl/platform/statusor.h"
 #include "tsl/profiler/lib/scoped_annotation.h"
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-#include "xla/service/custom_call_status.h"
-#include "xla/service/custom_call_status_internal.h"
-#include "xla/stream_executor/gpu/gpu_stream.h"
-#include "xla/stream_executor/gpu/gpu_types.h"
-#endif
-
 namespace xla::gpu {
 
 namespace {
@@ -555,20 +551,20 @@ CommandBufferCmd::BufferUsageVector ComputationIdCmd::buffers() {
 
 absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params,
                                           StateManager& state) {
-#if defined(GOOGLE_CUDA)
-  {
-    absl::MutexLock lock(&mutex_);
-    if (memset_kernels_.contains(params.executor)) return absl::OkStatus();
-  }
+  if (params.executor->GetPlatform()->id() == se::cuda::kCudaPlatformId) {
+    {
+      absl::MutexLock lock(&mutex_);
+      if (memset_kernels_.contains(params.executor)) return absl::OkStatus();
+    }
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Kernel> kernel,
-                      CreateKernel("memset32", 3, kMemset32Kernel,
-                                   /*cubin_data=*/{}, params.executor,
-                                   /*shared_mem_bytes=*/0));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Kernel> kernel,
+                        CreateKernel("memset32", 3, kMemset32Kernel,
+                                     /*cubin_data=*/{}, params.executor,
+                                     /*shared_mem_bytes=*/0));
 
-  absl::MutexLock lock(&mutex_);
-  memset_kernels_.emplace(params.executor, std::move(kernel));
-#endif  // GOOGLE_CUDA
+    absl::MutexLock lock(&mutex_);
+    memset_kernels_.emplace(params.executor, std::move(kernel));
+  }
   return absl::OkStatus();
 }
 
@@ -595,24 +591,29 @@ absl::Status ComputationIdCmd::Record(
           << "; execution_scope_id=" << execution_scope_id.value();
   VLOG(5) << "  Id: " << dest_ << " (" << dst.opaque() << ")";
 
-#if defined(GOOGLE_CUDA)
-  se::Kernel* memset_kernel = [&] {
+  const bool has_memset_kernel = [&] {
     absl::MutexLock lock(&mutex_);
-    return memset_kernels_[execute_params.stream->parent()].get();
+    return memset_kernels_.contains(execute_params.stream->parent());
   }();
 
-  if (memset_kernel == nullptr) {
-    return absl::InternalError(
-        "Memset kernel not loaded on a command buffer executor");
-  }
+  if (has_memset_kernel) {
+    se::Kernel* memset_kernel = [&] {
+      absl::MutexLock lock(&mutex_);
+      return memset_kernels_[execute_params.stream->parent()].get();
+    }();
 
-  auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst);
-  return command_buffer->Launch(execution_scope_id, se::ThreadDim(1),
-                                se::BlockDim(1), *memset_kernel, *args);
-#else
-  return command_buffer->Memset(execution_scope_id, &dst, value,
-                                /*num_elements=*/1);
-#endif  // GOOGLE_CUDA
+    if (memset_kernel == nullptr) {
+      return absl::InternalError(
+          "Memset kernel not loaded on a command buffer executor");
+    }
+
+    auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst);
+    return command_buffer->Launch(execution_scope_id, se::ThreadDim(1),
+                                  se::BlockDim(1), *memset_kernel, *args);
+  } else {
+    return command_buffer->Memset(execution_scope_id, &dst, value,
+                                  /*num_elements=*/1);
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1439,7 +1440,6 @@ absl::Status CustomCallCmd::RecordLegacyCustomCall(
     }
   }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   TF_ASSIGN_OR_RETURN(
       auto nested_cmd,
       se::TraceCommandBufferFactory::Create(
@@ -1460,11 +1460,6 @@ absl::Status CustomCallCmd::RecordLegacyCustomCall(
 
   return command_buffer->AddNestedCommandBuffer(execution_scope_id,
                                                 *nested_cmd);
-#else   //  GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  return Unavailable(
-      "Custom calls on GPU are not supported in this configuration. Please "
-      "build with --config=cuda or --config=rocm");
-#endif  //   GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
 absl::Status CustomCallCmd::RecordXlaFfiCall(
@@ -1519,7 +1514,6 @@ absl::Status CustomCallCmd::RecordXlaFfiCall(
   builder.AddAttributes(attrs.Build());
   ffi::CallFrame call_frame = builder.Build();
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   TF_ASSIGN_OR_RETURN(
       auto nested_cmd,
       se::TraceCommandBufferFactory::Create(
@@ -1537,11 +1531,6 @@ absl::Status CustomCallCmd::RecordXlaFfiCall(
 
   return command_buffer->AddNestedCommandBuffer(execution_scope_id,
                                                 *nested_cmd);
-#else   //  GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  return Unavailable(
-      "Custom calls on GPU are not supported in this configuration. Please "
-      "build with --config=cuda or --config=rocm");
-#endif  //   GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
 CommandBufferCmd::BufferUsageVector CustomCallCmd::buffers() {
diff --git a/xla/service/gpu/runtime/command_buffer_thunk_test.cc b/xla/service/gpu/runtime/command_buffer_thunk_test.cc
index bce9d1927d05e..6d9b4514fb174 100644
--- a/xla/service/gpu/runtime/command_buffer_thunk_test.cc
+++ b/xla/service/gpu/runtime/command_buffer_thunk_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstdint>
+#include <functional>
 #include <memory>
 #include <optional>
 #include <string>
@@ -29,6 +30,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
+#include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
@@ -40,6 +42,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/command_buffer.h"
+#include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/gpu/gpu_test_kernels.h"
@@ -47,8 +50,10 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_types.h"  // IWYU pragma: keep
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
+#include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform_manager.h"
+#include "xla/stream_executor/semantic_version.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
 #include "xla/tsl/lib/core/status_test_util.h"
@@ -58,10 +63,6 @@ limitations under the License.
 #include "tsl/platform/test.h"
 #include "tsl/profiler/lib/profiler_lock.h"
 
-#ifdef GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"
-#endif
-
 namespace xla::gpu {
 
 using MemoryAccess = CommandBufferCmd::MemoryAccess;
@@ -101,14 +102,17 @@ KernelArgsPacking CreateDefaultArgsPacking() {
 }
 
 // Some of the tests rely on CUDA 12.3+ features.
-bool IsAtLeastCuda12300() {
-#if defined(TENSORFLOW_USE_ROCM)
-  return false;
-#endif
-#if CUDA_VERSION >= 12030
+static bool IsAtLeastCuda12300(
+    const stream_executor::StreamExecutor* executor) {
+  if (executor->GetPlatform()->id() != stream_executor::cuda::kCudaPlatformId) {
+    return false;
+  }
+  if (std::min({executor->GetDeviceDescription().runtime_version(),
+                executor->GetDeviceDescription().driver_version()}) <
+      stream_executor::SemanticVersion{12, 3, 0}) {
+    return false;
+  }
   return true;
-#endif
-  return false;
 }
 
 // Give a short aliases to execution threads.
@@ -592,12 +596,12 @@ TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) {
 }
 
 TEST(CommandBufferThunkTest, GemmCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph tracing is not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   int64_t lhs_length = sizeof(float) * 2 * 4;
@@ -708,12 +712,12 @@ TEST(CommandBufferThunkTest, GemmCmd) {
 }
 
 TEST(CommandBufferThunkTest, CublasLtCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph tracing is not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream1, executor->CreateStream());
   TF_ASSERT_OK_AND_ASSIGN(auto stream2, executor->CreateStream());
 
@@ -967,12 +971,12 @@ TEST(CommandBufferThunkTest, MultipleLaunchCmd) {
 }
 
 TEST(CommandBufferThunkTest, IfCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph conditionals are not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   int64_t length = 4;
@@ -1055,12 +1059,12 @@ TEST(CommandBufferThunkTest, IfCmd) {
 }
 
 TEST(CommandBufferThunkTest, IfElseCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph conditionals are not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   int64_t length = 4;
@@ -1148,12 +1152,12 @@ TEST(CommandBufferThunkTest, IfElseCmd) {
 }
 
 TEST(CommandBufferThunkTest, CaseCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph conditionals are not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   int64_t length = 4;
@@ -1237,12 +1241,12 @@ TEST(CommandBufferThunkTest, CaseCmd) {
 }
 
 TEST(CommandBufferThunkTest, ForCmd) {
-  if (!IsAtLeastCuda12300()) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  if (!IsAtLeastCuda12300(executor)) {
     GTEST_SKIP() << "CUDA graph conditionals are not supported";
   }
 
-  se::StreamExecutor* executor = GpuExecutor();
-
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   int64_t length = 4;
diff --git a/xla/service/gpu/runtime/convolution_thunk.cc b/xla/service/gpu/runtime/convolution_thunk.cc
index 8577fe9f55106..35b0e34e404d3 100644
--- a/xla/service/gpu/runtime/convolution_thunk.cc
+++ b/xla/service/gpu/runtime/convolution_thunk.cc
@@ -27,17 +27,18 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
-#if TENSORFLOW_USE_ROCM
-#include "xla/service/gpu/stream_executor_util.h"
-#endif  // TENSORFLOW_USE_ROCM
 #include "xla/service/gpu/gpu_conv_runner.h"
 #include "xla/service/gpu/runtime/thunk.h"
+#include "xla/service/gpu/stream_executor_util.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/dnn.h"
+#include "xla/stream_executor/lazy_op_runner.h"
+#include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
@@ -87,8 +88,9 @@ absl::Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) {
   RunConvOptions opts;
   opts.runner_cache = &GetOrCreateRunner(params.stream, &runner_created);
 
-#if TENSORFLOW_USE_ROCM
-  if (runner_created) {
+  if (params.stream->parent()->GetPlatform()->id() ==
+          se::rocm::kROCmPlatformId &&
+      runner_created) {
     TF_ASSIGN_OR_RETURN(
         GpuConvParams conv_params,
         GetGpuConvParams(config_, operand_se_buffers, result_se_buffers));
@@ -116,7 +118,6 @@ absl::Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) {
         conv_params.output_buf, config_.conv_desc, &scratch_allocator,
         &profile_results);
   }
-#endif  // TENSORFLOW_USE_ROCM
 
   TF_RETURN_IF_ERROR(RunGpuConv(config_, absl::MakeSpan(operand_se_buffers),
                                 absl::MakeSpan(result_se_buffers), scratch,
diff --git a/xla/service/gpu/runtime/custom_call_thunk.cc b/xla/service/gpu/runtime/custom_call_thunk.cc
index 7cf44c109cb26..f34f202a613d4 100644
--- a/xla/service/gpu/runtime/custom_call_thunk.cc
+++ b/xla/service/gpu/runtime/custom_call_thunk.cc
@@ -39,14 +39,11 @@ limitations under the License.
 #include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-#include "xla/stream_executor/gpu/gpu_stream.h"
-#endif
-
 namespace xla {
 namespace gpu {
 
@@ -137,7 +134,6 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
     }
   }
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   auto gpu_stream = se::gpu::AsGpuStreamValue(params.stream);
   XlaCustomCallStatus custom_call_status;
   call_target_(gpu_stream, buffers.data(), opaque_.data(), opaque_.size(),
@@ -148,11 +144,6 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
   } else {
     return absl::OkStatus();
   }
-#else   //  GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  return Unavailable(
-      "Custom calls on GPU are not supported in this configuration. Please "
-      "build with --config=cuda or --config=rocm");
-#endif  //   GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
 absl::Status CustomCallThunk::ExecuteFfiHandler(
diff --git a/xla/service/gpu/runtime/custom_call_thunk.h b/xla/service/gpu/runtime/custom_call_thunk.h
index c65676381f9c8..41a8048570344 100644
--- a/xla/service/gpu/runtime/custom_call_thunk.h
+++ b/xla/service/gpu/runtime/custom_call_thunk.h
@@ -39,11 +39,8 @@ limitations under the License.
 #include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_memory_allocator.h"
-#include "xla/stream_executor/stream.h"
-
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/stream_executor/gpu/gpu_types.h"
-#endif
+#include "xla/stream_executor/stream.h"
 
 namespace xla {
 namespace gpu {
@@ -61,12 +58,7 @@ namespace gpu {
 // compiler is allowed to create.
 class CustomCallThunk : public Thunk {
  public:
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   using Stream = stream_executor::gpu::GpuStreamHandle;
-#else   //  GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  using Stream = void*;
-#endif  //  GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-
   using CustomCallTarget = std::function<void(Stream, void**, const char*,
                                               size_t, XlaCustomCallStatus*)>;
 
diff --git a/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc b/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc
index 75c700b75b4e3..e072d9badb27e 100644
--- a/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc
+++ b/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc
@@ -17,15 +17,15 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstdint>
-#include <functional>
 #include <memory>
 #include <optional>
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
+#include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/ascii.h"
+#include "absl/strings/string_view.h"
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/service/buffer_assignment.h"
@@ -39,6 +39,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/command_buffer.h"
+#include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/gpu/gpu_types.h"  // IWYU pragma: keep
@@ -52,12 +53,6 @@ limitations under the License.
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
-#if GOOGLE_CUDA
-#define PLATFORM "CUDA"
-#elif TENSORFLOW_USE_ROCM
-#define PLATFORM "ROCM"
-#endif
-
 namespace xla::gpu {
 
 namespace {
@@ -398,7 +393,9 @@ XLA_FFI_DEFINE_HANDLER(kMemcpy, Memcpy,
                            .Arg<ffi::AnyBuffer>()  // src
                            .Ret<ffi::AnyBuffer>()  // dst
 );
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", PLATFORM,
+XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", "CUDA",
+                         kMemcpy);
+XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", "ROCM",
                          kMemcpy);
 
 TEST(DynamicSliceThunkTest, SlicedMemcpy) {
@@ -447,7 +444,11 @@ TEST(DynamicSliceThunkTest, SlicedMemcpy) {
 
   // Preparing custom call thunk: setting up call target and operands + results
   // buffers.
-  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  absl::string_view platform_name =
+      executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA"
+                                                                 : "ROCM";
+  auto registration =
+      xla::ffi::FindHandler("__xla_test$$memcpy", platform_name);
   ASSERT_TRUE(registration.ok());
 
   std::vector<std::optional<CustomCallThunk::Slice>> operands{
@@ -607,7 +608,11 @@ TEST(DynamicSliceThunkTest, SlicedOutputMemcpy) {
 
   // Preparing custom call thunk: setting up call target and operands + results
   // buffers.
-  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  const absl::string_view platform_name =
+      executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA"
+                                                                 : "ROCM";
+  auto registration =
+      xla::ffi::FindHandler("__xla_test$$memcpy", platform_name);
   ASSERT_TRUE(registration.ok());
 
   std::vector<std::optional<CustomCallThunk::Slice>> operands{
@@ -1248,7 +1253,11 @@ TEST(DynamicSliceThunkTest, SlicedMemcpyOOB) {
 
   // Preparing custom call thunk: setting up call target and operands + results
   // buffers.
-  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  absl::string_view platform_name =
+      executor->GetPlatform()->id() == se::cuda::kCudaPlatformId ? "CUDA"
+                                                                 : "ROCM";
+  auto registration =
+      xla::ffi::FindHandler("__xla_test$$memcpy", platform_name);
   ASSERT_TRUE(registration.ok());
 
   std::vector<std::optional<CustomCallThunk::Slice>> operands{
diff --git a/xla/service/gpu/runtime/nccl_collective_thunk.cc b/xla/service/gpu/runtime/nccl_collective_thunk.cc
index 8e075c8d01c73..db772eccb2d47 100644
--- a/xla/service/gpu/runtime/nccl_collective_thunk.cc
+++ b/xla/service/gpu/runtime/nccl_collective_thunk.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
-#include "mlir/IR/Value.h"
 #include "xla/debug_options_flags.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/layout_util.h"
@@ -43,13 +43,13 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/buffer_allocations.h"
-#include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/runtime/nccl_api.h"
 #include "xla/service/gpu/runtime/nccl_clique.h"
 #include "xla/service/gpu/runtime/nccl_clique_key.h"
 #include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/util.h"
diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD
index 490f5f4fce44f..a2d5f8857b070 100644
--- a/xla/service/gpu/tests/BUILD
+++ b/xla/service/gpu/tests/BUILD
@@ -701,6 +701,7 @@ lit_test_suite(
 # cc_binary(
 #     name = "xla-opt",
 #     srcs = ["xla-opt.cc"],
+#     tags = ["gpu"],
 #     deps = [
 #         "//xla/service/gpu/fusions/transforms:passes",
 #         "//xla/service/gpu/fusions/triton:passes",
diff --git a/xla/service/gpu/transforms/BUILD b/xla/service/gpu/transforms/BUILD
index 842dffa0028a6..12718715497de 100644
--- a/xla/service/gpu/transforms/BUILD
+++ b/xla/service/gpu/transforms/BUILD
@@ -1443,6 +1443,7 @@ cc_library(
     name = "fusion_merger",
     srcs = ["fusion_merger.cc"],
     hdrs = ["fusion_merger.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:shape_util",
         "//xla:util",
@@ -1471,6 +1472,7 @@ xla_cc_test(
     name = "fusion_merger_test",
     srcs = ["fusion_merger_test.cc"],
     tags = [
+        "gpu",
         "nomsan",
     ],
     deps = [
@@ -1994,6 +1996,7 @@ cc_library(
     name = "multi_output_fusion",
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:debug_options_flags",
         "//xla:shape_util",
@@ -2025,6 +2028,7 @@ xla_cc_test(
     name = "multi_output_fusion_test",
     srcs = ["multi_output_fusion_test.cc"],
     tags = [
+        "gpu",
         "nomsan",
     ],
     deps = [
@@ -2086,6 +2090,7 @@ cc_library(
     name = "priority_fusion",
     srcs = ["priority_fusion.cc"],
     hdrs = ["priority_fusion.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:debug_options_flags",
         "//xla:shape_util",
@@ -2137,6 +2142,7 @@ xla_cc_test(
     name = "priority_fusion_test",
     srcs = ["priority_fusion_test.cc"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = ["gpu"],
     deps = [
         ":priority_fusion",
         "//xla:shape_util",
@@ -2522,6 +2528,7 @@ cc_library(
     name = "softmax_rewriter_triton",
     srcs = ["softmax_rewriter_triton.cc"],
     hdrs = ["softmax_rewriter_triton.h"],
+    tags = ["gpu"],
     deps = [
         ":reduction_dimension_grouper",
         ":reduction_splitter",
@@ -2568,6 +2575,7 @@ cc_library(
 xla_cc_test(
     name = "softmax_rewriter_triton_test",
     srcs = ["softmax_rewriter_triton_test.cc"],
+    tags = ["gpu"],
     deps = [
         ":softmax_rewriter_triton",
         "//xla:shape_util",
@@ -2593,11 +2601,9 @@ xla_cc_test(
 
 cc_library(
     name = "sort_rewriter",
-    srcs = if_gpu_is_configured(
-        ["sort_rewriter.cc"],
-        ["sort_rewriter_stub.cc"],
-    ),
+    srcs = ["sort_rewriter.cc"],
     hdrs = ["sort_rewriter.h"],
+    tags = ["gpu"],
     deps = [
         "//xla:comparison_util",
         "//xla:shape_util",
@@ -2620,7 +2626,7 @@ cc_library(
 
 xla_test(
     name = "sort_rewriter_test",
-    srcs = if_cuda_is_configured(["sort_rewriter_test.cc"]),
+    srcs = ["sort_rewriter_test.cc"],
     backends = ["gpu"],
     tags = ["no_oss"],
     deps = [