Reverts 99353d8

PiperOrigin-RevId: 658402030
openxla · Aug 1, 2024 · ce96385 · ce96385
1 parent c83a127
commit ce96385
Show file tree

Hide file tree

Showing 13 changed files with 44 additions and 144 deletions.
diff --git a/xla/backends/interpreter/executor.h b/xla/backends/interpreter/executor.h
@@ -103,7 +103,7 @@ class XlaInterpreterExecutor : public StreamExecutorCommon {
       uint64_t size) override {
     return std::make_unique<HostMemoryAllocation>(new char[size], size, this);
   }
-  void HostMemoryDeallocate(void *mem, uint64_t size) override {
+  void HostMemoryDeallocate(void *mem) override {
     delete[] static_cast<char *>(mem);
   }
 

diff --git a/xla/stream_executor/cuda/BUILD b/xla/stream_executor/cuda/BUILD
@@ -824,7 +824,6 @@ cuda_only_cc_library(
         "@tsl//tsl/platform:errors",
         "@tsl//tsl/platform:fingerprint",
         "@tsl//tsl/platform:logging",
-        "@tsl//tsl/platform:platform_port",
         "@tsl//tsl/platform:statusor",
     ] + if_cuda_is_configured([":delay_kernel_cuda"]),
     alwayslink = True,

diff --git a/xla/stream_executor/cuda/cuda_executor.cc b/xla/stream_executor/cuda/cuda_executor.cc
@@ -86,7 +86,6 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/fingerprint.h"
 #include "tsl/platform/logging.h"
-#include "tsl/platform/numa.h"
 #include "tsl/platform/statusor.h"
 
 // LOG(ERROR) uses a const named ERROR, so a macro with the same name is
@@ -154,27 +153,13 @@ GpuExecutor::~GpuExecutor() {
   }
 }
 
-static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
-                                            int device_ordinal);
-
 absl::Status GpuExecutor::Init() {
   TF_RETURN_IF_ERROR(GpuDriver::Init());
   TF_RETURN_IF_ERROR(GpuDriver::GetDevice(device_ordinal_, &device_));
   TF_RETURN_IF_ERROR(
       GpuDriver::CreateContext(device_ordinal_, device_, &context_));
   TF_RETURN_IF_ERROR(
       GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_));
-  std::optional<int> numa_node = TryToReadNumaNode(
-      absl::AsciiStrToLower(GpuDriver::GetPCIBusID(device_ordinal_)),
-      device_ordinal_);
-  if (!numa_node || *numa_node < 0) {
-    LOG(WARNING) << "NUMA node could not be determined for device "
-                 << device_ordinal_
-                 << ", host memory allocations will not be NUMA-pinned";
-    numa_node_ = tsl::port::kNUMANoAffinity;
-  } else {
-    numa_node_ = *numa_node;
-  }
   return absl::OkStatus();
 }
 
@@ -588,47 +573,6 @@ void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
   GpuDriver::DeviceDeallocate(context_, mem->opaque());
 }
 
-// CUDA allocation/registration functions are necessary because the driver
-// internally sets up buffers for DMA operations (and page locks them). There's
-// no external interface for us to otherwise control these DMA settings.
-absl::StatusOr<std::unique_ptr<MemoryAllocation>>
-GpuExecutor::HostMemoryAllocate(uint64_t size) {
-  if (numa_node_ != tsl::port::kNUMANoAffinity) {
-    auto* buffer =
-        tsl::port::NUMAMalloc(numa_node_, size, /* minimum_alignment=*/16);
-    if (buffer == nullptr && size > 0) {
-      return absl::InternalError(absl::StrFormat(
-          "Failed to allocate host memory of size %d pinned to NUMA node %d",
-          size, numa_node_));
-    }
-    if (size > 0 && !GpuDriver::HostRegister(context_, buffer, size)) {
-      return absl::InternalError(
-          absl::StrFormat("Failed to register host memory of size %d pinned to "
-                          "NUMA node %d with the GPU driver",
-                          size, numa_node_));
-    }
-    return std::make_unique<HostMemoryAllocation>(buffer, size, this);
-  } else {
-    auto* buffer = GpuDriver::HostAllocate(context_, size);
-    if (buffer == nullptr && size > 0) {
-      return absl::InternalError(
-          absl::StrFormat("Failed to allocate HostMemory of size %d", size));
-    }
-    return std::make_unique<HostMemoryAllocation>(buffer, size, this);
-  }
-}
-
-void GpuExecutor::HostMemoryDeallocate(void* location, uint64_t size) {
-  if (numa_node_ != tsl::port::kNUMANoAffinity) {
-    if (size > 0) {
-      GpuDriver::HostUnregister(context_, location);
-    }
-    tsl::port::NUMAFree(location, size);
-  } else {
-    GpuDriver::HostDeallocate(context_, location);
-  }
-}
-
 bool GpuExecutor::SynchronizeAllActivity() {
   return GpuDriver::SynchronizeContext(context_);
 }
@@ -829,22 +773,22 @@ std::unique_ptr<GpuCommandBuffer> GpuExecutor::CreateCommandBuffer(
 GpuContext* GpuExecutor::gpu_context() { return context_; }
 
 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
-// of SysFS.
+// of SysFS. Returns -1 if it cannot.
 //
 // For anything more complicated/prod-focused than this, you'll likely want to
-// turn to gsys' topology modeling. nvmlDeviceGetMemoryAffinity could also be
-// used.
-static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
-                                            int device_ordinal) {
+// turn to gsys' topology modeling.
+static int TryToReadNumaNode(const std::string& pci_bus_id,
+                             int device_ordinal) {
 #if defined(PLATFORM_WINDOWS)
   // Windows support for NUMA is not currently implemented. Return node 0.
   return 0;
 #else
   VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
+  static const int kUnknownNumaNode = -1;
 
   if (pci_bus_id.empty()) {
     LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
-    return std::nullopt;
+    return kUnknownNumaNode;
   }
 
   std::string filename =
@@ -857,7 +801,7 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
   if (file == nullptr) {
     LOG(INFO) << "could not open file to read NUMA node: " << filename
               << "\nYour kernel may have been built without NUMA support.";
-    return std::nullopt;
+    return kUnknownNumaNode;
   }
 
   std::string content;
@@ -868,6 +812,17 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
 
   int32_t value;
   if (absl::SimpleAtoi(content, &value)) {
+    if (value < 0) {  // See http://b/18228951 for details on this path.
+      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero."
+                   " See more at "
+                   "https://github.com/torvalds/linux/blob/v6.0/Documentation/"
+                   "ABI/testing/sysfs-bus-pci#L344-L355";
+      fclose(file);
+      return 0;
+    }
     fclose(file);
     return value;
   }
@@ -877,7 +832,7 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
       << content;
 
   fclose(file);
-  return std::nullopt;
+  return kUnknownNumaNode;
 #endif
 }
 
@@ -909,24 +864,8 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) {
     builder.set_pci_bus_id(pci_bus_id);
 
     // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
-    std::optional<int> numa_node =
-        TryToReadNumaNode(pci_bus_id, device_ordinal);
-    if (numa_node.has_value()) {
-      if (*numa_node < 0) {  // See http://b/18228951 for details on this path.
-        LOG(INFO)
-            << "successful NUMA node read from SysFS had negative value ("
-            << *numa_node
-            << "), but there must be at least one NUMA node"
-               ", so returning NUMA node zero."
-               " See more at "
-               "https://github.com/torvalds/linux/blob/v6.0/Documentation/"
-               "ABI/testing/sysfs-bus-pci#L344-L355";
-        numa_node = 0;
-      }
-    } else {
-      numa_node = -1;
-    }
-    builder.set_numa_node(*numa_node);
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
+    builder.set_numa_node(numa_node);
   }
 
   {

diff --git a/xla/stream_executor/gpu/BUILD b/xla/stream_executor/gpu/BUILD
@@ -235,7 +235,6 @@ gpu_only_cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
-        "@tsl//tsl/platform:platform_port",
         "@tsl//tsl/platform:thread_annotations",
     ],
 )
@@ -799,7 +798,6 @@ xla_test(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
-        "@tsl//tsl/platform:platform_port",
         "@tsl//tsl/platform:statusor",
         "@tsl//tsl/platform:test",
     ] + if_cuda([

diff --git a/xla/stream_executor/gpu/gpu_executor.h b/xla/stream_executor/gpu/gpu_executor.h
@@ -59,7 +59,6 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_common.h"
-#include "tsl/platform/numa.h"
 #include "tsl/platform/thread_annotations.h"
 
 namespace stream_executor {
@@ -113,8 +112,7 @@ class GpuExecutor : public StreamExecutorCommon {
         device_ordinal_(device_ordinal),
         cc_major_(0),
         cc_minor_(0),
-        version_(0),
-        numa_node_(tsl::port::kNUMANoAffinity) {}
+        version_(0) {}
 
   // See the corresponding StreamExecutor methods for method comments on the
   // following overrides.
@@ -169,10 +167,23 @@ class GpuExecutor : public StreamExecutorCommon {
     return GpuCollectives::CollectiveMemoryDeallocate(context_, location);
   }
 
+  // CUDA allocation/registration functions are necessary because the driver
+  // internally sets up buffers for DMA operations (and page locks them).
+  // There's no external interface for us to otherwise control these DMA
+  // settings.
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
-      uint64_t size) override;
+      uint64_t size) override {
+    auto* buffer = GpuDriver::HostAllocate(context_, size);
+    if (buffer == nullptr && size > 0) {
+      return absl::InternalError(
+          absl::StrFormat("Failed to allocate HostMemory of size %d", size));
+    }
+    return std::make_unique<HostMemoryAllocation>(buffer, size, this);
+  }
 
-  void HostMemoryDeallocate(void* location, uint64_t size) override;
+  void HostMemoryDeallocate(void* location) override {
+    return GpuDriver::HostDeallocate(context_, location);
+  }
 
   absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override {
     return GpuDriver::GetPointerMemorySpace(
@@ -369,9 +380,6 @@ class GpuExecutor : public StreamExecutorCommon {
   // GPU ISA version for device_.
   int version_;
 
-  // NUMA node for device_.
-  int numa_node_;
-
   // Type erased XLA specific state attached to GpuExecutor.
   Object xla_state_;
 

diff --git a/xla/stream_executor/gpu/gpu_executor_test.cc b/xla/stream_executor/gpu/gpu_executor_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "tsl/platform/numa.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
@@ -55,32 +54,4 @@ TEST_F(GetPointerMemorySpaceTest, Device) {
   executor->Deallocate(&mem);
 }
 
-using HostMemoryAllocateTest = GpuExecutorTest;
-
-TEST_F(HostMemoryAllocateTest, Numa) {
-  Platform* platform = GetPlatform();
-  const uint64_t kSize = 1024;
-  const int num_devices = platform->VisibleDeviceCount();
-  for (int device = 0; device < num_devices; ++device) {
-    TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
-                            platform->ExecutorForDevice(device));
-    ASSERT_TRUE(executor);
-    TF_ASSERT_OK_AND_ASSIGN(auto device_desc,
-                            executor->CreateDeviceDescription());
-    ASSERT_TRUE(device_desc);
-    TF_ASSERT_OK_AND_ASSIGN(auto host_ptr, executor->HostMemoryAllocate(kSize));
-    ASSERT_TRUE(host_ptr);
-    EXPECT_NE(host_ptr->opaque(), nullptr);
-    const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
-    if (numa_node == tsl::port::kNUMANoAffinity) {
-      // Could be because `executor` could not determine its own NUMA node, in
-      // which case numa_node() will be -1 or 0, depending on the failure mode.
-      EXPECT_LE(device_desc->numa_node(), 0);
-      EXPECT_GE(device_desc->numa_node(), -1);
-    } else {
-      EXPECT_EQ(device_desc->numa_node(), numa_node);
-    }
-  }
-}
-
 }  // namespace stream_executor
diff --git a/xla/stream_executor/host/host_executor.h b/xla/stream_executor/host/host_executor.h
@@ -84,7 +84,7 @@ class HostExecutor : public StreamExecutorCommon {
       uint64_t size) override {
     return std::make_unique<HostMemoryAllocation>(new char[size], size, this);
   }
-  void HostMemoryDeallocate(void* mem, uint64_t size) override {
+  void HostMemoryDeallocate(void* mem) override {
     delete[] static_cast<char*>(mem);
   }
 

diff --git a/xla/stream_executor/host_memory_allocation.cc b/xla/stream_executor/host_memory_allocation.cc
@@ -27,7 +27,7 @@ HostMemoryAllocation::HostMemoryAllocation(void* ptr, uint64_t size,
 
 HostMemoryAllocation::~HostMemoryAllocation() {
   if (ptr_ != nullptr && executor_ != nullptr) {
-    executor_->HostMemoryDeallocate(ptr_, size_);
+    executor_->HostMemoryDeallocate(ptr_);
   }
 }
 

diff --git a/xla/stream_executor/integrations/device_mem_allocator.h b/xla/stream_executor/integrations/device_mem_allocator.h
@@ -82,7 +82,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
         auto status = stream_exec_->CollectiveMemoryDeallocate(ptr);
         CHECK(status.ok()) << status.message();
       } else if (memory_type_ == MemoryType::kHost) {
-        stream_exec_->HostMemoryDeallocate(ptr, num_bytes);
+        stream_exec_->HostMemoryDeallocate(ptr);
       } else {
         DeviceMemoryBase device_ptr(ptr);
         stream_exec_->Deallocate(&device_ptr);

diff --git a/xla/stream_executor/mock_stream_executor.h b/xla/stream_executor/mock_stream_executor.h
@@ -89,8 +89,7 @@ class MockStreamExecutor : public StreamExecutor {
               (override));
   MOCK_METHOD(absl::StatusOr<std::unique_ptr<MemoryAllocation>>,
               HostMemoryAllocate, (uint64_t size), (override));
-  MOCK_METHOD(void, HostMemoryDeallocate, (void* mem, uint64_t size),
-              (override));
+  MOCK_METHOD(void, HostMemoryDeallocate, (void* mem), (override));
   MOCK_METHOD(bool, SynchronizeAllActivity, (), (override));
   MOCK_METHOD(absl::Status, SynchronousMemZero,
               (DeviceMemoryBase * location, uint64_t size), (override));

diff --git a/xla/stream_executor/rocm/rocm_executor.cc b/xla/stream_executor/rocm/rocm_executor.cc
@@ -447,20 +447,6 @@ void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
   GpuDriver::DeviceDeallocate(context_, mem->opaque());
 }
 
-absl::StatusOr<std::unique_ptr<MemoryAllocation>>
-GpuExecutor::HostMemoryAllocate(uint64_t size) {
-  auto* buffer = GpuDriver::HostAllocate(context_, size);
-  if (buffer == nullptr && size > 0) {
-    return absl::InternalError(
-        absl::StrFormat("Failed to allocate HostMemory of size %d", size));
-  }
-  return std::make_unique<HostMemoryAllocation>(buffer, size, this);
-}
-
-void GpuExecutor::HostMemoryDeallocate(void* location, uint64_t size) {
-  return GpuDriver::HostDeallocate(context_, location);
-}
-
 bool GpuExecutor::SynchronizeAllActivity() {
   return GpuDriver::SynchronizeContext(context_);
 }

diff --git a/xla/stream_executor/stream_executor.h b/xla/stream_executor/stream_executor.h
@@ -200,7 +200,7 @@ class StreamExecutor {
       uint64_t size) = 0;
 
   // Deallocates a region of host memory allocated by HostMemoryAllocate().
-  virtual void HostMemoryDeallocate(void* mem, uint64_t size) = 0;
+  virtual void HostMemoryDeallocate(void* mem) = 0;
 
   // Returns the memory space of the given pointer.
   virtual absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) {

diff --git a/xla/stream_executor/tpu/tpu_executor.h b/xla/stream_executor/tpu/tpu_executor.h
@@ -137,7 +137,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
       uint64_t size) override {
     LOG(FATAL) << "not yet implemented";
   }
-  void HostMemoryDeallocate(void* mem, uint64_t size) override {
+  void HostMemoryDeallocate(void* mem) override {
     LOG(FATAL) << "not yet implemented";
   }
   absl::Status SynchronousMemZero(DeviceMemoryBase* location,