Skip to content

Commit

Permalink
Reverts 99353d8
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 658402030
  • Loading branch information
klucke authored and copybara-github committed Aug 1, 2024
1 parent c83a127 commit ce96385
Show file tree
Hide file tree
Showing 13 changed files with 44 additions and 144 deletions.
2 changes: 1 addition & 1 deletion xla/backends/interpreter/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class XlaInterpreterExecutor : public StreamExecutorCommon {
uint64_t size) override {
return std::make_unique<HostMemoryAllocation>(new char[size], size, this);
}
void HostMemoryDeallocate(void *mem, uint64_t size) override {
void HostMemoryDeallocate(void *mem) override {
delete[] static_cast<char *>(mem);
}

Expand Down
1 change: 0 additions & 1 deletion xla/stream_executor/cuda/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -824,7 +824,6 @@ cuda_only_cc_library(
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:fingerprint",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:statusor",
] + if_cuda_is_configured([":delay_kernel_cuda"]),
alwayslink = True,
Expand Down
103 changes: 21 additions & 82 deletions xla/stream_executor/cuda/cuda_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ limitations under the License.
#include "tsl/platform/errors.h"
#include "tsl/platform/fingerprint.h"
#include "tsl/platform/logging.h"
#include "tsl/platform/numa.h"
#include "tsl/platform/statusor.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
Expand Down Expand Up @@ -154,27 +153,13 @@ GpuExecutor::~GpuExecutor() {
}
}

static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
int device_ordinal);

absl::Status GpuExecutor::Init() {
TF_RETURN_IF_ERROR(GpuDriver::Init());
TF_RETURN_IF_ERROR(GpuDriver::GetDevice(device_ordinal_, &device_));
TF_RETURN_IF_ERROR(
GpuDriver::CreateContext(device_ordinal_, device_, &context_));
TF_RETURN_IF_ERROR(
GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_));
std::optional<int> numa_node = TryToReadNumaNode(
absl::AsciiStrToLower(GpuDriver::GetPCIBusID(device_ordinal_)),
device_ordinal_);
if (!numa_node || *numa_node < 0) {
LOG(WARNING) << "NUMA node could not be determined for device "
<< device_ordinal_
<< ", host memory allocations will not be NUMA-pinned";
numa_node_ = tsl::port::kNUMANoAffinity;
} else {
numa_node_ = *numa_node;
}
return absl::OkStatus();
}

Expand Down Expand Up @@ -588,47 +573,6 @@ void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them). There's
// no external interface for us to otherwise control these DMA settings.
absl::StatusOr<std::unique_ptr<MemoryAllocation>>
GpuExecutor::HostMemoryAllocate(uint64_t size) {
if (numa_node_ != tsl::port::kNUMANoAffinity) {
auto* buffer =
tsl::port::NUMAMalloc(numa_node_, size, /* minimum_alignment=*/16);
if (buffer == nullptr && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate host memory of size %d pinned to NUMA node %d",
size, numa_node_));
}
if (size > 0 && !GpuDriver::HostRegister(context_, buffer, size)) {
return absl::InternalError(
absl::StrFormat("Failed to register host memory of size %d pinned to "
"NUMA node %d with the GPU driver",
size, numa_node_));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
} else {
auto* buffer = GpuDriver::HostAllocate(context_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}
}

void GpuExecutor::HostMemoryDeallocate(void* location, uint64_t size) {
if (numa_node_ != tsl::port::kNUMANoAffinity) {
if (size > 0) {
GpuDriver::HostUnregister(context_, location);
}
tsl::port::NUMAFree(location, size);
} else {
GpuDriver::HostDeallocate(context_, location);
}
}

bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
Expand Down Expand Up @@ -829,22 +773,22 @@ std::unique_ptr<GpuCommandBuffer> GpuExecutor::CreateCommandBuffer(
GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS.
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling. nvmlDeviceGetMemoryAffinity could also be
// used.
static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
int device_ordinal) {
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const std::string& pci_bus_id,
int device_ordinal) {
#if defined(PLATFORM_WINDOWS)
// Windows support for NUMA is not currently implemented. Return node 0.
return 0;
#else
VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
static const int kUnknownNumaNode = -1;

if (pci_bus_id.empty()) {
LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
return std::nullopt;
return kUnknownNumaNode;
}

std::string filename =
Expand All @@ -857,7 +801,7 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
if (file == nullptr) {
LOG(INFO) << "could not open file to read NUMA node: " << filename
<< "\nYour kernel may have been built without NUMA support.";
return std::nullopt;
return kUnknownNumaNode;
}

std::string content;
Expand All @@ -868,6 +812,17 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,

int32_t value;
if (absl::SimpleAtoi(content, &value)) {
if (value < 0) { // See http://b/18228951 for details on this path.
LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
<< value
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero."
" See more at "
"https://github.com/torvalds/linux/blob/v6.0/Documentation/"
"ABI/testing/sysfs-bus-pci#L344-L355";
fclose(file);
return 0;
}
fclose(file);
return value;
}
Expand All @@ -877,7 +832,7 @@ static std::optional<int> TryToReadNumaNode(const std::string& pci_bus_id,
<< content;

fclose(file);
return std::nullopt;
return kUnknownNumaNode;
#endif
}

Expand Down Expand Up @@ -909,24 +864,8 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) {
builder.set_pci_bus_id(pci_bus_id);

// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
std::optional<int> numa_node =
TryToReadNumaNode(pci_bus_id, device_ordinal);
if (numa_node.has_value()) {
if (*numa_node < 0) { // See http://b/18228951 for details on this path.
LOG(INFO)
<< "successful NUMA node read from SysFS had negative value ("
<< *numa_node
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero."
" See more at "
"https://github.com/torvalds/linux/blob/v6.0/Documentation/"
"ABI/testing/sysfs-bus-pci#L344-L355";
numa_node = 0;
}
} else {
numa_node = -1;
}
builder.set_numa_node(*numa_node);
int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
builder.set_numa_node(numa_node);
}

{
Expand Down
2 changes: 0 additions & 2 deletions xla/stream_executor/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ gpu_only_cc_library(
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/synchronization",
"@com_google_absl//absl/types:span",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:thread_annotations",
],
)
Expand Down Expand Up @@ -799,7 +798,6 @@ xla_test(
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/platform:test",
] + if_cuda([
Expand Down
24 changes: 16 additions & 8 deletions xla/stream_executor/gpu/gpu_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ limitations under the License.
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_common.h"
#include "tsl/platform/numa.h"
#include "tsl/platform/thread_annotations.h"

namespace stream_executor {
Expand Down Expand Up @@ -113,8 +112,7 @@ class GpuExecutor : public StreamExecutorCommon {
device_ordinal_(device_ordinal),
cc_major_(0),
cc_minor_(0),
version_(0),
numa_node_(tsl::port::kNUMANoAffinity) {}
version_(0) {}

// See the corresponding StreamExecutor methods for method comments on the
// following overrides.
Expand Down Expand Up @@ -169,10 +167,23 @@ class GpuExecutor : public StreamExecutorCommon {
return GpuCollectives::CollectiveMemoryDeallocate(context_, location);
}

// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
// settings.
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
uint64_t size) override;
uint64_t size) override {
auto* buffer = GpuDriver::HostAllocate(context_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void HostMemoryDeallocate(void* location, uint64_t size) override;
void HostMemoryDeallocate(void* location) override {
return GpuDriver::HostDeallocate(context_, location);
}

absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override {
return GpuDriver::GetPointerMemorySpace(
Expand Down Expand Up @@ -369,9 +380,6 @@ class GpuExecutor : public StreamExecutorCommon {
// GPU ISA version for device_.
int version_;

// NUMA node for device_.
int numa_node_;

// Type erased XLA specific state attached to GpuExecutor.
Object xla_state_;

Expand Down
29 changes: 0 additions & 29 deletions xla/stream_executor/gpu/gpu_executor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ limitations under the License.
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "tsl/platform/numa.h"
#include "tsl/platform/statusor.h"
#include "tsl/platform/test.h"

Expand Down Expand Up @@ -55,32 +54,4 @@ TEST_F(GetPointerMemorySpaceTest, Device) {
executor->Deallocate(&mem);
}

using HostMemoryAllocateTest = GpuExecutorTest;

TEST_F(HostMemoryAllocateTest, Numa) {
Platform* platform = GetPlatform();
const uint64_t kSize = 1024;
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
TF_ASSERT_OK_AND_ASSIGN(auto device_desc,
executor->CreateDeviceDescription());
ASSERT_TRUE(device_desc);
TF_ASSERT_OK_AND_ASSIGN(auto host_ptr, executor->HostMemoryAllocate(kSize));
ASSERT_TRUE(host_ptr);
EXPECT_NE(host_ptr->opaque(), nullptr);
const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
if (numa_node == tsl::port::kNUMANoAffinity) {
// Could be because `executor` could not determine its own NUMA node, in
// which case numa_node() will be -1 or 0, depending on the failure mode.
EXPECT_LE(device_desc->numa_node(), 0);
EXPECT_GE(device_desc->numa_node(), -1);
} else {
EXPECT_EQ(device_desc->numa_node(), numa_node);
}
}
}

} // namespace stream_executor
2 changes: 1 addition & 1 deletion xla/stream_executor/host/host_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class HostExecutor : public StreamExecutorCommon {
uint64_t size) override {
return std::make_unique<HostMemoryAllocation>(new char[size], size, this);
}
void HostMemoryDeallocate(void* mem, uint64_t size) override {
void HostMemoryDeallocate(void* mem) override {
delete[] static_cast<char*>(mem);
}

Expand Down
2 changes: 1 addition & 1 deletion xla/stream_executor/host_memory_allocation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ HostMemoryAllocation::HostMemoryAllocation(void* ptr, uint64_t size,

HostMemoryAllocation::~HostMemoryAllocation() {
if (ptr_ != nullptr && executor_ != nullptr) {
executor_->HostMemoryDeallocate(ptr_, size_);
executor_->HostMemoryDeallocate(ptr_);
}
}

Expand Down
2 changes: 1 addition & 1 deletion xla/stream_executor/integrations/device_mem_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
auto status = stream_exec_->CollectiveMemoryDeallocate(ptr);
CHECK(status.ok()) << status.message();
} else if (memory_type_ == MemoryType::kHost) {
stream_exec_->HostMemoryDeallocate(ptr, num_bytes);
stream_exec_->HostMemoryDeallocate(ptr);
} else {
DeviceMemoryBase device_ptr(ptr);
stream_exec_->Deallocate(&device_ptr);
Expand Down
3 changes: 1 addition & 2 deletions xla/stream_executor/mock_stream_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ class MockStreamExecutor : public StreamExecutor {
(override));
MOCK_METHOD(absl::StatusOr<std::unique_ptr<MemoryAllocation>>,
HostMemoryAllocate, (uint64_t size), (override));
MOCK_METHOD(void, HostMemoryDeallocate, (void* mem, uint64_t size),
(override));
MOCK_METHOD(void, HostMemoryDeallocate, (void* mem), (override));
MOCK_METHOD(bool, SynchronizeAllActivity, (), (override));
MOCK_METHOD(absl::Status, SynchronousMemZero,
(DeviceMemoryBase * location, uint64_t size), (override));
Expand Down
14 changes: 0 additions & 14 deletions xla/stream_executor/rocm/rocm_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -447,20 +447,6 @@ void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
GpuExecutor::HostMemoryAllocate(uint64_t size) {
auto* buffer = GpuDriver::HostAllocate(context_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void GpuExecutor::HostMemoryDeallocate(void* location, uint64_t size) {
return GpuDriver::HostDeallocate(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
Expand Down
2 changes: 1 addition & 1 deletion xla/stream_executor/stream_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ class StreamExecutor {
uint64_t size) = 0;

// Deallocates a region of host memory allocated by HostMemoryAllocate().
virtual void HostMemoryDeallocate(void* mem, uint64_t size) = 0;
virtual void HostMemoryDeallocate(void* mem) = 0;

// Returns the memory space of the given pointer.
virtual absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) {
Expand Down
2 changes: 1 addition & 1 deletion xla/stream_executor/tpu/tpu_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
uint64_t size) override {
LOG(FATAL) << "not yet implemented";
}
void HostMemoryDeallocate(void* mem, uint64_t size) override {
void HostMemoryDeallocate(void* mem) override {
LOG(FATAL) << "not yet implemented";
}
absl::Status SynchronousMemZero(DeviceMemoryBase* location,
Expand Down

0 comments on commit ce96385

Please sign in to comment.