Skip to content

Commit

Permalink
[allocator] Move getFreeMutex (pytorch#87237)
Browse files Browse the repository at this point in the history
It isn't used by all the allocators, and this change makes that more clear.
Pull Request resolved: pytorch#87237
Approved by: https://github.com/wconstab
  • Loading branch information
zdevito authored and pytorchmergebot committed Oct 19, 2022
1 parent 89e6078 commit f56ce8d
Show file tree
Hide file tree
Showing 10 changed files with 15 additions and 40 deletions.
3 changes: 1 addition & 2 deletions aten/src/ATen/native/cuda/jit_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <c10/util/irange.h>
#include <c10/util/hash.h>
#include <c10/util/Optional.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/jit_macros.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/OffsetCalculator.cuh>
Expand Down Expand Up @@ -927,7 +926,7 @@ void __inline__ initializeCudaContext() {
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx));
if (!pctx) {
std::unique_lock<std::mutex> cudaFreeMutexLock(
*(c10::cuda::CUDACachingAllocator::getFreeMutex()));
*(c10::cuda::getFreeMutex()));
cudaFree(nullptr);
}
}
Expand Down
11 changes: 0 additions & 11 deletions c10/cuda/CUDACachingAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1894,9 +1894,6 @@ class NativeCachingAllocator {
// allocated blocks by device pointer
ska::flat_hash_map<void*, Block*> allocated_blocks;

// lock around calls to cudaFree (to prevent deadlocks with NCCL)
mutable std::mutex cuda_free_mutex;

void add_allocated_block(Block* block) {
std::lock_guard<std::mutex> lock(mutex);
allocated_blocks[block->ptr] = block;
Expand All @@ -1905,10 +1902,6 @@ class NativeCachingAllocator {
public:
std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;

// Accessor for cuda_free_mutex, the lock taken around calls to cudaFree
// (held to prevent deadlocks with NCCL). Returns a pointer to the
// allocator-owned mutex; callers lock/unlock it themselves.
std::mutex* getCudaFreeMutex() const {
return &cuda_free_mutex;
}

Block* get_allocated_block(void* ptr, bool remove = false) {
std::lock_guard<std::mutex> lock(mutex);
auto it = allocated_blocks.find(ptr);
Expand Down Expand Up @@ -2156,10 +2149,6 @@ void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) {
caching_allocator.recordStream(ptr, stream);
}

// Exposes the process-wide cudaFree lock owned by the caching allocator
// singleton; simply forwards to NativeCachingAllocator::getCudaFreeMutex.
std::mutex* getFreeMutex() {
return caching_allocator.getCudaFreeMutex();
}

static inline void assertValidDevice(int device) {
const auto device_num = caching_allocator.device_allocator.size();
TORCH_CHECK(
Expand Down
6 changes: 0 additions & 6 deletions c10/cuda/CUDACachingAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,6 @@ using OutOfMemoryObserver = std::function<void(
_(C10_CUDA_API void, \
notifyCaptureDestroy, \
(int device, MempoolId_t mempool_id)) \
_(C10_CUDA_API std::mutex*, getFreeMutex, ()) \
_(C10_CUDA_API std::shared_ptr<void>, getIpcDevPtr, (std::string handle)) \
_(C10_CUDA_API void, \
recordHistory, \
Expand Down Expand Up @@ -324,11 +323,6 @@ inline void notifyCaptureEnded(int device, CaptureId_t graph_id) {
inline void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
return Chosen::notifyCaptureDestroy(device, mempool_id);
}

// Dispatches to the selected allocator backend's getFreeMutex
// (Chosen is the compile-time/backend-selected allocator interface).
inline std::mutex* getFreeMutex() {
return Chosen::getFreeMutex();
}

// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
return Chosen::getIpcDevPtr(handle);
Expand Down
12 changes: 0 additions & 12 deletions c10/cuda/CUDAMallocAsyncAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,6 @@ std::vector<UsageStream> dummy_unifying_free_streams;
// Keeping it simple with an ordinary mutex for now.
std::mutex general_mutex;

// Copy-paste CUDACachingAllocator's
// lock around calls to cudaFree (to prevent deadlocks with NCCL)
// is this safe?
std::mutex cuda_free_mutex;

/**
* Note [Avoid freeing uncaptured ptrs during CUDA graph capture]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -602,10 +597,6 @@ void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) {
}
}

// Returns this allocator's file-local cuda_free_mutex (declared above as a
// copy of CUDACachingAllocator's lock around cudaFree calls).
std::mutex* getFreeMutex() {
return &cuda_free_mutex;
}

std::shared_ptr<void> getIpcDevPtr(std::string handle) {
TORCH_CHECK(
false,
Expand Down Expand Up @@ -899,9 +890,6 @@ void notifyCaptureEnded(int device, CaptureId_t graph_id) {
void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
NOT_AVAILABLE("notifyCaptureDestroy");
}
// Stub for builds where this backend is unavailable; NOT_AVAILABLE is a
// macro defined elsewhere in this file — presumably it reports an error
// rather than returning (TODO confirm against its definition).
std::mutex* getFreeMutex() {
NOT_AVAILABLE("getFreeMutex");
}
std::shared_ptr<void> getIpcDevPtr(std::string handle) {
NOT_AVAILABLE("getIpcDevPtr");
}
Expand Down
5 changes: 5 additions & 0 deletions c10/cuda/CUDAMiscFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,10 @@ const char* get_cuda_check_suffix() noexcept {
"\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.";
}
}
// Hands out the single process-wide mutex used to serialize cudaFree calls.
// The mutex lives in a function-local static, so it is constructed lazily on
// first use (thread-safe since C++11) and every call returns the same object.
std::mutex* getFreeMutex() {
  static std::mutex free_mutex_instance;
  return &free_mutex_instance;
}

} // namespace cuda
} // namespace c10
5 changes: 4 additions & 1 deletion c10/cuda/CUDAMiscFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

#include <c10/cuda/CUDAMacros.h>

#include <mutex>

namespace c10 {
namespace cuda {
C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
}
C10_CUDA_API std::mutex* getFreeMutex();
} // namespace cuda
} // namespace c10
5 changes: 2 additions & 3 deletions torch/csrc/cuda/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <ATen/cuda/Sleep.h>
#include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/cuda/jiterator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>
#ifdef USE_NCCL
Expand Down Expand Up @@ -407,7 +406,7 @@ PyObject* THCPModule_cudaSleep(PyObject* _unused, PyObject* cycles) {
static PyGILState_STATE cudaMutexGILState;

PyObject* THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) {
auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex();
auto mutex = c10::cuda::getFreeMutex();
// This has to be a busy loop because we **absolutely need to** hold the GIL
// or it's a recipe for a deadlock otherwise (if we let other Python threads
// run while we have the cudaMutex, but not the GIL, they might try to e.g.
Expand All @@ -427,7 +426,7 @@ PyObject* THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) {
}

PyObject* THCPModule_cudaUnlockMutex(PyObject* module, PyObject* noargs) {
auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex();
auto mutex = c10::cuda::getFreeMutex();
PyGILState_Release(cudaMutexGILState);
mutex->unlock();
Py_RETURN_NONE;
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/cuda/nccl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ void check_inputs(
} // namespace detail

AutoNcclGroup::AutoNcclGroup() {
(c10::cuda::CUDACachingAllocator::getFreeMutex())->lock();
(c10::cuda::getFreeMutex())->lock();
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
detail::NCCL_CHECK(ncclGroupStart());
#endif
Expand All @@ -307,7 +307,7 @@ AutoNcclGroup::~AutoNcclGroup() noexcept(false) {
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
detail::NCCL_CHECK(ncclGroupEnd());
#endif
(c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock();
(c10::cuda::getFreeMutex())->unlock();
}

bool is_available(TensorList tensors) {
Expand Down
1 change: 0 additions & 1 deletion torch/csrc/cuda/nccl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Optional.h>

#include <cstddef>
Expand Down
3 changes: 1 addition & 2 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>

#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/irange.h>

#include <torch/csrc/jit/codegen/cuda/contiguity.h>
Expand Down Expand Up @@ -941,7 +940,7 @@ void initializeCudaContext() {
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx));
if (!pctx) {
std::unique_lock<std::mutex> cudaFreeMutexLock(
*(c10::cuda::CUDACachingAllocator::getFreeMutex()));
*(c10::cuda::getFreeMutex()));
cudaFree(nullptr);
}
}
Expand Down

0 comments on commit f56ce8d

Please sign in to comment.