From f56ce8dbad728ca59a29b7dd089f5a705a40f70d Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 18 Oct 2022 13:24:52 -0700 Subject: [PATCH] [allocator] Move getFreeMutex (#87237) It isn't used at all by the allocators and this change makes that more clear. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87237 Approved by: https://github.com/wconstab --- aten/src/ATen/native/cuda/jit_utils.cpp | 3 +-- c10/cuda/CUDACachingAllocator.cpp | 11 ----------- c10/cuda/CUDACachingAllocator.h | 6 ------ c10/cuda/CUDAMallocAsyncAllocator.cpp | 12 ------------ c10/cuda/CUDAMiscFunctions.cpp | 5 +++++ c10/cuda/CUDAMiscFunctions.h | 5 ++++- torch/csrc/cuda/Module.cpp | 5 ++--- torch/csrc/cuda/nccl.cpp | 4 ++-- torch/csrc/cuda/nccl.h | 1 - torch/csrc/jit/codegen/cuda/executor_utils.cpp | 3 +-- 10 files changed, 15 insertions(+), 40 deletions(-) diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index f86b624b84f7c..b292d488708bf 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -927,7 +926,7 @@ void __inline__ initializeCudaContext() { AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx)); if (!pctx) { std::unique_lock cudaFreeMutexLock( - *(c10::cuda::CUDACachingAllocator::getFreeMutex())); + *(c10::cuda::getFreeMutex())); cudaFree(nullptr); } } diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 09648f971e202..8406b7eb5c965 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1894,9 +1894,6 @@ class NativeCachingAllocator { // allocated blocks by device pointer ska::flat_hash_map allocated_blocks; - // lock around calls to cudaFree (to prevent deadlocks with NCCL) - mutable std::mutex cuda_free_mutex; - void add_allocated_block(Block* block) { std::lock_guard lock(mutex); 
allocated_blocks[block->ptr] = block; @@ -1905,10 +1902,6 @@ class NativeCachingAllocator { public: std::vector> device_allocator; - std::mutex* getCudaFreeMutex() const { - return &cuda_free_mutex; - } - Block* get_allocated_block(void* ptr, bool remove = false) { std::lock_guard lock(mutex); auto it = allocated_blocks.find(ptr); @@ -2156,10 +2149,6 @@ void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { caching_allocator.recordStream(ptr, stream); } -std::mutex* getFreeMutex() { - return caching_allocator.getCudaFreeMutex(); -} - static inline void assertValidDevice(int device) { const auto device_num = caching_allocator.device_allocator.size(); TORCH_CHECK( diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index acc0d403808f6..9665e4e40aeef 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -211,7 +211,6 @@ using OutOfMemoryObserver = std::function, getIpcDevPtr, (std::string handle)) \ _(C10_CUDA_API void, \ recordHistory, \ @@ -324,11 +323,6 @@ inline void notifyCaptureEnded(int device, CaptureId_t graph_id) { inline void notifyCaptureDestroy(int device, MempoolId_t mempool_id) { return Chosen::notifyCaptureDestroy(device, mempool_id); } - -inline std::mutex* getFreeMutex() { - return Chosen::getFreeMutex(); -} - // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE inline std::shared_ptr getIpcDevPtr(std::string handle) { return Chosen::getIpcDevPtr(handle); diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index 52aa32925ab1e..51bc5fd94b890 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -77,11 +77,6 @@ std::vector dummy_unifying_free_streams; // Keeping it simple with an ordinary mutex for now. std::mutex general_mutex; -// Copy-paste CUDACachingAllocator's -// lock around calls to cudaFree (to prevent deadlocks with NCCL) -// is this safe? 
-std::mutex cuda_free_mutex; - /** * Note [Avoid freeing uncaptured ptrs during CUDA graph capture] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -602,10 +597,6 @@ void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { } } -std::mutex* getFreeMutex() { - return &cuda_free_mutex; -} - std::shared_ptr getIpcDevPtr(std::string handle) { TORCH_CHECK( false, @@ -899,9 +890,6 @@ void notifyCaptureEnded(int device, CaptureId_t graph_id) { void notifyCaptureDestroy(int device, MempoolId_t mempool_id) { NOT_AVAILABLE("notifyCaptureDestroy"); } -std::mutex* getFreeMutex() { - NOT_AVAILABLE("getFreeMutex"); -} std::shared_ptr getIpcDevPtr(std::string handle) { NOT_AVAILABLE("getIpcDevPtr"); } diff --git a/c10/cuda/CUDAMiscFunctions.cpp b/c10/cuda/CUDAMiscFunctions.cpp index 7655ca8c6a600..c9e6de12dca36 100644 --- a/c10/cuda/CUDAMiscFunctions.cpp +++ b/c10/cuda/CUDAMiscFunctions.cpp @@ -16,5 +16,10 @@ const char* get_cuda_check_suffix() noexcept { "\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1."; } } +std::mutex* getFreeMutex() { + static std::mutex cuda_free_mutex; + return &cuda_free_mutex; +} + } // namespace cuda } // namespace c10 diff --git a/c10/cuda/CUDAMiscFunctions.h b/c10/cuda/CUDAMiscFunctions.h index eca8fd042f615..e911a8a4e3f78 100644 --- a/c10/cuda/CUDAMiscFunctions.h +++ b/c10/cuda/CUDAMiscFunctions.h @@ -4,8 +4,11 @@ #include +#include + namespace c10 { namespace cuda { C10_CUDA_API const char* get_cuda_check_suffix() noexcept; -} +C10_CUDA_API std::mutex* getFreeMutex(); +} // namespace cuda } // namespace c10 diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 2cd370e31d2eb..70725bb483d06 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #ifdef USE_NCCL @@ -407,7 +406,7 @@ PyObject* THCPModule_cudaSleep(PyObject* _unused, PyObject* cycles) { static PyGILState_STATE cudaMutexGILState; PyObject* 
THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) { - auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex(); + auto mutex = c10::cuda::getFreeMutex(); // This has to be a busy loop because we **absolutely need to** hold the GIL // or it's a recipe for a deadlock otherwise (if we let other Python threads // run while we have the cudaMutex, but not the GIL, they might try to e.g. @@ -427,7 +426,7 @@ PyObject* THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) { } PyObject* THCPModule_cudaUnlockMutex(PyObject* module, PyObject* noargs) { - auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex(); + auto mutex = c10::cuda::getFreeMutex(); PyGILState_Release(cudaMutexGILState); mutex->unlock(); Py_RETURN_NONE; diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 83729084aeb1d..a1d96f7e5d6cd 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -297,7 +297,7 @@ void check_inputs( } // namespace detail AutoNcclGroup::AutoNcclGroup() { - (c10::cuda::CUDACachingAllocator::getFreeMutex())->lock(); + (c10::cuda::getFreeMutex())->lock(); #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) detail::NCCL_CHECK(ncclGroupStart()); #endif @@ -307,7 +307,7 @@ AutoNcclGroup::~AutoNcclGroup() noexcept(false) { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) detail::NCCL_CHECK(ncclGroupEnd()); #endif - (c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock(); + (c10::cuda::getFreeMutex())->unlock(); } bool is_available(TensorList tensors) { diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 35bf3ad255e91..f9f4fa8b1353b 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -2,7 +2,6 @@ #include #include -#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 99d573a39f396..db9764eb3059e 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -2,7 +2,6 @@ 
#include #include -#include #include #include @@ -941,7 +940,7 @@ void initializeCudaContext() { AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx)); if (!pctx) { std::unique_lock cudaFreeMutexLock( - *(c10::cuda::CUDACachingAllocator::getFreeMutex())); + *(c10::cuda::getFreeMutex())); cudaFree(nullptr); } }