Skip to content

Commit

Permalink
fix copy_from
Browse files Browse the repository at this point in the history
  • Loading branch information
lzhangzz committed Nov 27, 2024
1 parent 3d4f22f commit 329e441
Showing 1 changed file with 46 additions and 12 deletions.
58 changes: 46 additions & 12 deletions src/turbomind/python/bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj)
return dlmt->dl_tensor;
}

// Copy `size` bytes from `src` to `dst`, selecting a copy strategy that is
// valid for wherever the two pointers actually live. Needed because TM's
// tensor memory is allocated from a pool without peer access enabled, so a
// plain cudaMemcpy from another device's context can fail.
//
// Throws (via ft::check_cuda_error) on any CUDA failure, after logging the
// pointer attributes of both operands for post-mortem triage.
static void safe_memcpy(void* dst, const void* src, size_t size)
{
    // Query the memory space of each pointer up front; the attributes are
    // also reused by the error report below.
    cudaPointerAttributes dat{};
    cudaPointerAttributes sat{};
    ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst));
    ft::check_cuda_error(cudaPointerGetAttributes(&sat, src));
    try {
        if (dat.devicePointer && sat.devicePointer) {
            // Both pointers are accessible from the current context
            ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
        }
        else if (dat.type == cudaMemoryTypeDevice && sat.type == cudaMemoryTypeDevice) {
            if (dat.device != sat.device) {
                // Device memory on different devices, try peer memcpy
                ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size));
            }
            else {
                // Same device but not mapped into the current context:
                // switch to that device first (this is unlikely)
                ft::CudaDeviceGuard guard(dat.device);
                ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
            }
        }
        else {
            // Unknown combination (e.g. unregistered host memory), give it a
            // try anyway and let cudaMemcpy sort it out
            ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
        }
    }
    catch (...) {
        int device_id{-1};
        cudaGetDevice(&device_id);  // best effort: device_id stays -1 if the query itself fails
        // NOTE: the failing call may be cudaMemcpyPeer rather than cudaMemcpy,
        // so report a path-neutral message (the old "cudaMemcpy failed" text
        // was misleading for the peer-copy branch).
        TM_LOG_ERROR("safe_memcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d",
                     (int)dat.type,
                     dat.device,
                     dat.devicePointer,
                     dat.hostPointer,
                     (int)sat.type,
                     sat.device,
                     sat.devicePointer,
                     sat.hostPointer,
                     std::to_string(size).c_str(),
                     device_id);
        throw;
    }
}

PYBIND11_MODULE(_turbomind, m)
{
// nccl param
Expand Down Expand Up @@ -293,18 +338,7 @@ PYBIND11_MODULE(_turbomind, m)
std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies<int64_t>());
auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8;
ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]);
cudaPointerAttributes at{};
ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data));
{
// Switch to the same device where TM's tenosr memory resides because it's allocated
// from a pool with no peer access enabled (can't be accessed from a context of other
// devices)
ft::CudaDeviceGuard guard{at.device};
ft::check_cuda_error(cudaMemcpy(const_cast<void*>(self->data),
const_cast<void*>(src->data),
num_bytes,
cudaMemcpyDefault));
}
safe_memcpy(const_cast<void*>(self->data), src->data, num_bytes);
break;
}
default:
Expand Down

0 comments on commit 329e441

Please sign in to comment.