Skip to content

Commit

Permalink
hip/cuda allocators: short-circuit copy construction
Browse files Browse the repository at this point in the history
Use the hip/cuda memory management API when copy-constructing POD arrays of
the same type, rather than allocating, moving, and copy-constructing.
  • Loading branch information
burlen committed Aug 24, 2023
1 parent 8a9cb07 commit c96416f
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 126 deletions.
100 changes: 59 additions & 41 deletions hamr_cuda_malloc_allocator_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,61 +521,79 @@ cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::
return nullptr;
}

// move the existing array to the GPU
U *tmp = nullptr;
if (!cudaVals)
if (std::is_same<T,U>::value)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
// if the source and dest are the same type, and both are POD, we can
// short circuit the copy constructor and directly copy the data
cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
if ((ierr = cudaMemcpy(ptr, vals, n_bytes, dir)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to cudaMemcpy a " << (cudaVals ? "device" : "host")
<< " array of " << n_elem << " of " << typeid(T).name()
<< " total " << n_bytes << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}
}
else
{
// move the existing array to the GPU
U *tmp = nullptr;
if (!cudaVals)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to determine launch properties. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to determine launch properties. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

// construct
cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
if ((ierr = cudaGetLastError()) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}
// construct
cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
if ((ierr = cudaGetLastError()) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

// free up temporary buffers
if (!cudaVals)
{
cudaFree(tmp);
// free up temporary buffers
if (!cudaVals)
{
cudaFree(tmp);
}
}

#if defined(HAMR_VERBOSE)
Expand Down
102 changes: 60 additions & 42 deletions hamr_cuda_malloc_async_allocator_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,62 +501,80 @@ cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::va
return nullptr;
}

// move the existing array to the GPU
U *tmp = nullptr;
if (!cudaVals)
if (std::is_same<T,U>::value)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
// if the source and dest are the same type, and both are POD, we can
// short circuit the copy constructor and directly copy the data
cudaMemcpyKind dir = cudaVals ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
if ((ierr = cudaMemcpyAsync(ptr, vals, n_bytes, dir, str)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to cudaMemcpy a " << (cudaVals ? "device" : "host")
<< " array of " << n_elem << " of " << typeid(T).name()
<< " total " << n_bytes << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}
}
else
{
// move the existing array to the GPU
U *tmp = nullptr;
if (!cudaVals)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
cudaMemcpyHostToDevice, str)) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
cudaMemcpyHostToDevice, str)) != cudaSuccess)
// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to cudaMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to determine launch properties. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to determine launch properties. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

// construct
cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
if ((ierr = cudaGetLastError()) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}
// construct
cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
if ((ierr = cudaGetLastError()) != cudaSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< cudaGetErrorString(ierr) << std::endl;
return nullptr;
}

// free up temporary buffers
if (!cudaVals)
{
cudaFreeAsync(tmp, str);
// free up temporary buffers
if (!cudaVals)
{
cudaFreeAsync(tmp, str);
}
}

#if defined(HAMR_VERBOSE)
Expand Down
104 changes: 61 additions & 43 deletions hamr_hip_malloc_allocator_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,63 +494,81 @@ hip_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::t
return nullptr;
}

// move the existing array to the GPU
U *tmp = nullptr;
if (!hipVals)
if (std::is_same<T,U>::value)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess)
// if the source and dest are the same type, and both are POD, we can
// short circuit the copy constructor and directly copy the data
hipMemcpyKind dir = hipVals ? hipMemcpyDeviceToDevice : hipMemcpyHostToDevice;
if ((ierr = hipMemcpy(ptr, vals, n_bytes, dir)) != hipSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to hipMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to hipMemcpy a " << (hipVals ? "device" : "host")
<< " array of " << n_elem << " of " << typeid(T).name()
<< " total " << n_bytes << "bytes. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}
}
else
{
// move the existing array to the GPU
U *tmp = nullptr;
if (!hipVals)
{
size_t n_bytes_vals = n_elem*sizeof(U);

if ((ierr = hipMalloc(&tmp, n_bytes_vals)) != hipSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to hipMalloc " << n_elem << " of "
<< typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}

if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals,
hipMemcpyHostToDevice)) != hipSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to hipMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

if ((ierr = hipMemcpy(tmp, vals, n_bytes_vals,
hipMemcpyHostToDevice)) != hipSuccess)
// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to hipMemcpy array of " << n_elem
<< " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
" Failed to determine launch properties. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}

vals = tmp;
}

// get launch parameters
int device_id = -1;
dim3 block_grid;
int n_blocks = 0;
dim3 thread_grid = 0;
if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
n_blocks, thread_grid))
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to determine launch properties. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}

// construct
hip_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
if ((ierr = hipGetLastError()) != hipSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}
// construct
hip_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
if ((ierr = hipGetLastError()) != hipSuccess)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the construct kernel. "
<< hipGetErrorString(ierr) << std::endl;
return nullptr;
}

// free up temporary buffers
if (!hipVals)
{
ierr = hipFree(tmp);
(void) ierr;
// free up temporary buffers
if (!hipVals)
{
ierr = hipFree(tmp);
(void) ierr;
}
}

#if defined(HAMR_VERBOSE)
Expand Down

0 comments on commit c96416f

Please sign in to comment.