[ROCm] Support for building DeepRec on ROCm2.10.0. (#302)
simple86 authored Jul 15, 2022
1 parent cf3a3b1, commit 1370df7
Showing 19 changed files with 99 additions and 61 deletions.
tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ using nvinfer1::PluginFormat;

template <typename SrcT, typename DstT>
__global__ void Cast(const SrcT* input, int num_elements, DstT* output) {
-  for (int i : CudaGridRangeX(num_elements)) {
+  for (int i : GpuGridRangeX(num_elements)) {
output[i] = static_cast<DstT>(input[i]);
}
}
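Note: the only change in this kernel is the helper's name. CudaGridRangeX becomes GpuGridRangeX, the vendor-neutral spelling used by TensorFlow's GPU kernel helpers, so the same source compiles under both CUDA and ROCm/HIP. As a rough standalone sketch (this CastSketch kernel is illustrative, not part of the commit; the real helper lives in tensorflow/core/util/gpu_device_functions.h), the range amounts to the familiar grid-stride loop:

// Simplified stand-in for GpuGridRangeX: each thread starts at its global
// index and strides by the total number of threads in the grid.
template <typename SrcT, typename DstT>
__global__ void CastSketch(const SrcT* input, int num_elements, DstT* output) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements;
       i += gridDim.x * blockDim.x) {
    output[i] = static_cast<DstT>(input[i]);
  }
}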
tensorflow/core/framework/resource_mgr.cc (2 changes: 1 addition & 1 deletion)
@@ -178,7 +178,7 @@ Status ResourceMgr::DoCreate(const string& container, TypeIndex type,
// key can contain a StringPiece that borrows from the string in the value.
ResourceAndName resource_and_name(resource, name);
StringPiece borrowed_name(*resource_and_name.name);
-  Container::value_type key_and_value(Key(type.hash_code(), borrowed_name),
+  Container::value_type key_and_value(Key(type.hash_code(), std::string(borrowed_name.data(), borrowed_name.size())),
std::move(resource_and_name));

if ((*b)->insert(std::move(key_and_value)).second) {
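Note: the key previously captured borrowed_name, a StringPiece that merely borrows bytes from the string inside the value being moved into the map. Building the key from std::string(borrowed_name.data(), borrowed_name.size()) copies those bytes, making ownership explicit; plausibly it also sidesteps a missing implicit StringPiece-to-string conversion under the ROCm build's toolchain (the commit does not say which motivated it). A minimal sketch of the pattern, with hypothetical names and std::string_view standing in for StringPiece:

#include <string>
#include <string_view>
#include <utility>

// A view does not own its bytes; a map key that must outlive the viewed
// string has to copy them. The (data, size) constructor makes the copy
// explicit and compiles even where no implicit conversion exists.
std::pair<std::string, int> MakeKeyAndValue(std::string_view borrowed_name,
                                            int value) {
  return {std::string(borrowed_name.data(), borrowed_name.size()), value};
}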
tensorflow/core/kernels/bias_grad_ali_op_cpu.h (6 changes: 6 additions & 0 deletions)
@@ -195,6 +195,7 @@ void TwoColumnWiseReduction(const float* A, int m, int n, int lda, float* Y,

void MultipleColumnWiseReduction(const float* A, int m, int n, int lda,
float* Y, bool overwrite = true) {
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
if (n >= block_size_avx512) {
int block_num = n / block_size_avx512;
@@ -241,6 +242,7 @@ void MultipleColumnWiseReduction(const float* A, int m, int n, int lda,
}
return;
}
#endif
+#endif
// Normal cases.
if (overwrite) {
@@ -270,6 +272,7 @@ void SumIntoOneRow(const float* A, int m, int n, int lda, float* Y,
}
}

+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
void ColumnParallel_512(const CPUDevice& d, float* input_data, float* output_data,
int sum_size, int channel) {
@@ -355,6 +358,7 @@ void ColumnParallel_256(const CPUDevice& d, float* input_data, float* output_dat
_mm256_mask_storeu_ps(output_data + offset_block_end, mask, sum);
}
#endif
+#endif

template <typename T>
void BiasGrad2DInternal(const CPUDevice& d, typename TTypes<T>::ConstFlat input,
@@ -453,6 +457,7 @@ struct BiasGrad2D<CPUDevice, float> {
Eigen::DSizes<int, 2>& two_dims,
typename TTypes<float>::Flat output) {
auto thread_num = d.numThreads();
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
if (two_dims[1] >= block_size_avx512 * thread_num) {
ColumnParallel_512(d, (float*)(input.data()), output.data(),
@@ -465,6 +470,7 @@ struct BiasGrad2D<CPUDevice, float> {
two_dims[0], two_dims[1]);
return;
}
#endif
+#endif
BiasGrad2DInternal<float>(d, input, two_dims, output);
}
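Note: each AVX-512 block in this file, previously guarded only by #ifdef __AVX512F__, is now additionally wrapped in #if defined(__GNUC__) && (__GNUC__ >6). A plausible reading (the commit message does not spell it out) is that older GCC releases can be handed -mavx512f yet lack some of the mask intrinsics this file uses, such as _mm256_mask_storeu_ps from AVX-512VL, so the version check keeps those builds on the scalar path. A minimal sketch of the combined guard, under that assumption:

#include <immintrin.h>

// Sum 16 floats: AVX-512 fast path only when the compiler is GCC newer
// than 6 AND the target enables AVX-512F; otherwise a portable fallback.
float Sum16(const float* a) {
#if defined(__GNUC__) && (__GNUC__ > 6) && defined(__AVX512F__)
  __m512 v = _mm512_loadu_ps(a);   // one 512-bit unaligned load
  return _mm512_reduce_add_ps(v);  // horizontal reduction
#else
  float s = 0.0f;
  for (int i = 0; i < 16; ++i) s += a[i];
  return s;
#endif
}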
tensorflow/core/kernels/cwise_op_gpu_select.cu.cc (10 changes: 5 additions & 5 deletions)
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

@@ -111,7 +111,7 @@ struct BatchSelectFunctor<GPUDevice, T> {
template <typename T>
__global__ void Select4ElementThenScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t num, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  GPU_1D_KERNEL_LOOP(i, num) {
if (c[i]) {
o[i] = t[0];
} else {
@@ -123,7 +123,7 @@ __global__ void Select4ElementThenScalarFunctorKernel(
template <typename T>
__global__ void Select4ElementElseScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t num, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  GPU_1D_KERNEL_LOOP(i, num) {
if (c[i]) {
o[i] = t[i];
} else {
@@ -174,7 +174,7 @@ struct Select4ElementScalarFunctor<GPUDevice, T> {
template <typename T>
__global__ void BatchSelect4BroadcastingThenScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t batch, size_t batch_size, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, batch * batch_size) {
+  GPU_1D_KERNEL_LOOP(i, batch * batch_size) {
size_t offset = i / batch_size;
if (c[offset]) {
o[i] = t[0];
@@ -187,7 +187,7 @@ __global__ void BatchSelect4BroadcastingThenScalarFunctorKernel(
template <typename T>
__global__ void BatchSelect4BroadcastingElseScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t batch, size_t batch_size, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, batch * batch_size) {
+  GPU_1D_KERNEL_LOOP(i, batch * batch_size) {
size_t offset = i / batch_size;
if (c[offset]) {
o[i] = t[i];
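Note: two things happen in this file. Every CUDA_1D_KERNEL_LOOP becomes the platform-neutral GPU_1D_KERNEL_LOOP, and the guard at the top changes to #if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM, which comments the ROCm half out: these select kernels remain CUDA-only under ROCm 2.10.0 rather than being ported. As an illustrative sketch (the _SKETCH name is hypothetical; the real macro is in tensorflow/core/util/gpu_kernel_helper.h), the loop macro is just the grid-stride pattern packaged so kernel bodies read like ordinary for-loops:

// Simplified form of a 1-D grid-stride kernel-loop macro.
#define GPU_1D_KERNEL_LOOP_SKETCH(i, n)                         \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
       i += blockDim.x * gridDim.x)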
tensorflow/core/kernels/cwise_op_select.cc (8 changes: 6 additions & 2 deletions)
@@ -15,7 +15,7 @@ limitations under the License.

#define EIGEN_USE_THREADS

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -375,7 +375,7 @@ class SelectV2Op : public OpKernel {

TF_CALL_ALL_TYPES(REGISTER_SELECT);

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

// Registration of the GPU implementations.
#define REGISTER_SELECT_GPU(type) \
@@ -524,6 +524,7 @@ struct SelectFunctorBase<Device, float> {
typename TTypes<bool>::ConstFlat cond_flat,
typename TTypes<float>::ConstFlat then_flat,
typename TTypes<float>::ConstFlat else_flat) {
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
const size_t num = cond_flat.size();
const bool* c = cond_flat.data();
@@ -557,6 +558,7 @@ struct SelectFunctorBase<Device, float> {
Assign(d, out, out);
#else
Assign(d, out, cond_flat.select(then_flat, else_flat));
#endif
+#endif
}
};
@@ -834,6 +836,7 @@ struct BatchSelectFunctor<CPUDevice, float> {
const float* t = then_flat_outer_dims.data();
const float* e = else_flat_outer_dims.data();

+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
size_t quotient = batch_size / float_alignment;
int remainder = batch_size - (quotient * float_alignment);
@@ -888,6 +891,7 @@ struct BatchSelectFunctor<CPUDevice, float> {
}
}
};
#endif
+#endif
auto cost = Eigen::TensorOpCost(sizeof(float) * batch_size * 2, // ld bytes
sizeof(float) * batch_size, // st bytes
tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc (4 changes: 4 additions & 0 deletions)
@@ -307,7 +307,11 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
c, status,
errors::Internal("Failed to launch copy from device to host."), done);

+#if GOOGLE_CUDA
cudaDeviceSynchronize();
+#elif TENSORFLOW_USE_ROCM
+hipDeviceSynchronize();
+#endif

OpOutputList outputs;
this->AllocateOutputs(c, &data, &partitions, &cpu_tensor, &outputs, done);
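Note: the bare cudaDeviceSynchronize() would not compile under the ROCm toolchain, so the call is now dispatched per platform; hipDeviceSynchronize() is HIP's direct counterpart. A minimal sketch of the same dispatch factored into a helper (the helper name is hypothetical, and the usual GOOGLE_CUDA / TENSORFLOW_USE_ROCM build macros are assumed):

// Block the host until all previously launched GPU work has finished,
// on whichever platform this translation unit was compiled for.
inline void GpuDeviceSynchronizeSketch() {
#if GOOGLE_CUDA
  cudaDeviceSynchronize();
#elif TENSORFLOW_USE_ROCM
  hipDeviceSynchronize();
#endif
}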
tensorflow/core/kernels/dynamic_stitch_op.cc (4 changes: 2 additions & 2 deletions)
@@ -189,7 +189,7 @@ class DynamicStitchOpImplBase : public OpKernel {
}
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

template <typename T>
void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device,
@@ -570,7 +570,7 @@ TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
TF_CALL_QUANTIZED_TYPES(REGISTER_DYNAMIC_STITCH);
#undef REGISTER_DYNAMIC_STITCH

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_DYNAMIC_STITCH_GPU(type) \
REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
.Device(DEVICE_GPU) \
tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc (14 changes: 7 additions & 7 deletions)
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

@@ -52,7 +52,7 @@ __global__ void DynamicStitchKernelV2(const int32 slice_size,
const int32* input_indices,
T** input_ptrs,
T* output) {
-  CUDA_1D_KERNEL_LOOP(output_index, output_size) {
+  GPU_1D_KERNEL_LOOP(output_index, output_size) {
const int32 slice_id = output_index / slice_size;
const int32 slice_offset = output_index % slice_size;
const int32 input_index = input_indices[slice_id];
@@ -65,7 +65,7 @@ __global__ void InitializeIndicesFlatWork(int32* indices_flat_work,
__global__ void InitializeIndicesFlatWork(int32* indices_flat_work,
const int32 flat_work_size,
const int32 val) {
-  CUDA_1D_KERNEL_LOOP(output_index, flat_work_size) {
+  GPU_1D_KERNEL_LOOP(output_index, flat_work_size) {
indices_flat_work[output_index] = val;
}
}
@@ -80,7 +80,7 @@ __global__ void DynamicStitchPrepKernel(const int32* indices_flat,
const int32 slice_size,
const int32 output_size) {

-  CUDA_1D_KERNEL_LOOP(output_index, output_size) {
+  GPU_1D_KERNEL_LOOP(output_index, output_size) {
// for indices
indices_flat_work[indices_flat[output_index]] = output_index;
// find the partition id
@@ -123,7 +123,7 @@ void DynamicStitchGPUImplV2(const Eigen::GpuDevice& gpu_device,
Tensor* input_ptrs,
T* output) {
const int32 output_size = first_dim_size * slice_size;
-  auto config = GetCudaLaunchConfig(output_size, gpu_device);
+  auto config = GetGpuLaunchConfig(output_size, gpu_device);

DynamicStitchKernelV2<T>
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
@@ -146,13 +146,13 @@ void DynamicStitchGPUPrep(const Eigen::GpuDevice& gpu_device,
const int32 first_dim_size) {

// initialize indices_flat_work by -1
-  auto config = GetCudaLaunchConfig(first_dim_size, gpu_device);
+  auto config = GetGpuLaunchConfig(first_dim_size, gpu_device);
InitializeIndicesFlatWork
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
indices_flat_work->flat<int32>().data(),
first_dim_size, -1);

-  config = GetCudaLaunchConfig(data_elements_size, gpu_device);
+  config = GetGpuLaunchConfig(data_elements_size, gpu_device);
DynamicStitchPrepKernel<T>
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
indices_flat->flat<int32>().data(),
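Note: alongside the kernel-loop rename, the launch-configuration helper changes from GetCudaLaunchConfig to GetGpuLaunchConfig; both take a 1-D element count plus the device and return a block count and threads-per-block. A hand-rolled sketch of what such a config amounts to (the real helper in tensorflow/core/util/gpu_kernel_helper.h also caps the grid using device occupancy; this simplified version, with hypothetical names, only rounds up to whole blocks):

struct LaunchConfigSketch {
  int block_count;       // number of thread blocks to launch
  int thread_per_block;  // threads in each block
};

LaunchConfigSketch GetLaunchConfigSketch(int work_element_count) {
  const int kThreads = 256;  // a common default block size
  return {(work_element_count + kThreads - 1) / kThreads, kThreads};
}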
tensorflow/core/kernels/reshape_op.cc (2 changes: 1 addition & 1 deletion)
@@ -86,7 +86,7 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL

-#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) //|| \
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
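Note: this small edit disables more than it appears to. Backslash-newline splicing runs before comments are stripped, so the trailing \ inside the new // comment pulls the following line, the whole TENSORFLOW_USE_ROCM test, into the comment as well. The directive the preprocessor actually evaluates is therefore:

// What the compiler sees after line splicing and comment removal:
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
// ...the special GPU int32 Reshape kernel is registered here, CUDA only...
#endif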
tensorflow/core/kernels/segment_reduction_ops.cc (6 changes: 3 additions & 3 deletions)
@@ -203,7 +203,7 @@ class SegmentReductionOp : public OpKernel {
}
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
// SegmentSumGPUOp is a segment sum operator implemented for GPU only.
// TODO: This implementation of SegmentSumGPUOp is sometimes slower than
// its unsorted counterpart (mostly when problem size is small).
@@ -352,7 +352,7 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128);
#undef REGISTER_REAL_CPU_KERNELS_ALL
#undef REGISTER_COMPLEX_CPU_KERNELS_ALL

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \
REGISTER_KERNEL_BUILDER(Name("SegmentSum") \
.Device(DEVICE_GPU) \
@@ -637,7 +637,7 @@ REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT( \
name, type, index_type, initial_value_functor, reduction_kernel_functor) \
REGISTER_KERNEL_BUILDER( \
tensorflow/core/kernels/segment_reduction_ops.h (6 changes: 3 additions & 3 deletions)
@@ -34,7 +34,7 @@ class OpKernelContext;

namespace functor {

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
typedef Eigen::GpuDevice GPUDevice;
// Functor for SegmentSumGPUOp.
// output_rows: the number of output segments (unique segment ids in
@@ -88,7 +88,7 @@ struct SetValueDefault {
Tensor* target,
T default_value);
};
-#endif
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T, typename Index, typename InitialValueF,
typename ReductionF>
@@ -99,7 +99,7 @@ struct UnsortedSegmentFunctor {
typename TTypes<T, 2>::Tensor output);
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
// reduction functors for the gpu
template <typename T>
struct SumOpGpu {
(The remaining 8 of the 19 changed files are not shown here.)
