[ROCm] Support for building DeepRec on ROCm2.10.0. (#302)
simple86 authored Jul 15, 2022
1 parent cf3a3b1, commit 1370df7
Showing 19 changed files with 99 additions and 61 deletions.
tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ using nvinfer1::PluginFormat;

template <typename SrcT, typename DstT>
__global__ void Cast(const SrcT* input, int num_elements, DstT* output) {
-  for (int i : CudaGridRangeX(num_elements)) {
+  for (int i : GpuGridRangeX(num_elements)) {
output[i] = static_cast<DstT>(input[i]);
}
}
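Note: the only change in this kernel is the helper's name. CudaGridRangeX becomes GpuGridRangeX, the vendor-neutral spelling used by TensorFlow's GPU kernel helpers, so the same source compiles under both CUDA and ROCm/HIP. As a rough standalone sketch (this CastSketch kernel is illustrative, not part of the commit; the real helper lives in tensorflow/core/util/gpu_device_functions.h), the range amounts to the familiar grid-stride loop:

// Simplified stand-in for GpuGridRangeX: each thread starts at its global
// index and strides by the total number of threads in the grid.
template <typename SrcT, typename DstT>
__global__ void CastSketch(const SrcT* input, int num_elements, DstT* output) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements;
       i += gridDim.x * blockDim.x) {
    output[i] = static_cast<DstT>(input[i]);
  }
}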
tensorflow/core/framework/resource_mgr.cc (2 changes: 1 addition & 1 deletion)
@@ -178,7 +178,7 @@ Status ResourceMgr::DoCreate(const string& container, TypeIndex type,
// key can contain a StringPiece that borrows from the string in the value.
ResourceAndName resource_and_name(resource, name);
StringPiece borrowed_name(*resource_and_name.name);
-  Container::value_type key_and_value(Key(type.hash_code(), borrowed_name),
+  Container::value_type key_and_value(Key(type.hash_code(), std::string(borrowed_name.data(), borrowed_name.size())),
std::move(resource_and_name));

if ((*b)->insert(std::move(key_and_value)).second) {
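Note: the key previously captured borrowed_name, a StringPiece that merely borrows bytes from the string inside the value being moved into the map. Building the key from std::string(borrowed_name.data(), borrowed_name.size()) copies those bytes, making ownership explicit; plausibly it also sidesteps a missing implicit StringPiece-to-string conversion under the ROCm build's toolchain (the commit does not say which motivated it). A minimal sketch of the pattern, with hypothetical names and std::string_view standing in for StringPiece:

#include <string>
#include <string_view>
#include <utility>

// A view does not own its bytes; a map key that must outlive the viewed
// string has to copy them. The (data, size) constructor makes the copy
// explicit and compiles even where no implicit conversion exists.
std::pair<std::string, int> MakeKeyAndValue(std::string_view borrowed_name,
                                            int value) {
  return {std::string(borrowed_name.data(), borrowed_name.size()), value};
}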
tensorflow/core/kernels/bias_grad_ali_op_cpu.h (6 changes: 6 additions & 0 deletions)
@@ -195,6 +195,7 @@ void TwoColumnWiseReduction(const float* A, int m, int n, int lda, float* Y,

void MultipleColumnWiseReduction(const float* A, int m, int n, int lda,
float* Y, bool overwrite = true) {
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
if (n >= block_size_avx512) {
int block_num = n / block_size_avx512;
@@ -241,6 +242,7 @@ void MultipleColumnWiseReduction(const float* A, int m, int n, int lda,
}
return;
}
#endif
+#endif
// Normal cases.
if (overwrite) {
@@ -270,6 +272,7 @@ void SumIntoOneRow(const float* A, int m, int n, int lda, float* Y,
}
}

+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
void ColumnParallel_512(const CPUDevice& d, float* input_data, float* output_data,
int sum_size, int channel) {
@@ -355,6 +358,7 @@ void ColumnParallel_256(const CPUDevice& d, float* input_data, float* output_dat
_mm256_mask_storeu_ps(output_data + offset_block_end, mask, sum);
}
#endif
+#endif

template <typename T>
void BiasGrad2DInternal(const CPUDevice& d, typename TTypes<T>::ConstFlat input,
@@ -453,6 +457,7 @@ struct BiasGrad2D<CPUDevice, float> {
Eigen::DSizes<int, 2>& two_dims,
typename TTypes<float>::Flat output) {
auto thread_num = d.numThreads();
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
if (two_dims[1] >= block_size_avx512 * thread_num) {
ColumnParallel_512(d, (float*)(input.data()), output.data(),
@@ -465,6 +470,7 @@ struct BiasGrad2D<CPUDevice, float> {
two_dims[0], two_dims[1]);
return;
}
#endif
+#endif
BiasGrad2DInternal<float>(d, input, two_dims, output);
}
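Note: each AVX-512 block in this file, previously guarded only by #ifdef __AVX512F__, is now additionally wrapped in #if defined(__GNUC__) && (__GNUC__ >6). A plausible reading (the commit message does not spell it out) is that older GCC releases can be handed -mavx512f yet lack some of the mask intrinsics this file uses, such as _mm256_mask_storeu_ps from AVX-512VL, so the version check keeps those builds on the scalar path. A minimal sketch of the combined guard, under that assumption:

#include <immintrin.h>

// Sum 16 floats: AVX-512 fast path only when the compiler is GCC newer
// than 6 AND the target enables AVX-512F; otherwise a portable fallback.
float Sum16(const float* a) {
#if defined(__GNUC__) && (__GNUC__ > 6) && defined(__AVX512F__)
  __m512 v = _mm512_loadu_ps(a);   // one 512-bit unaligned load
  return _mm512_reduce_add_ps(v);  // horizontal reduction
#else
  float s = 0.0f;
  for (int i = 0; i < 16; ++i) s += a[i];
  return s;
#endif
}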
tensorflow/core/kernels/cwise_op_gpu_select.cu.cc (10 changes: 5 additions & 5 deletions)
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

@@ -111,7 +111,7 @@ struct BatchSelectFunctor<GPUDevice, T> {
template <typename T>
__global__ void Select4ElementThenScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t num, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  GPU_1D_KERNEL_LOOP(i, num) {
if (c[i]) {
o[i] = t[0];
} else {
@@ -123,7 +123,7 @@ __global__ void Select4ElementThenScalarFunctorKernel(
template <typename T>
__global__ void Select4ElementElseScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t num, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  GPU_1D_KERNEL_LOOP(i, num) {
if (c[i]) {
o[i] = t[i];
} else {
@@ -174,7 +174,7 @@ struct Select4ElementScalarFunctor<GPUDevice, T> {
template <typename T>
__global__ void BatchSelect4BroadcastingThenScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t batch, size_t batch_size, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, batch * batch_size) {
+  GPU_1D_KERNEL_LOOP(i, batch * batch_size) {
size_t offset = i / batch_size;
if (c[offset]) {
o[i] = t[0];
@@ -187,7 +187,7 @@ __global__ void BatchSelect4BroadcastingThenScalarFunctorKernel(
template <typename T>
__global__ void BatchSelect4BroadcastingElseScalarFunctorKernel(
const bool *c, const T *t, const T *e, size_t batch, size_t batch_size, T *o) {
-  CUDA_1D_KERNEL_LOOP(i, batch * batch_size) {
+  GPU_1D_KERNEL_LOOP(i, batch * batch_size) {
size_t offset = i / batch_size;
if (c[offset]) {
o[i] = t[i];
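Note: two things happen in this file. Every CUDA_1D_KERNEL_LOOP becomes the platform-neutral GPU_1D_KERNEL_LOOP, and the guard at the top changes to #if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM, which comments the ROCm half out: these select kernels remain CUDA-only under ROCm 2.10.0 rather than being ported. As an illustrative sketch (the _SKETCH name is hypothetical; the real macro is in tensorflow/core/util/gpu_kernel_helper.h), the loop macro is just the grid-stride pattern packaged so kernel bodies read like ordinary for-loops:

// Simplified form of a 1-D grid-stride kernel-loop macro.
#define GPU_1D_KERNEL_LOOP_SKETCH(i, n)                         \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
       i += blockDim.x * gridDim.x)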
tensorflow/core/kernels/cwise_op_select.cc (8 changes: 6 additions & 2 deletions)
@@ -15,7 +15,7 @@ limitations under the License.

#define EIGEN_USE_THREADS

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -375,7 +375,7 @@ class SelectV2Op : public OpKernel {

TF_CALL_ALL_TYPES(REGISTER_SELECT);

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

// Registration of the GPU implementations.
#define REGISTER_SELECT_GPU(type) \
@@ -524,6 +524,7 @@ struct SelectFunctorBase<Device, float> {
typename TTypes<bool>::ConstFlat cond_flat,
typename TTypes<float>::ConstFlat then_flat,
typename TTypes<float>::ConstFlat else_flat) {
+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
const size_t num = cond_flat.size();
const bool* c = cond_flat.data();
@@ -557,6 +558,7 @@ struct SelectFunctorBase<Device, float> {
Assign(d, out, out);
#else
Assign(d, out, cond_flat.select(then_flat, else_flat));
#endif
+#endif
}
};
@@ -834,6 +836,7 @@ struct BatchSelectFunctor<CPUDevice, float> {
const float* t = then_flat_outer_dims.data();
const float* e = else_flat_outer_dims.data();

+#if defined(__GNUC__) && (__GNUC__ >6)
#ifdef __AVX512F__
size_t quotient = batch_size / float_alignment;
int remainder = batch_size - (quotient * float_alignment);
@@ -888,6 +891,7 @@ struct BatchSelectFunctor<CPUDevice, float> {
}
}
};
#endif
+#endif
auto cost = Eigen::TensorOpCost(sizeof(float) * batch_size * 2, // ld bytes
sizeof(float) * batch_size, // st bytes
tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc (4 changes: 4 additions & 0 deletions)
@@ -307,7 +307,11 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
c, status,
errors::Internal("Failed to launch copy from device to host."), done);

+#if GOOGLE_CUDA
cudaDeviceSynchronize();
+#elif TENSORFLOW_USE_ROCM
+hipDeviceSynchronize();
+#endif

OpOutputList outputs;
this->AllocateOutputs(c, &data, &partitions, &cpu_tensor, &outputs, done);
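Note: the bare cudaDeviceSynchronize() would not compile under the ROCm toolchain, so the call is now dispatched per platform; hipDeviceSynchronize() is HIP's direct counterpart. A minimal sketch of the same dispatch factored into a helper (the helper name is hypothetical, and the usual GOOGLE_CUDA / TENSORFLOW_USE_ROCM build macros are assumed):

// Block the host until all previously launched GPU work has finished,
// on whichever platform this translation unit was compiled for.
inline void GpuDeviceSynchronizeSketch() {
#if GOOGLE_CUDA
  cudaDeviceSynchronize();
#elif TENSORFLOW_USE_ROCM
  hipDeviceSynchronize();
#endif
}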
tensorflow/core/kernels/dynamic_stitch_op.cc (4 changes: 2 additions & 2 deletions)
@@ -189,7 +189,7 @@ class DynamicStitchOpImplBase : public OpKernel {
}
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

template <typename T>
void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device,
@@ -570,7 +570,7 @@ TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
TF_CALL_QUANTIZED_TYPES(REGISTER_DYNAMIC_STITCH);
#undef REGISTER_DYNAMIC_STITCH

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_DYNAMIC_STITCH_GPU(type) \
REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
.Device(DEVICE_GPU) \
tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc (14 changes: 7 additions & 7 deletions)
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

@@ -52,7 +52,7 @@ __global__ void DynamicStitchKernelV2(const int32 slice_size,
const int32* input_indices,
T** input_ptrs,
T* output) {
-  CUDA_1D_KERNEL_LOOP(output_index, output_size) {
+  GPU_1D_KERNEL_LOOP(output_index, output_size) {
const int32 slice_id = output_index / slice_size;
const int32 slice_offset = output_index % slice_size;
const int32 input_index = input_indices[slice_id];
@@ -65,7 +65,7 @@ __global__ void InitializeIndicesFlatWork(int32* indices_flat_work,
__global__ void InitializeIndicesFlatWork(int32* indices_flat_work,
const int32 flat_work_size,
const int32 val) {
-  CUDA_1D_KERNEL_LOOP(output_index, flat_work_size) {
+  GPU_1D_KERNEL_LOOP(output_index, flat_work_size) {
indices_flat_work[output_index] = val;
}
}
@@ -80,7 +80,7 @@ __global__ void DynamicStitchPrepKernel(const int32* indices_flat,
const int32 slice_size,
const int32 output_size) {

-  CUDA_1D_KERNEL_LOOP(output_index, output_size) {
+  GPU_1D_KERNEL_LOOP(output_index, output_size) {
// for indices
indices_flat_work[indices_flat[output_index]] = output_index;
// find the partition id
@@ -123,7 +123,7 @@ void DynamicStitchGPUImplV2(const Eigen::GpuDevice& gpu_device,
Tensor* input_ptrs,
T* output) {
const int32 output_size = first_dim_size * slice_size;
-  auto config = GetCudaLaunchConfig(output_size, gpu_device);
+  auto config = GetGpuLaunchConfig(output_size, gpu_device);

DynamicStitchKernelV2<T>
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
@@ -146,13 +146,13 @@ void DynamicStitchGPUPrep(const Eigen::GpuDevice& gpu_device,
const int32 first_dim_size) {

// initialize indices_flat_work by -1
-  auto config = GetCudaLaunchConfig(first_dim_size, gpu_device);
+  auto config = GetGpuLaunchConfig(first_dim_size, gpu_device);
InitializeIndicesFlatWork
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
indices_flat_work->flat<int32>().data(),
first_dim_size, -1);

-  config = GetCudaLaunchConfig(data_elements_size, gpu_device);
+  config = GetGpuLaunchConfig(data_elements_size, gpu_device);
DynamicStitchPrepKernel<T>
<<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
indices_flat->flat<int32>().data(),
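Note: alongside the kernel-loop rename, the launch-configuration helper changes from GetCudaLaunchConfig to GetGpuLaunchConfig; both take a 1-D element count plus the device and return a block count and threads-per-block. A hand-rolled sketch of what such a config amounts to (the real helper in tensorflow/core/util/gpu_kernel_helper.h also caps the grid using device occupancy; this simplified version, with hypothetical names, only rounds up to whole blocks):

struct LaunchConfigSketch {
  int block_count;       // number of thread blocks to launch
  int thread_per_block;  // threads in each block
};

LaunchConfigSketch GetLaunchConfigSketch(int work_element_count) {
  const int kThreads = 256;  // a common default block size
  return {(work_element_count + kThreads - 1) / kThreads, kThreads};
}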
tensorflow/core/kernels/reshape_op.cc (2 changes: 1 addition & 1 deletion)
@@ -86,7 +86,7 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL

-#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) //|| \
(defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
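Note: this small edit disables more than it appears to. Backslash-newline splicing runs before comments are stripped, so the trailing \ inside the new // comment pulls the following line, the whole TENSORFLOW_USE_ROCM test, into the comment as well. The directive the preprocessor actually evaluates is therefore:

// What the compiler sees after line splicing and comment removal:
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
// ...the special GPU int32 Reshape kernel is registered here, CUDA only...
#endif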
tensorflow/core/kernels/segment_reduction_ops.cc (6 changes: 3 additions & 3 deletions)
@@ -203,7 +203,7 @@ class SegmentReductionOp : public OpKernel {
}
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
// SegmentSumGPUOp is a segment sum operator implemented for GPU only.
// TODO: This implementation of SegmentSumGPUOp is sometimes slower than
// its unsorted counterpart (mostly when problem size is small).
@@ -352,7 +352,7 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128);
#undef REGISTER_REAL_CPU_KERNELS_ALL
#undef REGISTER_COMPLEX_CPU_KERNELS_ALL

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \
REGISTER_KERNEL_BUILDER(Name("SegmentSum") \
.Device(DEVICE_GPU) \
@@ -637,7 +637,7 @@ REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
#undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
#undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT( \
name, type, index_type, initial_value_functor, reduction_kernel_functor) \
REGISTER_KERNEL_BUILDER( \
tensorflow/core/kernels/segment_reduction_ops.h (6 changes: 3 additions & 3 deletions)
@@ -34,7 +34,7 @@ class OpKernelContext;

namespace functor {

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
typedef Eigen::GpuDevice GPUDevice;
// Functor for SegmentSumGPUOp.
// output_rows: the number of output segments (unique segment ids in
@@ -88,7 +88,7 @@ struct SetValueDefault {
Tensor* target,
T default_value);
};
-#endif
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T, typename Index, typename InitialValueF,
typename ReductionF>
@@ -99,7 +99,7 @@ struct UnsortedSegmentFunctor {
typename TTypes<T, 2>::Tensor output);
};

-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA //|| TENSORFLOW_USE_ROCM
// reduction functors for the gpu
template <typename T>
struct SumOpGpu {
(The remaining 8 of the 19 changed files are not shown here.)
