Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA accelerated PSNR #1175

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions libvmaf/src/cuda/cuda_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ namespace {
return atomicAdd(reinterpret_cast<uint64_cu *>(address),
static_cast<uint64_cu>(val));
}

typedef unsigned long long int uint64_cu;
// Unsigned 64-bit atomic add. CUDA only exposes atomicAdd for
// 'unsigned long long int', so the uint64_t address is reinterpreted to
// that type (identical size and representation on all supported targets).
// Returns the previous value at *address, matching atomicAdd semantics.
// NOTE: returns uint64_t — the original int64_t return made results above
// INT64_MAX implementation-defined on conversion.
__forceinline__ __device__ uint64_t atomicAdd_uint64(uint64_t *address,
                                                     uint64_t val) {
    return atomicAdd(reinterpret_cast<uint64_cu *>(address),
                     static_cast<uint64_cu>(val));
}
} // namespace
#endif

Expand Down
17 changes: 2 additions & 15 deletions libvmaf/src/feature/cuda/integer_adm_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -855,17 +855,9 @@ static void integer_compute_adm_cuda(VmafFeatureExtractor *fex, AdmStateCuda *s,
h = (h + 1) / 2;

// This event ensures the input buffer is consumed
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));

CHECK_CUDA(cuStreamWaitEvent(s->str, s->dis_event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->dis_event));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));

CHECK_CUDA(cuStreamWaitEvent(s->str, s->ref_event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->ref_event));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));

CHECK_CUDA(cuCtxPopCurrent(NULL));

// consumes buf->ref_dwt2 , buf->dis_dwt2
// produces buf->decouple_r , buf->decouple_a
adm_decouple_device(s, buf, w, h, buf_stride, &p, s->str);
Expand Down Expand Up @@ -1154,12 +1146,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
(void) dist_pic_90;

// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores'
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
CHECK_CUDA(cuEventSynchronize(s->finished));

// current implementation is limited by the 16-bit data pipeline, thus
// cannot handle an angular frequency smaller than 1080p * 3H
Expand Down
7 changes: 1 addition & 6 deletions libvmaf/src/feature/cuda/integer_motion_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
MotionStateCuda *s = fex->priv;

// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores'
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
CHECK_CUDA(cuEventSynchronize(s->finished));

int err = 0;
(void) dist_pic;
Expand Down
138 changes: 138 additions & 0 deletions libvmaf/src/feature/cuda/integer_psnr/psnr.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/**
*
* Copyright 2016-2023 Netflix, Inc.
* Copyright 2022 NVIDIA Corporation.
*
* Licensed under the BSD+Patent License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSDplusPatent
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include "cuda/integer_motion_cuda.h"
#include "cuda_helper.cuh"

#include "common.h"

/**
 * Accumulates the sum of squared errors between ref and dis into *sse.
 *
 * Each thread processes sizeof(LOAD_TYPE)/sizeof(T) samples via one wide,
 * coalesced LOAD_TYPE load, warp-reduces its partial sum, and lane 0 of
 * each warp commits the warp total with a single 64-bit atomic.
 *
 * Preconditions: ref/dis rows are 'stride' bytes apart and aligned for
 * LOAD_TYPE; the launch must cover w/val_per_thread x h threads.
 */
template <typename T, typename LOAD_TYPE>
__device__ void sse_calculation(T *ref, T *dis, unsigned int w, unsigned int h,
                                unsigned int stride, uint64_t *sse) {
    // Number of T samples carried by one vector load.
    constexpr int val_per_thread = sizeof(LOAD_TYPE) / sizeof(T);
    unsigned int idx_x = (threadIdx.x + blockDim.x * blockIdx.x) * val_per_thread;
    unsigned int idx_y = threadIdx.y + blockDim.y * blockIdx.y;

    uint64_t thread_sse = 0u;
    if (idx_y < h && idx_x < w) {
        int idx = idx_y * (stride / sizeof(T)) + idx_x;
        // Overlay each vector load with a T[] view so a single wide load
        // can still be addressed sample by sample. (The original crossed
        // the member names — value_ref aliased the *dis* load — which was
        // only harmless because the difference is squared below.)
        union {
            T value_ref[val_per_thread];
            LOAD_TYPE load_value_ref;
        };
        union {
            T value_dis[val_per_thread];
            LOAD_TYPE load_value_dis;
        };
        load_value_ref = *reinterpret_cast<LOAD_TYPE *>(&ref[idx]);
        load_value_dis = *reinterpret_cast<LOAD_TYPE *>(&dis[idx]);
        for (unsigned int i = 0; i < val_per_thread; ++i) {
            // Tail guard: the last vector of a row may extend past w.
            if ((idx_x + i) < w) {
                const int e = value_ref[i] - value_dis[i];
                // Widen before squaring: for 16-bit samples |e| can reach
                // 65535, so a 32-bit e * e would overflow (UB).
                thread_sse += static_cast<uint64_t>(static_cast<int64_t>(e) * e);
            }
        }
    }

    // Warp-reduce thread_sse. Every lane must reach these intrinsics to
    // honor the 0xffffffff participation mask, so the reduction sits
    // OUTSIDE the bounds check; out-of-range lanes contribute 0.
    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 16);
    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 8);
    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 4);
    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 2);
    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 1);
    // Lane 0 of each warp publishes the warp total (a fully out-of-range
    // warp adds 0, which is harmless).
    const int lane =
        (threadIdx.y * blockDim.x + threadIdx.x) % VMAF_CUDA_THREADS_PER_WARP;
    if (lane == 0)
        atomicAdd_uint64(sse, thread_sse);
}

// Launches the 8-bit SSE accumulation for one color plane, selected by the
// compile-time channel index. Writes into the chn-th uint64_t slot of sse.
template <int chn>
__device__ void psnr8_impl(const VmafPicture ref_pic,
                           const VmafPicture dist_pic,
                           const VmafCudaBuffer sse) {
    uint8_t *ref_data = reinterpret_cast<uint8_t *>(ref_pic.data[chn]);
    uint8_t *dis_data = reinterpret_cast<uint8_t *>(dist_pic.data[chn]);
    uint64_t *sse_chn = reinterpret_cast<uint64_t *>(sse.data) + chn;
    const unsigned int plane_w = ref_pic.w[chn];
    const unsigned int plane_h = ref_pic.h[chn];
    const unsigned int plane_stride = ref_pic.stride[chn];

    // Pick the load type depending on whether this plane's stride is at
    // most half of the first plane's (i.e. a subsampled chroma plane).
    if (plane_stride <= (ref_pic.stride[0] / 2)) {
        sse_calculation<uint8_t, uint64_t>(ref_data, dis_data, plane_w,
                                           plane_h, plane_stride, sse_chn);
    } else {
        sse_calculation<uint8_t, ushort4>(ref_data, dis_data, plane_w,
                                          plane_h, plane_stride, sse_chn);
    }
}

// Launches the 16-bit (high bit depth) SSE accumulation for one color
// plane, selected by the compile-time channel index. Writes into the
// chn-th uint64_t slot of sse.
template <int chn>
__device__ void psnr16_impl(const VmafPicture ref_pic,
                            const VmafPicture dist_pic,
                            const VmafCudaBuffer sse) {
    uint16_t *ref_data = reinterpret_cast<uint16_t *>(ref_pic.data[chn]);
    uint16_t *dis_data = reinterpret_cast<uint16_t *>(dist_pic.data[chn]);
    uint64_t *sse_chn = reinterpret_cast<uint64_t *>(sse.data) + chn;
    const unsigned int plane_w = ref_pic.w[chn];
    const unsigned int plane_h = ref_pic.h[chn];
    const unsigned int plane_stride = ref_pic.stride[chn];

    // Subsampled chroma planes (stride at most half of plane 0) use the
    // narrower 8-byte load; full-size planes use the 16-byte load.
    if (plane_stride <= (ref_pic.stride[0] / 2)) {
        sse_calculation<uint16_t, ushort4>(ref_data, dis_data, plane_w,
                                           plane_h, plane_stride, sse_chn);
    } else {
        sse_calculation<uint16_t, uint4>(ref_data, dis_data, plane_w,
                                         plane_h, plane_stride, sse_chn);
    }
}

extern "C" {

// 8-bit PSNR kernel: blockIdx.z selects the color plane (0..2).
// The plane index is forwarded as a compile-time template argument —
// indexing the picture arrays with a runtime blockIdx.z would force the
// compiler to emit local-memory loads/stores.
__global__ void psnr(const VmafPicture ref_pic, const VmafPicture dist_pic,
                     const VmafCudaBuffer sse) {
    if (blockIdx.z == 0)
        psnr8_impl<0>(ref_pic, dist_pic, sse);
    else if (blockIdx.z == 1)
        psnr8_impl<1>(ref_pic, dist_pic, sse);
    else if (blockIdx.z == 2)
        psnr8_impl<2>(ref_pic, dist_pic, sse);
}

// High-bit-depth (16-bit) PSNR kernel: blockIdx.z selects the color plane
// (0..2). As in psnr(), the plane index is dispatched through a
// compile-time template argument so plane accesses stay out of local
// memory despite the "dynamic" index.
__global__ void psnr_hbd(const VmafPicture ref_pic, const VmafPicture dist_pic,
                         const VmafCudaBuffer sse) {
    if (blockIdx.z == 0)
        psnr16_impl<0>(ref_pic, dist_pic, sse);
    else if (blockIdx.z == 1)
        psnr16_impl<1>(ref_pic, dist_pic, sse);
    else if (blockIdx.z == 2)
        psnr16_impl<2>(ref_pic, dist_pic, sse);
}
}
Loading
Loading