diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp index efeafff2..3c6fa750 100644 --- a/src/nyx/features/gabor.cpp +++ b/src/nyx/features/gabor.cpp @@ -272,81 +272,42 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r, size_t roiidx) // Examine the baseline signal -#ifdef GAMLEDAGE - unsigned long baselineScore = 0; // abundance of baseline signal pixels over its average - double maxval = 0.0; // max value of the baseline signal - - std::vector& pix_plane = responses[0]; - - ImageMatrix e2img; - e2img.allocate(Im0.width, Im0.height); - PixIntens* e2img_ptr = e2img.writable_data_ptr(); - - for (int k = 0; k < Im0.height * Im0.width; ++k) - e2img_ptr[k] = pix_plane[k]; - - // Values that we need for scoring filter responses - Moments2 local_stats; - e2img.GetStats(local_stats); - maxval = local_stats.max__(); - double cmpval = local_stats.min__(); - - // Score the baseline signal - for (auto a : pix_plane) - if (double(a) > cmpval) - baselineScore++; -#else + // we need to get these 3 values from response[0], the baseline signal + PixIntens maxval = 0, + cmpval = UINT16_MAX; // min + size_t baselineScore = 0; - size_t roiLen = Im0.width * Im0.height; - Moments2 stats; - for (size_t i = 0; i < roiLen; i++) + size_t wh = Im0.width * Im0.height; + for (size_t i = 0; i < wh; i++) { - PixIntens p = NyxusGpu::gabor_energy_image.hobuffer[i]; - stats.add(p); + auto a = responses[0][i]; + maxval = std::max(a, maxval); + cmpval = std::min(a, cmpval); } - double maxval = stats.max__(), - cmpval = stats.min__(); - size_t baselineScore = 0; - for (size_t i = 0; i < roiLen; i++) + + for (size_t i = 0; i < wh; i++) { - PixIntens p = NyxusGpu::gabor_energy_image.hobuffer[i]; - if (p > cmpval) + auto a = responses[0][i]; + if (a > cmpval) baselineScore++; } -#endif -#ifdef GAMLEDAGE // Iterate high-pass filter response signals and score them - for (auto j=1; j< freqs.size(); j++) - { - std::vector& responsePixplane = responses [j]; - - // score it - unsigned long afterGaborScore = 0; - for (PixIntens a : responsePixplane) - if (double(a) / maxval > GRAYthr) - afterGaborScore++; - - // save the score as feature value - fvals[j-1] = (double)afterGaborScore / (double)baselineScore; - } -#else - for (auto k = 1; k < freqs.size(); k++) { - size_t offs = k * roiLen; + size_t offs = k * wh; size_t afterGaborScore = 0; // score this filter response - for (size_t i = 0; i < roiLen; i++) + for (size_t i = 0; i < wh; i++) { - PixIntens p = NyxusGpu::gabor_energy_image.hobuffer [offs + i]; - if (double(p) / maxval > GRAYthr) + double a = responses[k][i]; + if (a / maxval > GRAYthr) afterGaborScore++; } // save the score as feature value fvals[k - 1] = (double)afterGaborScore / (double)baselineScore; } -#endif + } #endif @@ -607,10 +568,10 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( int kerside, int num_filters) { -//xxxxxxx int n_gab = n; - readOnlyPixels pix_plane = Im.ReadablePixels(); + // result in NyxusGpu::gabor_energy_image.hobuffer + // of length [ num_filters * Im.width * Im.height ] bool success = CuGabor::conv_dud_gpu_fft_multi_filter ( auxC, pix_plane.data(), @@ -621,38 +582,22 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( kerside, num_filters, GaborFeature::dev_filterbank); - if (!success) - std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n"; - -#ifdef GAMLEDAGE - for (int idx, i = 0; i < num_filters; ++i) + if (!success) { - idx = 2 * i * (Im.width + kerside - 1) * (Im.height + kerside - 1); - decltype(Im.height) b = 0; - - for (auto y = (int)ceil((double)kerside / 2); b < Im.height; y++) - { - decltype(Im.width) a = 0; - - for (auto x = (int)ceil((double)kerside / 2); a < Im.width; x++) - { - size_t oi = (y * 2 * (Im.width + kerside - 1) + x * 2); // offset within the image, complex layout - if (std::isnan(auxC[idx + oi]) || std::isnan(auxC[idx + oi+1])) - { - out[i][(b * Im.width + a)] = (PixIntens) std::numeric_limits::quiet_NaN(); - a++; - continue; - } - out[i][(b * Im.width + a)] = (PixIntens) sqrt(pow(auxC[idx + oi], 2) + pow(auxC[idx + oi+1], 2)); - a++; - } - b++; - } + std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n"; + return; } -#else + // GPU: 'NyxusGpu::gabor_energy_image.hobuffer' already has the above out[i][.] result -#endif + // Convert it to a 'paged' layout + size_t wh = Im.width * Im.height; + for (int f = 0; f < num_filters; f++) + { + size_t skip = f * wh; + for (size_t i = 0; i < wh; i++) + out[f][i] = NyxusGpu::gabor_energy_image.hobuffer [skip + i]; + } } #endif diff --git a/src/nyx/gpu/gabor.cu b/src/nyx/gpu/gabor.cu index 21912db2..08c78d60 100644 --- a/src/nyx/gpu/gabor.cu +++ b/src/nyx/gpu/gabor.cu @@ -285,50 +285,58 @@ namespace CuGabor { return true; } - // layouts are different! Complex column-major, energy row-major + // Calculates padded convolution results montage into non-padded energy montage. + // Assumes parallelism by pixels of the result (non-padded E montage). __global__ void kerCalcEnergy( // out PixIntens* nonpadded_e_montage, // montage of smaller (nonpadded) images of size ROI_w*ROI_h // in cufftDoubleComplex* padded_montage, // montage of complex - size_t padded_frame_offset, + size_t skipBig, size_t padded_w, size_t padded_h, - size_t roi_offset, - size_t roi_w, - size_t roi_h, + size_t skipSmall, + size_t nonpadded_w, + size_t nonpadded_h, size_t kerside) { int c = threadIdx.x + blockIdx.x * blockDim.x; int r = threadIdx.y + blockIdx.y * blockDim.y; - if (c >= roi_w || r >= roi_h) + if (c >= nonpadded_w || r >= nonpadded_h) return; - size_t idx_p = r * padded_w + c + kerside / 2; + // for a future 3D implementation: + // size_t skipBig = k * padded_w * padded_h; + // size_t skipSmall = k * nonpadded_w * nonpadded_h; - double plen = (double)(padded_w * padded_h); - cufftDoubleComplex w = padded_montage[padded_frame_offset + idx_p]; + size_t halfpad = kerside / 2; + size_t idxBig = (r + halfpad) * (nonpadded_w + kerside - 1) + c + halfpad; - double wx = w.x / plen, - wy = w.y / plen, + double bigLen = (double)(padded_w * padded_h); + cufftDoubleComplex w = padded_montage [skipBig + idxBig]; + + double wx = w.x / bigLen, + wy = w.y / bigLen, e = sqrt(wx * wx + wy * wy); - size_t idx_roi = r * roi_w + c; - nonpadded_e_montage[roi_offset + idx_roi] = e; + size_t idxSmall = r * nonpadded_w + c; + nonpadded_e_montage [skipSmall + idxSmall] = e; } - bool conv_dud_gpu_fft_multi_filter( + bool conv_dud_gpu_fft_multi_filter ( double* out, const unsigned int* image, double* kernel, - int image_n, int image_m, - int kernel_n, int kernel_m, + int image_w, + int image_h, + int kernel_w, + int kernel_h, int n_filters, double* dev_filterbank) { // calculate new size of image based on padding size - int row_size = image_m + kernel_m - 1; - int col_size = image_n + kernel_n - 1; + int row_size = image_h + kernel_h - 1; + int col_size = image_w + kernel_w - 1; int size = row_size * col_size; // image+kernel size [pixels] size_t bufsize = 2 * size * n_filters; @@ -351,6 +359,7 @@ namespace CuGabor { cufftDoubleComplex* d_kernel = NyxusGpu::gabor_linear_kernel.devbuffer; cudaError_t ok; + // prepare the image for (int batch = 0; batch < n_filters; ++batch) { size_t batch_idx = batch * size; @@ -359,7 +368,7 @@ namespace CuGabor { dim3 tpb(block, block); dim3 bpg(ceil(row_size / block) + 1, ceil(col_size / block) + 1); - re_2_complex_img << > > (d_image, row_size, col_size, image_m, image_n, NyxusGpu::dev_imat1, batch_idx); + re_2_complex_img << > > (d_image, row_size, col_size, image_h, image_w, NyxusGpu::dev_imat1, batch_idx); CHECKERR(cudaDeviceSynchronize()); CHECKERR(cudaGetLastError()); @@ -367,7 +376,8 @@ namespace CuGabor { int n[2] = { row_size, col_size }; - bool ok2 = dense_2_padded_filterbank(d_kernel, dev_filterbank, image_m, image_n, kernel_m, kernel_n, n_filters); + // prepare the kernel + bool ok2 = dense_2_padded_filterbank(d_kernel, dev_filterbank, image_h, image_w, kernel_h, kernel_w, n_filters); if (!ok2) { std::cerr << "ERROR: CuGabor::dense_2_padded_filterbank failed \n"; @@ -415,56 +425,74 @@ namespace CuGabor { CHECKCUFFTERR(call); // - // no need to memcpy(result<-d_result) above ^^^ ! + // no need to memcpy(result<-d_result) above // + // convert the montage from complex padded layout ('d_result') to real non-padded energy ('NyxusGpu::gabor_energy_image.devbuffer') for (int k = 0; k < n_filters; k++) { size_t padded_off = k * size, - roi_off = k * image_m * image_n; + roi_off = k * image_h * image_w; int block = 16; dim3 tpb(block, block); - dim3 bpg(ceil(image_m / block) + 1, ceil(image_n / block) + 1); - - /* - // out - PixIntens* nonpadded_e_montage, // montage of smaller (nonpadded) images of size ROI_w*ROI_h - // in - cufftDoubleComplex* padded_montage, // montage of complex - size_t padded_frame_offset, - size_t padded_w, - size_t padded_h, - size_t roi_offset, - size_t roi_w, - size_t roi_h, - size_t kerside - */ - kerCalcEnergy << < bpg, tpb >> > ( + dim3 bpg(ceil(image_w / block) + 1, ceil(image_h / block) + 1); + + kerCalcEnergy <<< bpg, tpb >>> ( // out NyxusGpu::gabor_energy_image.devbuffer, // in d_result, padded_off, - row_size, // padded w - col_size, // padded h + col_size, // padded w + row_size, // padded h roi_off, - image_m, // roi w - image_n, // roi h - kernel_n); + image_w, // roi w + image_h, // roi h + kernel_w); CHECKERR(cudaDeviceSynchronize()); CHECKERR(cudaGetLastError()); } - // free device memory - cufftDestroy(plan); - cufftDestroy(plan_k); - // download energy on host - size_t szb = n_filters * image_m * image_n * sizeof(NyxusGpu::gabor_energy_image.hobuffer[0]); + size_t szb = n_filters * image_h * image_w * sizeof(NyxusGpu::gabor_energy_image.hobuffer[0]); CHECKERR(cudaMemcpy(NyxusGpu::gabor_energy_image.hobuffer, NyxusGpu::gabor_energy_image.devbuffer, szb, cudaMemcpyDeviceToHost)); + /* The above parallel code implements the following single - thread logic : + * (Converting big complex -> small energy) + * + * OK(NyxusGpu::gabor_result.download()); + * + * for (int k = 0; k < n_filters; k++) + * { + * size_t skipBig = k * row_size * col_size; + * size_t skipSmall = k * image_m * image_n; + * + * for (int r = 0; r < image_m; r++) + * for (int c = 0; c < image_n; c++) + * { + * size_t halfpad = kernel_n / 2; + * size_t idxBig = (r + halfpad) * (image_n + kernel_n - 1) + c + halfpad; + * + * double bigLen = (double)(row_size * col_size); + * cufftDoubleComplex w = NyxusGpu::gabor_result.hobuffer[skipBig + idxBig]; + * + * double wx = w.x / bigLen, + * wy = w.y / bigLen, + * e = sqrt (wx * wx + wy * wy); + * + * size_t idxSmall = r * image_n + c; + * NyxusGpu::gabor_energy_image.hobuffer [skipSmall + idxSmall] = e; + * } + * } + * + */ + + // free device memory + cufftDestroy(plan); + cufftDestroy(plan_k); + return true; } diff --git a/src/nyx/gpucache.cpp b/src/nyx/gpucache.cpp index 5f93fc6a..9c670fa5 100644 --- a/src/nyx/gpucache.cpp +++ b/src/nyx/gpucache.cpp @@ -254,6 +254,16 @@ bool GpuCache::alloc(size_t roi_buf_len, size_t num_rois__) return true; } +template<> +bool GpuCache::download() +{ + size_t szb = total_len * sizeof(devbuffer[0]); + if (!NyxusGpu::download_on_host((void*)hobuffer, (void*)devbuffer, szb)) + return false; + + return true; +} + template<> bool GpuCache::clear() {