diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp
index efeafff2..3c6fa750 100644
--- a/src/nyx/features/gabor.cpp
+++ b/src/nyx/features/gabor.cpp
@@ -272,81 +272,42 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r, size_t roiidx)
 
     // Examine the baseline signal
 
-#ifdef GAMLEDAGE
-    unsigned long baselineScore = 0;    // abundance of baseline signal pixels over its average
-    double maxval = 0.0;                // max value of the baseline signal
-
-    std::vector<PixIntens>& pix_plane = responses[0];
-
-    ImageMatrix e2img;
-    e2img.allocate(Im0.width, Im0.height);
-    PixIntens* e2img_ptr = e2img.writable_data_ptr();
-
-    for (int k = 0; k < Im0.height * Im0.width; ++k)
-        e2img_ptr[k] = pix_plane[k];
-            
-    // Values that we need for scoring filter responses
-    Moments2 local_stats;
-    e2img.GetStats(local_stats);
-    maxval = local_stats.max__();
-    double cmpval = local_stats.min__();
-
-    // Score the baseline signal    
-    for (auto a : pix_plane)
-        if (double(a) > cmpval)
-            baselineScore++;
-#else
+    // we need to get these 3 values from response[0], the baseline signal
+    PixIntens maxval = 0,
+        cmpval = UINT16_MAX; // min
+    size_t baselineScore = 0;
 
-    size_t roiLen = Im0.width * Im0.height;
-    Moments2 stats;
-    for (size_t i = 0; i < roiLen; i++)
+    size_t wh = Im0.width * Im0.height;
+    for (size_t i = 0; i < wh; i++)
     {
-        PixIntens p = NyxusGpu::gabor_energy_image.hobuffer[i];
-        stats.add(p);
+        auto a = responses[0][i];
+        maxval = std::max(a, maxval);
+        cmpval = std::min(a, cmpval);
     }
-    double maxval = stats.max__(),
-        cmpval = stats.min__();
-    size_t baselineScore = 0;
-    for (size_t i = 0; i < roiLen; i++)
+
+    for (size_t i = 0; i < wh; i++)
     {
-        PixIntens p = NyxusGpu::gabor_energy_image.hobuffer[i];
-        if (p > cmpval)
+        auto a = responses[0][i];
+        if (a > cmpval)
             baselineScore++;
     }
-#endif
 
-#ifdef GAMLEDAGE        
     // Iterate high-pass filter response signals and score them 
-    for (auto j=1; j< freqs.size(); j++)
-    {
-        std::vector<PixIntens>& responsePixplane = responses [j];
-
-        // score it
-        unsigned long afterGaborScore = 0;
-        for (PixIntens a : responsePixplane)
-            if (double(a) / maxval > GRAYthr)
-                afterGaborScore++;
-
-        // save the score as feature value
-        fvals[j-1] = (double)afterGaborScore / (double)baselineScore;
-    }
-#else
-
     for (auto k = 1; k < freqs.size(); k++)
     {
-        size_t offs = k * roiLen;
+        size_t offs = k * wh;
         size_t afterGaborScore = 0;
         // score this filter response
-        for (size_t i = 0; i < roiLen; i++)
+        for (size_t i = 0; i < wh; i++)
         {
-            PixIntens p = NyxusGpu::gabor_energy_image.hobuffer [offs + i];
-            if (double(p) / maxval > GRAYthr)
+            double a = responses[k][i]; 
+            if (a / maxval > GRAYthr)
                 afterGaborScore++;
         }
         // save the score as feature value
         fvals[k - 1] = (double)afterGaborScore / (double)baselineScore;
     }
-#endif
+
 }
 #endif
 
@@ -607,10 +568,10 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
     int kerside,
     int num_filters) 
 {
-//xxxxxxx    int n_gab = n;
-
     readOnlyPixels pix_plane = Im.ReadablePixels();
 
+    // result in NyxusGpu::gabor_energy_image.hobuffer 
+    // of length [ num_filters * Im.width * Im.height ]
     bool success = CuGabor::conv_dud_gpu_fft_multi_filter (
         auxC, 
         pix_plane.data(), 
@@ -621,38 +582,22 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
         kerside,
         num_filters,
         GaborFeature::dev_filterbank);
-    if (!success) 
-        std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n";
-
-#ifdef GAMLEDAGE
-    for (int idx, i = 0; i < num_filters; ++i)
+    if (!success)
     {
-        idx = 2 * i * (Im.width + kerside - 1) * (Im.height + kerside - 1);
-        decltype(Im.height) b = 0;
-        
-        for (auto y = (int)ceil((double)kerside / 2); b < Im.height; y++)
-        {
-            decltype(Im.width) a = 0;
-
-            for (auto x = (int)ceil((double)kerside / 2); a < Im.width; x++)
-            {
-                size_t oi = (y * 2 * (Im.width + kerside - 1) + x * 2); // offset within the image, complex layout
-                if (std::isnan(auxC[idx + oi]) || std::isnan(auxC[idx + oi+1]))
-                {
-                    out[i][(b * Im.width + a)] = (PixIntens) std::numeric_limits<double>::quiet_NaN();
-                    a++;
-                    continue;
-                }
-                out[i][(b * Im.width + a)] = (PixIntens) sqrt(pow(auxC[idx + oi], 2) + pow(auxC[idx + oi+1], 2));
-                a++;
-            }
-            b++;
-        }
+        std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n";
+        return;
     }
-#else
+
     // GPU: 'NyxusGpu::gabor_energy_image.hobuffer' already has the above out[i][.] result
 
-#endif
+    // Convert it to a 'paged' layout
+    size_t wh = Im.width * Im.height;
+    for (int f = 0; f < num_filters; f++)
+    {
+        size_t skip = f * wh;
+        for (size_t i = 0; i < wh; i++)
+            out[f][i] = NyxusGpu::gabor_energy_image.hobuffer [skip + i];
+    }
 
 }
 #endif
diff --git a/src/nyx/gpu/gabor.cu b/src/nyx/gpu/gabor.cu
index 21912db2..08c78d60 100644
--- a/src/nyx/gpu/gabor.cu
+++ b/src/nyx/gpu/gabor.cu
@@ -285,50 +285,58 @@ namespace CuGabor {
         return true;
     }
 
-    // layouts are different! Complex column-major, energy row-major
+    // Calculates padded convolution results montage into non-padded energy montage.
+    // Assumes parallelism by pixels of the result (non-padded E montage).
     __global__ void kerCalcEnergy(
         // out
         PixIntens* nonpadded_e_montage,  // montage of smaller (nonpadded) images of size ROI_w*ROI_h
         // in
         cufftDoubleComplex* padded_montage, // montage of complex
-        size_t padded_frame_offset,
+        size_t skipBig,
         size_t padded_w,
         size_t padded_h,
-        size_t roi_offset,
-        size_t roi_w,
-        size_t roi_h,
+        size_t skipSmall,
+        size_t nonpadded_w,
+        size_t nonpadded_h,
         size_t kerside)
     {
         int c = threadIdx.x + blockIdx.x * blockDim.x;
         int r = threadIdx.y + blockIdx.y * blockDim.y;
-        if (c >= roi_w || r >= roi_h)
+        if (c >= nonpadded_w || r >= nonpadded_h)
             return;
 
-        size_t idx_p = r * padded_w + c + kerside / 2;
+        // for a future 3D implementation:
+        //      size_t skipBig = k * padded_w * padded_h;
+        //      size_t skipSmall = k * nonpadded_w * nonpadded_h;
 
-        double plen = (double)(padded_w * padded_h);
-        cufftDoubleComplex w = padded_montage[padded_frame_offset + idx_p];
+        size_t halfpad = kerside / 2;
+        size_t idxBig = (r + halfpad) * (nonpadded_w + kerside - 1) + c + halfpad;
 
-        double wx = w.x / plen,
-            wy = w.y / plen,
+        double bigLen = (double)(padded_w * padded_h);
+        cufftDoubleComplex w = padded_montage [skipBig + idxBig];
+
+        double wx = w.x / bigLen,
+            wy = w.y / bigLen,
             e = sqrt(wx * wx + wy * wy);
 
-        size_t idx_roi = r * roi_w + c;
-        nonpadded_e_montage[roi_offset + idx_roi] = e;
+        size_t idxSmall = r * nonpadded_w + c;
+        nonpadded_e_montage [skipSmall + idxSmall] = e;
     }
 
-    bool conv_dud_gpu_fft_multi_filter(
+    bool conv_dud_gpu_fft_multi_filter (
         double* out,
         const unsigned int* image,
         double* kernel,
-        int image_n, int image_m,
-        int kernel_n, int kernel_m,
+        int image_w, 
+        int image_h, 
+        int kernel_w, 
+        int kernel_h,
         int n_filters,
         double* dev_filterbank)
     {
         // calculate new size of image based on padding size
-        int row_size = image_m + kernel_m - 1;
-        int col_size = image_n + kernel_n - 1;
+        int row_size = image_h + kernel_h - 1;
+        int col_size = image_w + kernel_w - 1;
         int size = row_size * col_size; // image+kernel size [pixels]
 
         size_t bufsize = 2 * size * n_filters;
@@ -351,6 +359,7 @@ namespace CuGabor {
         cufftDoubleComplex* d_kernel = NyxusGpu::gabor_linear_kernel.devbuffer;
         cudaError_t ok;
 
+        // prepare the image
         for (int batch = 0; batch < n_filters; ++batch)
         {
             size_t batch_idx = batch * size;
@@ -359,7 +368,7 @@ namespace CuGabor {
             dim3 tpb(block, block);
             dim3 bpg(ceil(row_size / block) + 1, ceil(col_size / block) + 1);
 
-            re_2_complex_img << <bpg, tpb >> > (d_image, row_size, col_size, image_m, image_n, NyxusGpu::dev_imat1, batch_idx);
+            re_2_complex_img << <bpg, tpb >> > (d_image, row_size, col_size, image_h, image_w, NyxusGpu::dev_imat1, batch_idx);
 
             CHECKERR(cudaDeviceSynchronize());
             CHECKERR(cudaGetLastError());
@@ -367,7 +376,8 @@ namespace CuGabor {
 
         int n[2] = { row_size, col_size };
 
-        bool ok2 = dense_2_padded_filterbank(d_kernel, dev_filterbank, image_m, image_n, kernel_m, kernel_n, n_filters);
+        // prepare the kernel
+        bool ok2 = dense_2_padded_filterbank(d_kernel, dev_filterbank, image_h, image_w, kernel_h, kernel_w, n_filters);
         if (!ok2)
         {
             std::cerr << "ERROR: CuGabor::dense_2_padded_filterbank failed \n";
@@ -415,56 +425,74 @@ namespace CuGabor {
         CHECKCUFFTERR(call);
 
         //
-        // no need to memcpy(result<-d_result) above ^^^ !
+        // no need to memcpy(result<-d_result) above
         //
 
+        // convert the montage from complex padded layout ('d_result') to real non-padded energy ('NyxusGpu::gabor_energy_image.devbuffer')
         for (int k = 0; k < n_filters; k++)
         {
             size_t padded_off = k * size,
-                roi_off = k * image_m * image_n;
+                roi_off = k * image_h * image_w;
 
             int block = 16;
             dim3 tpb(block, block);
-            dim3 bpg(ceil(image_m / block) + 1, ceil(image_n / block) + 1);
-
-            /*
-                    // out
-                    PixIntens* nonpadded_e_montage,  // montage of smaller (nonpadded) images of size ROI_w*ROI_h
-                    // in
-                    cufftDoubleComplex* padded_montage, // montage of complex
-                    size_t padded_frame_offset,
-                    size_t padded_w,
-                    size_t padded_h,
-                    size_t roi_offset,
-                    size_t roi_w,
-                    size_t roi_h,
-                    size_t kerside
-             */
-            kerCalcEnergy << < bpg, tpb >> > (
+            dim3 bpg(ceil(image_w / block) + 1, ceil(image_h / block) + 1);
+
+            kerCalcEnergy <<< bpg, tpb >>> (
                 // out
                 NyxusGpu::gabor_energy_image.devbuffer,
                 // in
                 d_result,
                 padded_off,
-                row_size,   // padded w
-                col_size,   // padded h
+                col_size,   // padded w
+                row_size,   // padded h
                 roi_off,
-                image_m,    // roi w
-                image_n,    // roi h
-                kernel_n);
+                image_w,    // roi w
+                image_h,    // roi h
+                kernel_w);
 
             CHECKERR(cudaDeviceSynchronize());
             CHECKERR(cudaGetLastError());
         }
 
-        // free device memory
-        cufftDestroy(plan);
-        cufftDestroy(plan_k);
-
         // download energy on host
-        size_t szb = n_filters * image_m * image_n * sizeof(NyxusGpu::gabor_energy_image.hobuffer[0]);
+        size_t szb = n_filters * image_h * image_w * sizeof(NyxusGpu::gabor_energy_image.hobuffer[0]);
         CHECKERR(cudaMemcpy(NyxusGpu::gabor_energy_image.hobuffer, NyxusGpu::gabor_energy_image.devbuffer, szb, cudaMemcpyDeviceToHost));
 
+        /* The above parallel code implements the following single - thread logic :
+        * (Converting big complex -> small energy)
+        * 
+        * OK(NyxusGpu::gabor_result.download());
+        *
+        * for (int k = 0; k < n_filters; k++)
+        * {
+        *    size_t skipBig = k * row_size * col_size;
+        *    size_t skipSmall = k * image_m * image_n;
+        *
+        *   for (int r = 0; r < image_m; r++)
+        *       for (int c = 0; c < image_n; c++)
+        *        {
+        *            size_t halfpad = kernel_n / 2;
+        *            size_t idxBig = (r + halfpad) * (image_n + kernel_n - 1) + c + halfpad;
+        *
+        *            double bigLen = (double)(row_size * col_size);
+        *            cufftDoubleComplex w = NyxusGpu::gabor_result.hobuffer[skipBig + idxBig];
+        *
+        *            double wx = w.x / bigLen,
+        *                wy = w.y / bigLen,
+        *                e = sqrt (wx * wx + wy * wy);
+        *
+        *            size_t idxSmall = r * image_n + c;
+        *            NyxusGpu::gabor_energy_image.hobuffer [skipSmall + idxSmall] = e;
+        *        }
+        * }
+        *
+        */
+
+        // free device memory
+        cufftDestroy(plan);
+        cufftDestroy(plan_k);
+        
         return true;
     }
 
diff --git a/src/nyx/gpucache.cpp b/src/nyx/gpucache.cpp
index 5f93fc6a..9c670fa5 100644
--- a/src/nyx/gpucache.cpp
+++ b/src/nyx/gpucache.cpp
@@ -254,6 +254,16 @@ bool GpuCache<cufftDoubleComplex>::alloc(size_t roi_buf_len, size_t num_rois__)
 	return true;
 }
 
+template<>
+bool GpuCache<cufftDoubleComplex>::download()
+{
+	size_t szb = total_len * sizeof(devbuffer[0]);
+	if (!NyxusGpu::download_on_host((void*)hobuffer, (void*)devbuffer, szb))
+		return false;
+
+	return true;
+}
+
 template<>
 bool GpuCache<PixIntens>::clear()
 {