diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h index 64f990ef..ccfeca2e 100644 --- a/vkFFT/vkFFT.h +++ b/vkFFT/vkFFT.h @@ -1697,28 +1697,61 @@ static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayou res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif - for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - if (!strcmp(floatType, "double")) { - double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "float")) { - float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (j < (sc->raderContainer[i].prime - 2)) { - sc->tempLen = sprintf(sc->tempStr, ", "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; + if (sc->raderContainer[i].type == 0) { + for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later + if (!strcmp(floatType, "double")) { + double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (!strcmp(floatType, "float")) { + float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (j < (sc->raderContainer[i].prime - 2)) { + sc->tempLen = sprintf(sc->tempStr, ", "); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + else { + sc->tempLen = sprintf(sc->tempStr, "};\n"); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } } - else { - sc->tempLen = sprintf(sc->tempStr, "};\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; + } + else { + long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later + uint64_t g_pow = 1; + for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { + g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; + } + if (!strcmp(floatType, "double")) { + double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", (double)cos(2.0 * g_pow * double_PI / sc->raderContainer[i].prime), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (!strcmp(floatType, "float")) { + float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", (float)cos(2.0 * g_pow * double_PI / sc->raderContainer[i].prime), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (j < (sc->raderContainer[i].prime - 2)) { + sc->tempLen = sprintf(sc->tempStr, ", "); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + else { + sc->tempLen = sprintf(sc->tempStr, "};\n"); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } } } #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) @@ -1730,29 +1763,62 @@ static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayou res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif - for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - if (!strcmp(floatType, "double")) { - double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "float")) { - float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } + if (sc->raderContainer[i].type == 0) { + for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later + if (!strcmp(floatType, "double")) { + double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (!strcmp(floatType, "float")) { + float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } - if (j < (sc->raderContainer[i].prime - 2)) { - sc->tempLen = sprintf(sc->tempStr, ", "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; + if (j < (sc->raderContainer[i].prime - 2)) { + sc->tempLen = sprintf(sc->tempStr, ", "); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + else { + sc->tempLen = sprintf(sc->tempStr, "};\n"); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } } - else { - sc->tempLen = sprintf(sc->tempStr, "};\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; + } + else { + long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later + uint64_t g_pow = 1; + for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { + g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; + } + if (!strcmp(floatType, "double")) { + double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", (double)(-sin(2.0 * g_pow * double_PI / sc->raderContainer[i].prime)), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (!strcmp(floatType, "float")) { + float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; + sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", (float)(-sin(2.0 * g_pow * double_PI / sc->raderContainer[i].prime)), LFending); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + if (j < (sc->raderContainer[i].prime - 2)) { + sc->tempLen = sprintf(sc->tempStr, ", "); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } + else { + sc->tempLen = sprintf(sc->tempStr, "};\n"); + res = VkAppendLine(sc); + if (res != VKFFT_SUCCESS) return res; + } } } } @@ -36234,7 +36300,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->axisBlock[1] = final_rader_thread_count; if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } - + if (axis->specializationConstants.useRaderFFT) { + if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; + } uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; if (scale > 1) axis->groupedBatch *= scale; axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize; @@ -36276,6 +36344,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->axisBlock[1] = final_rader_thread_count; if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } + if (axis->specializationConstants.useRaderFFT) { + if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; + } axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { @@ -36313,6 +36384,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->axisBlock[1] = final_rader_thread_count; if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } + if (axis->specializationConstants.useRaderFFT) { + if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; + } axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];