From fe114a3752f858063fa9b0a733ecba37501572c7 Mon Sep 17 00:00:00 2001 From: John Doe Date: Sat, 16 May 2020 23:24:55 +0800 Subject: [PATCH] cumulative bug fixes --- src/DCTFFTW.hpp | 5 +- src/FakeBlockData.hpp | 2 +- src/MVBlockFPS.hxx | 28 ++-- src/MVFlow.hxx | 31 ++-- src/MVFlowBlur.hxx | 20 +-- src/MVFlowFPS.hxx | 120 +++++--------- src/MVFlowInter.hxx | 122 +++++--------- src/MVInterface.h | 2 +- src/MVMask.hxx | 10 +- src/MaskFun.hpp | 360 +++++++++++++++++++++++------------------- src/PlaneOfBlocks.h | 115 ++++++-------- src/SimpleResize.hpp | 44 +++++- 12 files changed, 392 insertions(+), 467 deletions(-) diff --git a/src/DCTFFTW.hpp b/src/DCTFFTW.hpp index 3496b96..e0b4a92 100644 --- a/src/DCTFFTW.hpp +++ b/src/DCTFFTW.hpp @@ -6,9 +6,8 @@ #include "DCT.hpp" #include "Include/Interface.hxx" -static auto &&g_fftw_plans_mutex = std::mutex{}; - class DCTFFTW final :public DCTClass { + static inline auto g_fftw_plans_mutex = std::mutex{}; self(fSrc, static_cast(nullptr)); self(dctplan, static_cast(nullptr)); self(fSrcDCT, static_cast(nullptr)); @@ -38,7 +37,9 @@ class DCTFFTW final :public DCTClass { auto &operator=(DCTFFTW &&) = delete; auto &operator=(const DCTFFTW &) = delete; ~DCTFFTW() override { + g_fftw_plans_mutex.lock(); fftw_destroy_plan(dctplan); + g_fftw_plans_mutex.unlock(); fftw_free(fSrc); fftw_free(fSrcDCT); } diff --git a/src/FakeBlockData.hpp b/src/FakeBlockData.hpp index 21fca05..e026acf 100644 --- a/src/FakeBlockData.hpp +++ b/src/FakeBlockData.hpp @@ -30,6 +30,6 @@ class FakeBlockData final { return Vector; } auto GetSAD() const { - return reinterpret_cast(Vector.sad); + return Vector.sad; } }; \ No newline at end of file diff --git a/src/MVBlockFPS.hxx b/src/MVBlockFPS.hxx index 7d5c2fa..1b6b37a 100644 --- a/src/MVBlockFPS.hxx +++ b/src/MVBlockFPS.hxx @@ -338,28 +338,20 @@ static const VSFrameRef *VS_CC mvblockfpsGetFrame(int32_t n, int32_t activationR MakeSADMaskTime(ballsF, nBlkX, nBlkY, 4.0 / (ml * nBlkSizeX * nBlkSizeY), 1.0, nPel, smallMaskF, nBlkXP, time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY); MakeSADMaskTime(ballsB, nBlkX, nBlkY, 4.0 / (ml * nBlkSizeX * nBlkSizeY), 1.0, nPel, smallMaskB, nBlkXP, 256 - time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY); } - if (nBlkXP > nBlkX) - for (int j = 0; j < nBlkY; ++j) { - smallMaskF[j * nBlkXP + nBlkX] = smallMaskF[j * nBlkXP + nBlkX - 1]; - smallMaskB[j * nBlkXP + nBlkX] = smallMaskB[j * nBlkXP + nBlkX - 1]; - } - if (nBlkYP > nBlkY) - for (int i = 0; i < nBlkXP; ++i) { - smallMaskF[nBlkXP * nBlkY + i] = smallMaskF[nBlkXP * (nBlkY - 1) + i]; - smallMaskB[nBlkXP * nBlkY + i] = smallMaskB[nBlkXP * (nBlkY - 1) + i]; - } - upsizer->Resize(MaskFullYF, nPitchY, smallMaskF, nBlkXP); - upsizer->Resize(MaskFullYB, nPitchY, smallMaskB, nBlkXP); + CheckAndPadMaskSmall(smallMaskF, nBlkXP, nBlkYP, nBlkX, nBlkY); + CheckAndPadMaskSmall(smallMaskB, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer->Resize(MaskFullYF, nPitchY, smallMaskF, nBlkXP, false); + upsizer->Resize(MaskFullYB, nPitchY, smallMaskB, nBlkXP, false); if (nSuperModeYUV & UVPLANES) { - upsizerUV->Resize(MaskFullUVF, nPitchUV, smallMaskF, nBlkXP); - upsizerUV->Resize(MaskFullUVB, nPitchUV, smallMaskB, nBlkXP); + upsizerUV->Resize(MaskFullUVF, nPitchUV, smallMaskF, nBlkXP, false); + upsizerUV->Resize(MaskFullUVB, nPitchUV, smallMaskB, nBlkXP, false); } } if (mode == 4 || mode == 5 || mode == 7 || mode == 8) { MultMasks(smallMaskF, smallMaskB, smallMaskO, nBlkXP, nBlkYP); - upsizer->Resize(MaskOccY, nPitchY, smallMaskO, nBlkXP); + upsizer->Resize(MaskOccY, nPitchY, smallMaskO, nBlkXP, false); if (nSuperModeYUV & UVPLANES) - upsizerUV->Resize(MaskOccUV, nPitchUV, smallMaskO, nBlkXP); + upsizerUV->Resize(MaskOccUV, nPitchUV, smallMaskO, nBlkXP, false); } auto pMaskFullYB = MaskFullYB; auto pMaskFullYF = MaskFullYF; @@ -938,9 +930,9 @@ static void VS_CC mvblockfpsCreate(const VSMap *in, VSMap *out, void *userData, d.nWidthUV = d.bleh->nWidth / d.bleh->xRatioUV; d.nPitchY = (d.nWidthP + 15) & (~15); d.nPitchUV = (d.nWidthPUV + 15) & (~15); - d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP); + d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0); if (d.nSuperModeYUV & UVPLANES) - d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP); + d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0); if (d.bleh->nOverlapX || d.bleh->nOverlapY) { d.OverWins = new OverlapWindows(d.bleh->nBlkSizeX, d.bleh->nBlkSizeY, d.bleh->nOverlapX, d.bleh->nOverlapY); if (d.nSuperModeYUV & UVPLANES) diff --git a/src/MVFlow.hxx b/src/MVFlow.hxx index d4c1b37..dde300e 100644 --- a/src/MVFlow.hxx +++ b/src/MVFlow.hxx @@ -176,16 +176,7 @@ else if (activationReason == arAllFramesReady) { auto VXSmallY = new int32_t[nBlkYP * nBlkXP]; auto VYSmallY = new int32_t[nBlkYP * nBlkXP]; MakeVectorSmallMasks(balls, nBlkX, nBlkY, VXSmallY, nBlkXP, VYSmallY, nBlkXP); - if (nBlkXP > nBlkX) - for (auto j = 0; j < nBlkY; ++j) { - VXSmallY[j * nBlkXP + nBlkX] = std::min(VXSmallY[j * nBlkXP + nBlkX - 1], 0); - VYSmallY[j * nBlkXP + nBlkX] = VYSmallY[j * nBlkXP + nBlkX - 1]; - } - if (nBlkYP > nBlkY) - for (auto i = 0; i < nBlkXP; ++i) { - VXSmallY[nBlkXP * nBlkY + i] = VXSmallY[nBlkXP * (nBlkY - 1) + i]; - VYSmallY[nBlkXP * nBlkY + i] = std::min(VYSmallY[nBlkXP * (nBlkY - 1) + i], 0); - } + CheckAndPadSmallY(VXSmallY, VYSmallY, nBlkXP, nBlkYP, nBlkX, nBlkY); auto fieldShift = 0; if (d->fields && nPel > 1 && ((nref - n) % 2 != 0)) { auto src = vsapi->getFrameFilter(n, d->finest, frameCtx); @@ -216,8 +207,8 @@ else if (activationReason == arAllFramesReady) { for (auto j = 0; j < nBlkYP; ++j) for (auto i = 0; i < nBlkXP; ++i) VYSmallY[j * nBlkXP + i] += fieldShift; - d->upsizer->Resize(VXFullY, VPitchY, VXSmallY, nBlkXP); - d->upsizer->Resize(VYFullY, VPitchY, VYSmallY, nBlkXP); + d->upsizer->Resize(VXFullY, VPitchY, VXSmallY, nBlkXP, true); + d->upsizer->Resize(VYFullY, VPitchY, VYSmallY, nBlkXP, false); auto nOffsetY = nRefPitches[0] * nVPadding * nPel + nHPadding * bytesPerSample * nPel; auto nOffsetUV = nRefPitches[1] * nVPaddingUV * nPel + nHPaddingUV * bytesPerSample * nPel; if (static_cast(d->mode) == FlowModes::Shift) @@ -232,8 +223,8 @@ else if (activationReason == arAllFramesReady) { auto VYSmallUV = new int32_t[nBlkYP * nBlkXP]; VectorSmallMaskYToHalfUV(VXSmallY, nBlkXP, nBlkYP, VXSmallUV, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallY, nBlkXP, nBlkYP, VYSmallUV, yRatioUV); - d->upsizerUV->Resize(VXFullUV, VPitchUV, VXSmallUV, nBlkXP); - d->upsizerUV->Resize(VYFullUV, VPitchUV, VYSmallUV, nBlkXP); + d->upsizerUV->Resize(VXFullUV, VPitchUV, VXSmallUV, nBlkXP, true); + d->upsizerUV->Resize(VYFullUV, VPitchUV, VYSmallUV, nBlkXP, false); if (static_cast(d->mode) == FlowModes::Shift) { if (d->vi->format->colorFamily == cmRGB) { flowMemset(pDst[1], 1.f, nHeightUV * nDstPitches[1]); @@ -408,8 +399,12 @@ auto CreateFlow(auto in, auto out, auto core, auto vsapi) { delete d.bleh; return d; } - d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX; - d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY; + d.nBlkXP = d.bleh->nBlkX; + while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) + d.nBlkXP++; + d.nBlkYP = d.bleh->nBlkY; + while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) + d.nBlkYP++; d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX; d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY; d.nWidthPUV = d.nWidthP / d.bleh->xRatioUV; @@ -420,9 +415,9 @@ auto CreateFlow(auto in, auto out, auto core, auto vsapi) { d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV; d.VPitchY = (d.nWidthP + 15) & (~15); d.VPitchUV = (d.nWidthPUV + 15) & (~15); - d.upsizer = new SimpleResize(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY); + d.upsizer = new SimpleResize(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY, d.bleh->nWidth, d.bleh->nHeight, d.bleh->nPel); if (d.vi->format->colorFamily != cmGray) - d.upsizerUV = new SimpleResize(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY); + d.upsizerUV = new SimpleResize(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY, d.nWidthUV, d.nHeightUV, d.bleh->nPel); if (static_cast(d.mode) == FlowModes::Fetch) d.flow_function = flowFetch; else if (static_cast(d.mode) == FlowModes::Shift) diff --git a/src/MVFlowBlur.hxx b/src/MVFlowBlur.hxx index b84be68..965dd1c 100644 --- a/src/MVFlowBlur.hxx +++ b/src/MVFlowBlur.hxx @@ -294,10 +294,10 @@ static const VSFrameRef *VS_CC mvflowblurGetFrame(int32_t n, int32_t activationR MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkX, VYSmallYF, nBlkX); - d->upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkX); - d->upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkX); - d->upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkX); - d->upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkX); + d->upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkX, true); + d->upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkX, false); + d->upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkX, true); + d->upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkX, false); FlowBlur(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, nRefPitches[0], VXFullYB, VXFullYF, VYFullYB, VYFullYF, VPitchY, @@ -322,11 +322,11 @@ static const VSFrameRef *VS_CC mvflowblurGetFrame(int32_t n, int32_t activationR VectorSmallMaskYToHalfUV(VXSmallYF, nBlkX, nBlkY, VXSmallUVF, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYF, nBlkX, nBlkY, VYSmallUVF, yRatioUV); - d->upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkX); - d->upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkX); + d->upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkX, true); + d->upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkX, false); - d->upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkX); - d->upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkX); + d->upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkX, true); + d->upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkX, false); FlowBlur(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, nRefPitches[1], @@ -614,9 +614,9 @@ static void VS_CC mvflowblurCreate(const VSMap *in, VSMap *out, void *userData, d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV; d.VPitchY = d.bleh->nWidth; d.VPitchUV = d.nWidthUV; - d.upsizer = new SimpleResize(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY); + d.upsizer = new SimpleResize(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel); if (d.vi->format->colorFamily != cmGray) - d.upsizerUV = new SimpleResize(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY); + d.upsizerUV = new SimpleResize(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel); data = new MVFlowBlurData; *data = d; vsapi->createFilter(in, out, "FlowBlur", mvflowblurInit, mvflowblurGetFrame, mvflowblurFree, fmParallel, 0, data, core); diff --git a/src/MVFlowFPS.hxx b/src/MVFlowFPS.hxx index f566ff1..212981b 100644 --- a/src/MVFlowFPS.hxx +++ b/src/MVFlowFPS.hxx @@ -261,89 +261,52 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe int32_t nOffsetUV = nRefPitches[1] * nVPaddingUV * nPel + nHPaddingUV * bytesPerSample * nPel; if (nright != d->nrightLast) { - MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYB, nBlkXP, VYSmallYB, nBlkXP); - if (nBlkXP > nBlkX) {// fill right - for (int32_t j = 0; j nBlkY) {// fill bottom - for (int32_t i = 0; iResize(VXFullYB, VPitchY, VXSmallYB, nBlkXP); - upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP); + CheckAndPadSmallY(VXSmallYB, VYSmallYB, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP, true); + upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP, false); if (d->vi.format->colorFamily != cmGray) { VectorSmallMaskYToHalfUV(VXSmallYB, nBlkXP, nBlkYP, VXSmallUVB, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYB, nBlkXP, nBlkYP, VYSmallUVB, yRatioUV); - upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP); - upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP); + upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP, true); + upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP, false); } } // analyze vectors field to detect occlusion // double occNormB = (256-time256)/(256*ml); // MakeVectorOcclusionMask(mvClipB, nBlkX, nBlkY, occNormB, 1.0, nPel, MaskSmallB, nBlkXP); MakeVectorOcclusionMaskTime(ballsB, true, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallB, nBlkXP, (256 - time256), nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY); - if (nBlkXP > nBlkX) // fill right - for (int32_t j = 0; j nBlkY) // fill bottom - for (int32_t i = 0; iResize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP); + CheckAndPadMaskSmall(MaskSmallB, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP, false); if (d->vi.format->colorFamily != cmGray) - upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP); + upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP, false); d->nrightLast = nright; if (nleft != d->nleftLast) { MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkXP, VYSmallYF, nBlkXP); - if (nBlkXP > nBlkX) {// fill right - for (int32_t j = 0; j nBlkY) {// fill bottom - for (int32_t i = 0; iResize(VXFullYF, VPitchY, VXSmallYF, nBlkXP); - upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP); + CheckAndPadSmallY(VXSmallYF, VYSmallYF, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP, true); + upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP, false); if (d->vi.format->colorFamily != cmGray) { VectorSmallMaskYToHalfUV(VXSmallYF, nBlkXP, nBlkYP, VXSmallUVF, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYF, nBlkXP, nBlkYP, VYSmallUVF, yRatioUV); - upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP); - upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP); + upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP, true); + upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP, false); } } // analyze vectors field to detect occlusion // double occNormF = time256/(256*ml); // MakeVectorOcclusionMask(mvClipF, nBlkX, nBlkY, occNormF, 1.0, nPel, MaskSmallF, nBlkXP); MakeVectorOcclusionMaskTime(ballsF, false, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallF, nBlkXP, time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY); - if (nBlkXP > nBlkX) // fill right - for (int32_t j = 0; j nBlkY) // fill bottom - for (int32_t i = 0; iResize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP); + CheckAndPadMaskSmall(MaskSmallF, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP, false); if (d->vi.format->colorFamily != cmGray) - upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP); + upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP, false); d->nleftLast = nleft; @@ -364,28 +327,13 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe // get vector mask from extra frames MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYBB, nBlkXP, VYSmallYBB, nBlkXP); MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYFF, nBlkXP, VYSmallYFF, nBlkXP); - if (nBlkXP > nBlkX) {// fill right - for (int32_t j = 0; j nBlkY) {// fill bottom - for (int32_t i = 0; iResize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP); - upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP); + CheckAndPadSmallY(VXSmallYBB, VYSmallYBB, nBlkXP, nBlkYP, nBlkX, nBlkY); + CheckAndPadSmallY(VXSmallYFF, VYSmallYFF, nBlkXP, nBlkYP, nBlkX, nBlkY); + upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP, true); + upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP, false); - upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP); - upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP); + upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP, true); + upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP, false); FlowInterExtra(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, pSrc[0] + nOffsetY, nRefPitches[0], VXFullYB, VXFullYF, VYFullYB, VYFullYF, MaskFullYB, MaskFullYF, VPitchY, @@ -396,11 +344,11 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe VectorSmallMaskYToHalfUV(VXSmallYFF, nBlkXP, nBlkYP, VXSmallUVFF, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYFF, nBlkXP, nBlkYP, VYSmallUVFF, yRatioUV); - upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP); - upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP); + upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP, true); + upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP, false); - upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP); - upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP); + upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP, true); + upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP, false); FlowInterExtra(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, pSrc[1] + nOffsetUV, nRefPitches[1], VXFullUVB, VXFullUVF, VYFullUVB, VYFullUVF, MaskFullUVB, MaskFullUVF, VPitchUV, @@ -858,8 +806,12 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V return; } - d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX; - d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY; + d.nBlkXP = d.bleh->nBlkX; + while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) + d.nBlkXP++; + d.nBlkYP = d.bleh->nBlkY; + while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) + d.nBlkYP++; d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX; d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY; @@ -907,8 +859,8 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V d.MaskSmallF = new double[d.nBlkXP * d.nBlkYP]; d.MaskFullYF = new double[d.nHeightP * d.VPitchY]; - d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP); - d.upsizer2 = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP); + d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel); + d.upsizer2 = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0); if (d.vi.format->colorFamily != cmGray) { d.VXFullUVB = new int32_t[d.nHeightPUV * d.VPitchUV]; @@ -934,8 +886,8 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V d.MaskFullUVB = new double[d.nHeightPUV * d.VPitchUV]; d.MaskFullUVF = new double[d.nHeightPUV * d.VPitchUV]; - d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP); - d.upsizerUV2 = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP); + d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel); + d.upsizerUV2 = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0); } diff --git a/src/MVFlowInter.hxx b/src/MVFlowInter.hxx index f0c6997..ef4a068 100644 --- a/src/MVFlowInter.hxx +++ b/src/MVFlowInter.hxx @@ -172,59 +172,25 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation double *MaskFullUVB = nullptr; double *MaskFullUVF = nullptr; - MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYB, nBlkXP, VYSmallYB, nBlkXP); MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkXP, VYSmallYF, nBlkXP); - if (nBlkXP > nBlkX) // fill right - { - for (int32_t j = 0; j nBlkY) // fill bottom - { - for (int32_t i = 0; i nBlkX) // fill right - { - for (int32_t j = 0; j nBlkY) // fill bottom - { - for (int32_t i = 0; iResize(VXFullYB, VPitchY, VXSmallYB, nBlkXP); - upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP); - upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP); - upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP); - upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP); - upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP); + // upsize (bilinear interpolate) vector masks to fullframe size + upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP, true); + upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP, false); + upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP, true); + upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP, false); + upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP, false); + upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP, false); if (d->vi->format->colorFamily != cmGray) { VXFullUVB = new int32_t[nHeightPUV * VPitchUV]; @@ -243,12 +209,12 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation VectorSmallMaskYToHalfUV(VXSmallYF, nBlkXP, nBlkYP, VXSmallUVF, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYF, nBlkXP, nBlkYP, VYSmallUVF, yRatioUV); - upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP); - upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP); - upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP); - upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP); - upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP); - upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP); + upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP, true); + upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP, false); + upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP, true); + upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP, false); + upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP, false); + upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP, false); } @@ -275,32 +241,14 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation // get vector mask from extra frames MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYBB, nBlkXP, VYSmallYBB, nBlkXP); MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYFF, nBlkXP, VYSmallYFF, nBlkXP); - if (nBlkXP > nBlkX) // fill right - { - for (int32_t j = 0; j nBlkY) // fill bottom - { - for (int32_t i = 0; iResize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP); - upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP); - upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP); - upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP); + upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP, true); + upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP, false); + upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP, true); + upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP, false); FlowInterExtra(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, pSrc[0] + nOffsetY, nRefPitches[0], VXFullYB, VXFullYF, VYFullYB, VYFullYF, MaskFullYB, MaskFullYF, VPitchY, @@ -321,10 +269,10 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation VectorSmallMaskYToHalfUV(VXSmallYFF, nBlkXP, nBlkYP, VXSmallUVFF, xRatioUV); VectorSmallMaskYToHalfUV(VYSmallYFF, nBlkXP, nBlkYP, VYSmallUVFF, yRatioUV); - upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP); - upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP); - upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP); - upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP); + upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP, true); + upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP, false); + upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP, true); + upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP, false); FlowInterExtra(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, pSrc[1] + nOffsetUV, nRefPitches[1], VXFullUVB, VXFullUVF, VYFullUVB, VYFullUVF, MaskFullUVB, MaskFullUVF, VPitchUV, @@ -675,8 +623,12 @@ static void VS_CC mvflowinterCreate(const VSMap *in, VSMap *out, void *userData, delete d.mvClipF; return; } - d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX; - d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY; + d.nBlkXP = d.bleh->nBlkX; + while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) + d.nBlkXP++; + d.nBlkYP = d.bleh->nBlkY; + while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) + d.nBlkYP++; d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX; d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY; d.nWidthPUV = d.nWidthP / d.bleh->xRatioUV; @@ -687,11 +639,11 @@ static void VS_CC mvflowinterCreate(const VSMap *in, VSMap *out, void *userData, d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV; d.VPitchY = (d.nWidthP + 15) & (~15); d.VPitchUV = (d.nWidthPUV + 15) & (~15); - d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP); - d.upsizer2 = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP); + d.upsizer = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel); + d.upsizer2 = new SimpleResize(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0); if (d.vi->format->colorFamily != cmGray) { - d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP); - d.upsizerUV2 = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP); + d.upsizerUV = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel); + d.upsizerUV2 = new SimpleResize(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0); } data = new MVFlowInterData; *data = d; diff --git a/src/MVInterface.h b/src/MVInterface.h index 60cf515..a99a7b9 100644 --- a/src/MVInterface.h +++ b/src/MVInterface.h @@ -23,7 +23,7 @@ constexpr auto MVAnalysisDataVersion = 5; struct VectorStructure { self(x, 0_i32); self(y, 0_i32); - self(sad, -1.f); + self(sad, -1.); }; constexpr auto N_PER_BLOCK = sizeof(VectorStructure) / sizeof(std::int32_t); diff --git a/src/MVMask.hxx b/src/MVMask.hxx index 8c6d541..972a922 100644 --- a/src/MVMask.hxx +++ b/src/MVMask.hxx @@ -110,8 +110,8 @@ struct MVMaskData final { nWidthUV = bleh->nWidth / bleh->xRatioUV; nHeightBUV = nHeightB / bleh->yRatioUV; nWidthBUV = nWidthB / bleh->xRatioUV; - upsizer = new SimpleResize(nWidthB, nHeightB, bleh->nBlkX, bleh->nBlkY); - upsizerUV = new SimpleResize(nWidthBUV, nHeightBUV, bleh->nBlkX, bleh->nBlkY); + upsizer = new SimpleResize(nWidthB, nHeightB, bleh->nBlkX, bleh->nBlkY, 0, 0, 0); + upsizerUV = new SimpleResize(nWidthBUV, nHeightBUV, bleh->nBlkX, bleh->nBlkY, 0, 0, 0); } MVMaskData(const MVMaskData &) = delete; MVMaskData(MVMaskData &&) = delete; @@ -225,7 +225,7 @@ static auto VS_CC mvmaskGetFrame(int n, int activationReason, void **instanceDat if (kind == 5) std::memcpy(pDst[0], pSrc, nSrcPitch * nHeight * sizeof(float)); else { - upsizer->Resize(pDst[0], nDstPitches[0], smallMask.get(), nBlkX); + upsizer->Resize(pDst[0], nDstPitches[0], smallMask.get(), nBlkX, false); if (nWidth > nWidthB) for (auto h = 0; h < nHeight; ++h) for (auto w = nWidthB; w < nWidth; ++w) @@ -236,9 +236,9 @@ static auto VS_CC mvmaskGetFrame(int n, int activationReason, void **instanceDat }; auto SmallMaskToChroma = [&]() { if (!d->IsGray) { - upsizerUV->Resize(pDst[1], nDstPitches[1], smallMask.get(), nBlkX); + upsizerUV->Resize(pDst[1], nDstPitches[1], smallMask.get(), nBlkX, false); if (kind == 5) - upsizerUV->Resize(pDst[2], nDstPitches[2], smallMaskV.get(), nBlkX); + upsizerUV->Resize(pDst[2], nDstPitches[2], smallMaskV.get(), nBlkX, false); else std::memcpy(pDst[2], pDst[1], nHeightUV * nDstPitches[1] * sizeof(float)); if (nWidthUV > nWidthBUV) diff --git a/src/MaskFun.hpp b/src/MaskFun.hpp index 23b2a3e..9589e52 100644 --- a/src/MaskFun.hpp +++ b/src/MaskFun.hpp @@ -7,22 +7,50 @@ #include "MVFrame.h" #include "SADFunctions.hpp" +auto CheckAndPadMaskSmall(auto MaskSmall, auto nBlkXP, auto nBlkYP, auto nBlkX, auto nBlkY) { + if (nBlkXP > nBlkX) + for (auto j : Range{ nBlkY }) + for (auto dx : Range{ nBlkX, nBlkXP }) + MaskSmall[j * nBlkXP + dx] = MaskSmall[j * nBlkXP + nBlkX - 1]; + if (nBlkYP > nBlkY) + for (auto i : Range{ nBlkXP }) + for (auto dy : Range{ nBlkY, nBlkYP }) + MaskSmall[nBlkXP * dy + i] = MaskSmall[nBlkXP * (nBlkY - 1) + i]; +} + +auto CheckAndPadSmallY(auto VXSmallY, auto VYSmallY, auto nBlkXP, auto nBlkYP, auto nBlkX, auto nBlkY) { + using PixelType = std::decay_t; + constexpr auto Zero = static_cast(0); + if (nBlkXP > nBlkX) + for (auto j : Range{ nBlkY }) + for (auto dx : Range{ nBlkX, nBlkXP }) { + VXSmallY[j * nBlkXP + dx] = std::min(VXSmallY[j * nBlkXP + nBlkX - 1], Zero); + VYSmallY[j * nBlkXP + dx] = VYSmallY[j * nBlkXP + nBlkX - 1]; + } + if (nBlkYP > nBlkY) + for (auto i : Range{ nBlkXP }) + for (auto dy : Range{ nBlkY, nBlkYP }) { + VXSmallY[nBlkXP * dy + i] = VXSmallY[nBlkXP * (nBlkY - 1) + i]; + VYSmallY[nBlkXP * dy + i] = std::min(VYSmallY[nBlkXP * (nBlkY - 1) + i], Zero); + } +} + template -static auto RealMerge4PlanesToBig(uint8_t *pel2Plane_u8, int32_t pel2Pitch, const uint8_t *pPlane0_u8, const uint8_t *pPlane1_u8, - const uint8_t *pPlane2_u8, const uint8_t * pPlane3_u8, int32_t width, int32_t height, int32_t pitch) { - for (auto h = 0; h(pel2Plane_u8); - auto pPlane0 = reinterpret_cast(pPlane0_u8); - auto pPlane1 = reinterpret_cast(pPlane1_u8); +static auto RealMerge4PlanesToBig(uint8_t* pel2Plane_u8, int32_t pel2Pitch, const uint8_t* pPlane0_u8, const uint8_t* pPlane1_u8, + const uint8_t* pPlane2_u8, const uint8_t* pPlane3_u8, int32_t width, int32_t height, int32_t pitch) { + for (auto h = 0; h < height; ++h) { + for (auto w = 0; w < width; ++w) { + auto pel2Plane = reinterpret_cast(pel2Plane_u8); + auto pPlane0 = reinterpret_cast(pPlane0_u8); + auto pPlane1 = reinterpret_cast(pPlane1_u8); pel2Plane[w << 1] = pPlane0[w]; pel2Plane[(w << 1) + 1] = pPlane1[w]; } pel2Plane_u8 += pel2Pitch; - for (auto w = 0; w(pel2Plane_u8); - auto pPlane2 = reinterpret_cast(pPlane2_u8); - auto pPlane3 = reinterpret_cast(pPlane3_u8); + for (auto w = 0; w < width; ++w) { + auto pel2Plane = reinterpret_cast(pel2Plane_u8); + auto pPlane2 = reinterpret_cast(pPlane2_u8); + auto pPlane3 = reinterpret_cast(pPlane3_u8); pel2Plane[w << 1] = pPlane2[w]; pel2Plane[(w << 1) + 1] = pPlane3[w]; } @@ -34,60 +62,60 @@ static auto RealMerge4PlanesToBig(uint8_t *pel2Plane_u8, int32_t pel2Pitch, cons } } -static auto Merge4PlanesToBig(uint8_t *pel2Plane, int32_t pel2Pitch, const uint8_t *pPlane0, const uint8_t *pPlane1, const uint8_t *pPlane2, const uint8_t *pPlane3, int32_t width, int32_t height, int32_t pitch) { +static auto Merge4PlanesToBig(uint8_t* pel2Plane, int32_t pel2Pitch, const uint8_t* pPlane0, const uint8_t* pPlane1, const uint8_t* pPlane2, const uint8_t* pPlane3, int32_t width, int32_t height, int32_t pitch) { RealMerge4PlanesToBig(pel2Plane, pel2Pitch, pPlane0, pPlane1, pPlane2, pPlane3, width, height, pitch); } template -static void RealMerge16PlanesToBig(uint8_t *pel4Plane_u8, int32_t pel4Pitch, - const uint8_t *pPlane0_u8, const uint8_t *pPlane1_u8, const uint8_t *pPlane2_u8, const uint8_t * pPlane3_u8, - const uint8_t *pPlane4_u8, const uint8_t *pPlane5_u8, const uint8_t *pPlane6_u8, const uint8_t * pPlane7_u8, - const uint8_t *pPlane8_u8, const uint8_t *pPlane9_u8, const uint8_t *pPlane10_u8, const uint8_t * pPlane11_u8, - const uint8_t *pPlane12_u8, const uint8_t * pPlane13_u8, const uint8_t *pPlane14_u8, const uint8_t * pPlane15_u8, +static void RealMerge16PlanesToBig(uint8_t* pel4Plane_u8, int32_t pel4Pitch, + const uint8_t* pPlane0_u8, const uint8_t* pPlane1_u8, const uint8_t* pPlane2_u8, const uint8_t* pPlane3_u8, + const uint8_t* pPlane4_u8, const uint8_t* pPlane5_u8, const uint8_t* pPlane6_u8, const uint8_t* pPlane7_u8, + const uint8_t* pPlane8_u8, const uint8_t* pPlane9_u8, const uint8_t* pPlane10_u8, const uint8_t* pPlane11_u8, + const uint8_t* pPlane12_u8, const uint8_t* pPlane13_u8, const uint8_t* pPlane14_u8, const uint8_t* pPlane15_u8, int32_t width, int32_t height, int32_t pitch) { - for (auto h = 0; h(pel4Plane, pel4Pitch, pPlane0, pPlane1, pPlane2, pPlane3, pPlane4, pPlane5, pPlane6, pPlane7, pPlane8, pPlane9, pPlane10, pPlane11, pPlane12, pPlane13, pPlane14, pPlane15, width, height, pitch); } -static void MakeVectorSmallMasks(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, int32_t *VXSmallY, int32_t pitchVXSmallY, int32_t *VYSmallY, int32_t pitchVYSmallY) { +static void MakeVectorSmallMasks(MVClipBalls& mvClip, int32_t nBlkX, int32_t nBlkY, int32_t* VXSmallY, int32_t pitchVXSmallY, int32_t* VYSmallY, int32_t pitchVYSmallY) { for (auto by = 0; by < nBlkY; ++by) for (auto bx = 0; bx < nBlkX; ++bx) { auto i = bx + by * nBlkX; - auto &block = mvClip[0][i]; + auto& block = mvClip[0][i]; int32_t vx = block.GetMV().x; int32_t vy = block.GetMV().y; - VXSmallY[bx + by*pitchVXSmallY] = vx; - VYSmallY[bx + by*pitchVYSmallY] = vy; + VXSmallY[bx + by * pitchVXSmallY] = vx; + VYSmallY[bx + by * pitchVYSmallY] = vy; } } -static void VectorSmallMaskYToHalfUV(int32_t *VSmallY, int32_t nBlkX, int32_t nBlkY, int32_t *VSmallUV, int32_t ratioUV) { +static void VectorSmallMaskYToHalfUV(int32_t* VSmallY, int32_t nBlkX, int32_t nBlkY, int32_t* VSmallUV, int32_t ratioUV) { if (ratioUV == 2) { for (auto by = 0; by < nBlkY; ++by) { for (auto bx = 0; bx < nBlkX; ++bx) @@ -154,12 +182,12 @@ static void VectorSmallMaskYToHalfUV(int32_t *VSmallY, int32_t nBlkX, int32_t nB } template -static void RealBlend(uint8_t * pdst, const uint8_t * psrc, const uint8_t * pref, int32_t height, int32_t width, int32_t dst_pitch, int32_t src_pitch, int32_t ref_pitch, int32_t time256) { - for (auto h = 0; h(pdst, psrc, pref, height, width, dst_pitch, src_pitch, ref_pitch, time256); } -static inline void ByteOccMask(double *occMask, int32_t occlusion, double occnorm, double fGamma) { +static inline void ByteOccMask(double* occMask, int32_t occlusion, double occnorm, double fGamma) { if (fGamma == 1.0) *occMask = std::max(*occMask, std::min((255. * occlusion * occnorm), 255.)); else *occMask = std::max(*occMask, std::min((255. * pow(occlusion * occnorm, fGamma)), 255.)); } -static void MakeVectorOcclusionMaskTime(MVClipBalls &mvClip, bool isb, int32_t nBlkX, int32_t nBlkY, double dMaskNormDivider, double fGamma, int32_t nPel, double *occMask, int32_t occMaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) { +static void MakeVectorOcclusionMaskTime(MVClipBalls& mvClip, bool isb, int32_t nBlkX, int32_t nBlkY, double dMaskNormDivider, double fGamma, int32_t nPel, double* occMask, int32_t occMaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) { for (auto i = 0; i < occMaskPitch * nBlkY; ++i) occMask[i] = 0.; int time4096X = time256 * 16 / (nBlkStepX * nPel); @@ -187,17 +215,17 @@ static void MakeVectorOcclusionMaskTime(MVClipBalls &mvClip, bool isb, int32_t n double occnormX = 80. / (dMaskNormDivider * nBlkStepX * nPel); double occnormY = 80. / (dMaskNormDivider * nBlkStepY * nPel); int32_t occlusion; - for (auto by = 0; by 255.) ? 255. : l; } -static void MakeSADMaskTime(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, double dSADNormFactor, double fGamma, int32_t nPel, double *Mask, int32_t MaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) { +static void MakeSADMaskTime(MVClipBalls& mvClip, int32_t nBlkX, int32_t nBlkY, double dSADNormFactor, double fGamma, int32_t nPel, double* Mask, int32_t MaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) { for (auto i = 0; i < nBlkY * MaskPitch; ++i) Mask[i] = 0.; int32_t time4096X = (256 - time256) * 16 / (nBlkStepX * nPel); @@ -233,7 +261,7 @@ static void MakeSADMaskTime(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, d for (auto by = 0; by < nBlkY; ++by) { for (auto bx = 0; bx < nBlkX; ++bx) { auto i = bx + by * nBlkX; - auto &block = mvClip[0][i]; + auto& block = mvClip[0][i]; int32_t vx = block.GetMV().x; int32_t vy = block.GetMV().y; int32_t bxi = bx - vx * time4096X / 4096; @@ -256,33 +284,33 @@ static inline float Median3r(float a, float b, float c) { } template -static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void RealFlowInterExtra(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel, - int32_t *VXFullBB, int32_t *VXFullFF, int32_t *VYFullBB, int32_t *VYFullFF) { - const PixelType *prefB = reinterpret_cast(prefB8); - const PixelType *prefF = reinterpret_cast(prefF8); - PixelType *pdst = (PixelType *)pdst8; + int32_t* VXFullBB, int32_t* VXFullFF, int32_t* VYFullBB, int32_t* VYFullFF) { + const PixelType* prefB = reinterpret_cast(prefB8); + const PixelType* prefF = reinterpret_cast(prefF8); + PixelType* pdst = (PixelType*)pdst8; ref_pitch /= sizeof(PixelType); dst_pitch /= sizeof(PixelType); if (nPel == 1) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - int32_t adrF = vyF*ref_pitch + vxF + w; + int32_t adrF = vyF * ref_pitch + vxF + w; float dstF = prefF[adrF]; auto vxFF = (VXFullFF[w] * time256) >> 8; auto vyFF = (VYFullFF[w] * time256) >> 8; - int32_t adrFF = vyFF*ref_pitch + vxFF + w; + int32_t adrFF = vyFF * ref_pitch + vxFF + w; float dstFF = prefF[adrFF]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - int32_t adrB = vyB*ref_pitch + vxB + w; + int32_t adrB = vyB * ref_pitch + vxB + w; float dstB = prefB[adrB]; auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8; auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8; - int32_t adrBB = vyBB*ref_pitch + vxBB + w; + int32_t adrBB = vyBB * ref_pitch + vxBB + w; float dstBB = prefB[adrBB]; float minfb; float maxfb; @@ -294,8 +322,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t maxfb = dstB; minfb = dstF; } - pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) + - ((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256); + pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) + + ((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch; @@ -313,23 +341,23 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 2) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - int32_t adrF = vyF*ref_pitch + vxF + (w << 1); + int32_t adrF = vyF * ref_pitch + vxF + (w << 1); float dstF = prefF[adrF]; auto vxFF = (VXFullFF[w] * time256) >> 8; auto vyFF = (VYFullFF[w] * time256) >> 8; - int32_t adrFF = vyFF*ref_pitch + vxFF + (w << 1); + int32_t adrFF = vyFF * ref_pitch + vxFF + (w << 1); float dstFF = prefF[adrFF]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - int32_t adrB = vyB*ref_pitch + vxB + (w << 1); + int32_t adrB = vyB * ref_pitch + vxB + (w << 1); float dstB = prefB[adrB]; auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8; auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8; - int32_t adrBB = vyBB*ref_pitch + vxBB + (w << 1); + int32_t adrBB = vyBB * ref_pitch + vxBB + (w << 1); float dstBB = prefB[adrBB]; float minfb; float maxfb; @@ -341,8 +369,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t maxfb = dstB; minfb = dstF; } - pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) + - ((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256); + pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) + + ((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 1; @@ -360,20 +388,20 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 4) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)]; auto vxFF = (VXFullFF[w] * time256) >> 8; auto vyFF = (VYFullFF[w] * time256) >> 8; - float dstFF = prefF[vyFF*ref_pitch + vxFF + (w << 2)]; + float dstFF = prefF[vyFF * ref_pitch + vxFF + (w << 2)]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)]; + float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)]; auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8; auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8; - float dstBB = prefB[vyBB*ref_pitch + vxBB + (w << 2)]; + float dstBB = prefB[vyBB * ref_pitch + vxBB + (w << 2)]; float minfb; float maxfb; if (dstF > dstB) { @@ -384,8 +412,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t maxfb = dstB; minfb = dstF; } - pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) + - ((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256); + pdst[w] = static_cast((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) + + ((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 2; @@ -404,20 +432,20 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } -static void FlowInterExtra(uint8_t * pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void FlowInterExtra(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel, - int32_t *VXFullBB, int32_t *VXFullFF, int32_t *VYFullBB, int32_t *VYFullFF) { + int32_t* VXFullBB, int32_t* VXFullFF, int32_t* VYFullBB, int32_t* VYFullFF) { RealFlowInterExtra(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel, VXFullBB, VXFullFF, VYFullBB, VYFullFF); } template -static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void RealFlowInter(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) { - const PixelType *prefB = reinterpret_cast(prefB8); - const PixelType *prefF = reinterpret_cast(prefF8); - PixelType *pdst = reinterpret_cast(pdst8); + const PixelType* prefB = reinterpret_cast(prefB8); + const PixelType* prefF = reinterpret_cast(prefF8); + PixelType* pdst = reinterpret_cast(pdst8); ref_pitch /= sizeof(PixelType); dst_pitch /= sizeof(PixelType); if (nPel == 1) { @@ -425,14 +453,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref for (auto w = 0; w < width; ++w) { auto vxF = (VXFullF[w] * time256) >> 8; auto vyF = (VYFullF[w] * time256) >> 8; - double dstF = prefF[vyF*ref_pitch + vxF + w]; + double dstF = prefF[vyF * ref_pitch + vxF + w]; float dstF0 = prefF[w]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - double dstB = prefB[vyB*ref_pitch + vxB + w]; + double dstB = prefB[vyB * ref_pitch + vxB + w]; float dstB0 = prefB[w]; - pdst[w] = static_cast((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) + - ((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256); + pdst[w] = static_cast((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) + + ((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch; @@ -450,14 +478,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref for (auto w = 0; w < width; ++w) { auto vxF = (VXFullF[w] * time256) >> 8; auto vyF = (VYFullF[w] * time256) >> 8; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)]; float dstF0 = prefF[(w << 1)]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)]; + float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)]; float dstB0 = prefB[(w << 1)]; - pdst[w] = static_cast((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) + - ((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256); + pdst[w] = static_cast((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) + + ((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 1; @@ -475,14 +503,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref for (auto w = 0; w < width; ++w) { auto vxF = (VXFullF[w] * time256) >> 8; auto vyF = (VYFullF[w] * time256) >> 8; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)]; float dstF0 = prefF[(w << 2)]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)]; + float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)]; float dstB0 = prefB[(w << 2)]; - pdst[w] = static_cast((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) + - ((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256); + pdst[w] = static_cast((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) + + ((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 2; @@ -497,37 +525,37 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref } } -static void FlowInter(uint8_t *pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void FlowInter(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) { RealFlowInter(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel); } template -static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void RealFlowInterSimple(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) { - const PixelType *prefB = reinterpret_cast(prefB8); - const PixelType *prefF = reinterpret_cast(prefF8); - PixelType *pdst = (PixelType *)pdst8; + const PixelType* prefB = reinterpret_cast(prefB8); + const PixelType* prefF = reinterpret_cast(prefF8); + PixelType* pdst = (PixelType*)pdst8; ref_pitch /= sizeof(PixelType); dst_pitch /= sizeof(PixelType); if (time256 == 128) { if (nPel == 1) { - for (auto h = 0; h> 1; auto vyF = VYFullF[w] >> 1; - int32_t addrF = vyF*ref_pitch + vxF + w; + int32_t addrF = vyF * ref_pitch + vxF + w; float dstF = prefF[addrF]; float dstF1 = prefF[addrF + 1]; auto vxB = VXFullB[w] >> 1; auto vyB = VYFullB[w] >> 1; - int32_t addrB = vyB*ref_pitch + vxB + w; + int32_t addrB = vyB * ref_pitch + vxB + w; float dstB = prefB[addrB]; float dstB1 = prefB[addrB + 1]; - pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512); - pdst[w + 1] = static_cast((((dstF1 + dstB1) * 256) + (dstB1 - dstF1)*(MaskF[w + 1] - MaskB[w + 1])) / 512); + pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512); + pdst[w + 1] = static_cast((((dstF1 + dstB1) * 256) + (dstB1 - dstF1) * (MaskF[w + 1] - MaskB[w + 1])) / 512); } pdst += dst_pitch; prefB += ref_pitch; @@ -541,15 +569,15 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 2) { - for (auto h = 0; h> 1; auto vyF = VYFullF[w] >> 1; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)]; auto vxB = VXFullB[w] >> 1; auto vyB = VYFullB[w] >> 1; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)]; - pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512); + float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)]; + pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512); } pdst += dst_pitch; prefB += ref_pitch << 1; @@ -563,15 +591,15 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 4) { - for (auto h = 0; h> 1; auto vyF = VYFullF[w] >> 1; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)]; auto vxB = VXFullB[w] >> 1; auto vyB = VYFullB[w] >> 1; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)]; - pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512); + float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)]; + pdst[w] = static_cast((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512); } pdst += dst_pitch; prefB += ref_pitch << 2; @@ -587,22 +615,22 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } else { if (nPel == 1) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - int32_t addrF = vyF*ref_pitch + vxF + w; + int32_t addrF = vyF * ref_pitch + vxF + w; float dstF = prefF[addrF]; float dstF1 = prefF[addrF + 1]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - int32_t addrB = vyB*ref_pitch + vxB + w; + int32_t addrB = vyB * ref_pitch + vxB + w; float dstB = prefB[addrB]; float dstB1 = prefB[addrB + 1]; - pdst[w] = static_cast((((dstF * 255 + (dstB - dstF)*MaskF[w]))*(256 - time256) + - ((dstB * 255 - (dstB - dstF)*MaskB[w]))*time256) / 65536); - pdst[w + 1] = static_cast((((dstF1 * 255 + (dstB1 - dstF1)*MaskF[w + 1]))*(256 - time256) + - ((dstB1 * 255 - (dstB1 - dstF1)*MaskB[w + 1]))*time256) / 65536); + pdst[w] = static_cast((((dstF * 255 + (dstB - dstF) * MaskF[w])) * (256 - time256) + + ((dstB * 255 - (dstB - dstF) * MaskB[w])) * time256) / 65536); + pdst[w + 1] = static_cast((((dstF1 * 255 + (dstB1 - dstF1) * MaskF[w + 1])) * (256 - time256) + + ((dstB1 * 255 - (dstB1 - dstF1) * MaskB[w + 1])) * time256) / 65536); } pdst += dst_pitch; prefB += ref_pitch; @@ -616,16 +644,16 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 2) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)]; - pdst[w] = static_cast((((dstF*(255 - MaskF[w]) + dstB*MaskF[w]) / 256)*(256 - time256) + - ((dstB*(255 - MaskB[w]) + dstF*MaskB[w]) / 256)*time256) / 256); + float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)]; + pdst[w] = static_cast((((dstF * (255 - MaskF[w]) + dstB * MaskF[w]) / 256) * (256 - time256) + + ((dstB * (255 - MaskB[w]) + dstF * MaskB[w]) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 1; @@ -639,16 +667,16 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } else if (nPel == 4) { - for (auto h = 0; h> 8; auto vyF = (VYFullF[w] * time256) >> 8; - float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)]; + float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)]; auto vxB = (VXFullB[w] * (256 - time256)) >> 8; auto vyB = (VYFullB[w] * (256 - time256)) >> 8; - float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)]; - pdst[w] = static_cast((((dstF*(255 - MaskF[w]) + dstB*MaskF[w]) / 256)*(256 - time256) + - ((dstB*(255 - MaskB[w]) + dstF*MaskB[w]) / 256)*time256) / 256); + float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)]; + pdst[w] = static_cast((((dstF * (255 - MaskF[w]) + dstB * MaskF[w]) / 256) * (256 - time256) + + ((dstB * (255 - MaskB[w]) + dstF * MaskB[w]) / 256) * time256) / 256); } pdst += dst_pitch; prefB += ref_pitch << 2; @@ -664,8 +692,8 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t } } -static void FlowInterSimple(uint8_t *pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch, - int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF, +static void FlowInterSimple(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch, + int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF, int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) { RealFlowInterSimple(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel); } diff --git a/src/PlaneOfBlocks.h b/src/PlaneOfBlocks.h index 7bd68b6..c14b372 100644 --- a/src/PlaneOfBlocks.h +++ b/src/PlaneOfBlocks.h @@ -129,16 +129,16 @@ class PlaneOfBlocks { inline const uint8_t* GetRefBlockV(int32_t nVx, int32_t nVy) { if (nPel == 2) return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel2( - x[1] * 2 + nVx / xRatioUV, - y[1] * 2 + nVy / yRatioUV); + x[2] * 2 + nVx / xRatioUV, + y[2] * 2 + nVy / yRatioUV); else if (nPel == 1) return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel1( - x[1] + nVx / xRatioUV, - y[1] + nVy / yRatioUV); + x[2] + nVx / xRatioUV, + y[2] + nVy / yRatioUV); else return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel4( - x[1] * 4 + nVx / xRatioUV, - y[1] * 4 + nVy / yRatioUV); + x[2] * 4 + nVx / xRatioUV, + y[2] * 4 + nVy / yRatioUV); } inline const uint8_t* GetSrcBlock(int32_t nX, int32_t nY) { return pSrcFrame->GetPlane(YPLANE)->GetAbsolutePelPointer(nX, nY); @@ -253,7 +253,7 @@ class PlaneOfBlocks { bestMV.x = vx; bestMV.y = vy; nMinCost = cost; - bestMV.sad = static_cast(sad + saduv); + bestMV.sad = sad + saduv; } } inline void CheckMV(int32_t vx, int32_t vy) { @@ -270,7 +270,7 @@ class PlaneOfBlocks { bestMV.x = vx; bestMV.y = vy; nMinCost = cost; - bestMV.sad = static_cast(sad + saduv); + bestMV.sad = sad + saduv; } } inline void CheckMV2(int32_t vx, int32_t vy, int32_t* dir, int32_t val) { @@ -287,7 +287,7 @@ class PlaneOfBlocks { bestMV.x = vx; bestMV.y = vy; nMinCost = cost; - bestMV.sad = static_cast(sad + saduv); + bestMV.sad = sad + saduv; *dir = val; } } @@ -303,7 +303,7 @@ class PlaneOfBlocks { cost += saduv + ((penaltyNew * saduv) / 256); if (cost >= nMinCost) return; nMinCost = cost; - bestMV.sad = static_cast(sad + saduv); + bestMV.sad = sad + saduv; *dir = val; } } @@ -528,7 +528,7 @@ class PlaneOfBlocks { SATD = satds[nBlkSizeX][nBlkSizeY]; if (!chroma) SADCHROMA = nullptr; - dctpitch = max(nBlkSizeX, 16) * 4; + dctpitch = nBlkSizeX * sizeof(float); dctSrc = vs_aligned_malloc(nBlkSizeY * dctpitch, ALIGN_PLANES); dctRef = vs_aligned_malloc(nBlkSizeY * dctpitch, ALIGN_PLANES); @@ -542,7 +542,7 @@ class PlaneOfBlocks { freqSize = 8192 * nPel * 2; freqArray = new int32_t[freqSize]; - verybigSAD = static_cast(nBlkSizeX) * nBlkSizeY; + verybigSAD = 1. * nBlkSizeX * nBlkSizeY; } ~PlaneOfBlocks() { delete[] vectors; @@ -805,7 +805,7 @@ class PlaneOfBlocks { + SADCHROMA(pSrc[2], nSrcPitch[2], GetRefBlockV(0, 0), nRefPitch[2]) : 0.f; sad = LumaSAD(GetRefBlock(0, zeroMVfieldShifted.y)); sad += saduv; - bestMV.sad = static_cast(sad); + bestMV.sad = sad; nMinCost = sad + ((penaltyZero * sad) / 256); // v.1.11.0.2 VectorStructure bestMVMany[8]; @@ -831,7 +831,7 @@ class PlaneOfBlocks { { bestMV.x = globalMVPredictor.x; bestMV.y = globalMVPredictor.y; - bestMV.sad = static_cast(sad); + bestMV.sad = sad; nMinCost = cost; } if (tryMany) @@ -851,7 +851,7 @@ class PlaneOfBlocks { { bestMV.x = predictor.x; bestMV.y = predictor.y; - bestMV.sad = static_cast(sad); + bestMV.sad = sad; nMinCost = cost; } if (tryMany) @@ -864,11 +864,12 @@ class PlaneOfBlocks { // then all the other predictors int32_t npred = (temporal) ? 5 : 4; + constexpr auto epsilon = 1e-5; for (int32_t i = 0; i < npred; i++) { if (tryMany) - nMinCost = verybigSAD; + nMinCost = verybigSAD + epsilon; CheckMV0(predictors[i].x, predictors[i].y); if (tryMany) { @@ -880,9 +881,8 @@ class PlaneOfBlocks { } - if (tryMany) - { // select best of multi best - nMinCost = verybigSAD; + if (tryMany) { // select best of multi best + nMinCost = verybigSAD + epsilon; for (int32_t i = 0; i < npred + 3; i++) { if (nMinCostMany[i] < nMinCost) @@ -1149,6 +1149,7 @@ class PlaneOfBlocks { planeSAD = 0.0; badcount = 0; tryMany = _tryMany; + sumLumaChange = 0.; // Functions using double must not be used here for (blky = 0; blky < nBlkY; blky++) @@ -1226,25 +1227,17 @@ class PlaneOfBlocks { /* search the mv */ predictor = ClipMV(vectors[blkIdx]); if (temporal) - predictors[4] = ClipMV(*reinterpret_cast(&vecPrev[blkIdx * N_PER_BLOCK])); // temporal predictor + predictors[4] = ClipMV(reinterpret_cast(vecPrev[blkIdx * N_PER_BLOCK])); // temporal predictor else predictors[4] = ClipMV(zeroMV); PseudoEPZSearch(); - if (outfilebuf != nullptr) // write vector to outfile - { - outfilebuf[blkx * 4 + 0] = bestMV.x; - outfilebuf[blkx * 4 + 1] = bestMV.y; - outfilebuf[blkx * 4 + 2] = reinterpret_cast(bestMV.sad); - - } /* write the results */ - pBlkData[blkx * N_PER_BLOCK + 0] = bestMV.x; - pBlkData[blkx * N_PER_BLOCK + 1] = bestMV.y; - pBlkData[blkx * N_PER_BLOCK + 2] = reinterpret_cast(bestMV.sad); + auto& BlockData = reinterpret_cast(pBlkData[blkx * N_PER_BLOCK]); + BlockData = bestMV; if (smallestPlane) @@ -1261,8 +1254,6 @@ class PlaneOfBlocks { } } pBlkData += nBlkX * N_PER_BLOCK; - if (outfilebuf != nullptr) // write vector to outfile - outfilebuf += nBlkX * 4;// 4 int32_t word per block y[0] += (nBlkSizeY - nOverlapY); if (pSrcFrame->GetMode() & UPLANE) @@ -1327,7 +1318,7 @@ class PlaneOfBlocks { { vectors[index].x = 9 * v1.x + 3 * v2.x + 3 * v3.x + v4.x; vectors[index].y = 9 * v1.y + 3 * v2.y + 3 * v3.y + v4.y; - temp_sad = 9 * static_cast(v1.sad) + 3 * v2.sad + 3 * v3.sad + v4.sad; + temp_sad = 9 * v1.sad + 3 * v2.sad + 3 * v3.sad + v4.sad; } else if (nOverlapX <= (nBlkSizeX >> 1) && nOverlapY <= (nBlkSizeY >> 1)) // corrected in v1.4.11 { @@ -1346,11 +1337,11 @@ class PlaneOfBlocks { // Dead branch. The overlap is no longer allowed to be more than half the block size. vectors[index].x = (v1.x + v2.x + v3.x + v4.x) << 2; vectors[index].y = (v1.y + v2.y + v3.y + v4.y) << 2; - temp_sad = (static_cast(v1.sad) + v2.sad + v3.sad + v4.sad) * 4; + temp_sad = (v1.sad + v2.sad + v3.sad + v4.sad) * 4; } vectors[index].x = vectors[index].x ? vectors[index].x / abs(vectors[index].x) * ((abs(vectors[index].x) >> normFactor) << mulFactor) : 0; vectors[index].y = vectors[index].y ? vectors[index].y / abs(vectors[index].y) * ((abs(vectors[index].y) >> normFactor) << mulFactor) : 0; - vectors[index].sad = static_cast(temp_sad / 16); + vectors[index].sad = temp_sad / 16; } } } @@ -1358,25 +1349,22 @@ class PlaneOfBlocks { array[0] = nBlkCount * N_PER_BLOCK + 1; } auto WriteDefaultToArray(int32_t* array, int32_t divideMode) { - auto verybigSAD_f = static_cast(verybigSAD); array[0] = nBlkCount * N_PER_BLOCK + 1; - for (int32_t i = 0; i < nBlkCount * N_PER_BLOCK; i += N_PER_BLOCK) - { - array[i + 1] = 0; - array[i + 2] = 0; - array[i + 3] = reinterpret_cast(verybigSAD_f); + for (auto i : Range{ 0, nBlkCount * N_PER_BLOCK, N_PER_BLOCK }) { + auto& BlockData = reinterpret_cast(array[i + 1]); + BlockData.x = 0; + BlockData.y = 0; + BlockData.sad = verybigSAD; } - - if (nLogScale == 0) - { + if (nLogScale == 0) { array += array[0]; if (divideMode) { // reserve space for divided subblocks extra level array[0] = nBlkCount * N_PER_BLOCK * 4 + 1; // 4 subblocks - for (int32_t i = 0; i < nBlkCount * 4 * N_PER_BLOCK; i += N_PER_BLOCK) - { - array[i + 1] = 0; - array[i + 2] = 0; - array[i + 3] = reinterpret_cast(verybigSAD_f); + for (auto i : Range{ 0, nBlkCount * 4 * N_PER_BLOCK, N_PER_BLOCK }) { + auto& BlockData = reinterpret_cast(array[i + 1]); + BlockData.x = 0; + BlockData.y = 0; + BlockData.sad = verybigSAD; } array += array[0]; } @@ -1632,15 +1620,15 @@ class PlaneOfBlocks { // interpolate int32_t vector1_x = vectorOld1.x * nStepXold + deltaX * (vectorOld2.x - vectorOld1.x); // scaled by nStepXold to skip slow division int32_t vector1_y = vectorOld1.y * nStepXold + deltaX * (vectorOld2.y - vectorOld1.y); - auto vector1_sad = static_cast(vectorOld1.sad) * nStepXold + deltaX * (vectorOld2.sad - vectorOld1.sad); + auto vector1_sad = vectorOld1.sad * nStepXold + deltaX * (vectorOld2.sad - vectorOld1.sad); int32_t vector2_x = vectorOld3.x * nStepXold + deltaX * (vectorOld4.x - vectorOld3.x); int32_t vector2_y = vectorOld3.y * nStepXold + deltaX * (vectorOld4.y - vectorOld3.y); - auto vector2_sad = static_cast(vectorOld3.sad) * nStepXold + deltaX * (vectorOld4.sad - vectorOld3.sad); + auto vector2_sad = vectorOld3.sad * nStepXold + deltaX * (vectorOld4.sad - vectorOld3.sad); vectorOld.x = (vector1_x + deltaY * (vector2_x - vector1_x) / nStepYold) / nStepXold; vectorOld.y = (vector1_y + deltaY * (vector2_y - vector1_y) / nStepYold) / nStepXold; - vectorOld.sad = static_cast((vector1_sad + deltaY * (vector2_sad - vector1_sad) / nStepYold) / nStepXold); + vectorOld.sad = (vector1_sad + deltaY * (vector2_sad - vector1_sad) / nStepYold) / nStepXold; } else // nearest @@ -1660,7 +1648,7 @@ class PlaneOfBlocks { vectorOld.y = vectorOld.y ? vectorOld.y / abs(vectorOld.y) * ((abs(vectorOld.y) << nLogPel) >> nLogPelold) : 0; predictor = ClipMV(vectorOld); // predictor - predictor.sad = static_cast(static_cast(vectorOld.sad) * (nBlkSizeX * nBlkSizeY) / (nBlkSizeXold * nBlkSizeYold)); // normalized to new block size + predictor.sad = vectorOld.sad * (nBlkSizeX * nBlkSizeY) / (nBlkSizeXold * nBlkSizeYold); // normalized to new block size bestMV.x = predictor.x; bestMV.y = predictor.y; @@ -1680,7 +1668,7 @@ class PlaneOfBlocks { + SADCHROMA(pSrc[2], nSrcPitch[2], GetRefBlockV(predictor.x, predictor.y), nRefPitch[2]) : 0.f; double sad = LumaSAD(GetRefBlock(predictor.x, predictor.y)); sad += saduv; - bestMV.sad = static_cast(sad); + bestMV.sad = sad; nMinCost = sad; @@ -1741,23 +1729,10 @@ class PlaneOfBlocks { vectors[blkIdx].sad = bestMV.sad; - if (outfilebuf != nullptr) // write vector to outfile - { - outfilebuf[blkx * 4 + 0] = bestMV.x; - outfilebuf[blkx * 4 + 1] = bestMV.y; - outfilebuf[blkx * 4 + 2] = reinterpret_cast(bestMV.sad); - - } - /* write the results */ - pBlkData[blkx * N_PER_BLOCK + 0] = bestMV.x; - pBlkData[blkx * N_PER_BLOCK + 1] = bestMV.y; - pBlkData[blkx * N_PER_BLOCK + 2] = reinterpret_cast(bestMV.sad); - - + auto& BlockData = reinterpret_cast(pBlkData[blkx * N_PER_BLOCK]); + BlockData = bestMV; - if (smallestPlane) - sumLumaChange += LUMA(GetRefBlock(0, 0), nRefPitch[0]) - LUMA(pSrc[0], nSrcPitch[0]); if (iblkx < nBlkX - 1) { @@ -1769,8 +1744,6 @@ class PlaneOfBlocks { } } pBlkData += nBlkX * N_PER_BLOCK; - if (outfilebuf != nullptr) // write vector to outfile - outfilebuf += nBlkX * 4;// 4 int32_t word per block y[0] += (nBlkSizeY - nOverlapY); if (pSrcFrame->GetMode() & UPLANE) diff --git a/src/SimpleResize.hpp b/src/SimpleResize.hpp index e7004c4..d331e71 100644 --- a/src/SimpleResize.hpp +++ b/src/SimpleResize.hpp @@ -9,6 +9,9 @@ class SimpleResize { int32_t dst_height; int32_t src_width; int32_t src_height; + self(limit_width, 0); + self(limit_height, 0); + self(pel, 0); int32_t *vertical_offsets; double *vertical_weights; int32_t *horizontal_offsets; @@ -39,11 +42,14 @@ class SimpleResize { } } public: - SimpleResize(int32_t _dst_width, int32_t _dst_height, int32_t _src_width, int32_t _src_height) { + SimpleResize(int32_t _dst_width, int32_t _dst_height, int32_t _src_width, int32_t _src_height, auto limit_width, auto limit_height, auto pel) { src_width = _src_width; src_height = _src_height; dst_width = _dst_width; dst_height = _dst_height; + this->limit_width = limit_width; + this->limit_height = limit_height; + this->pel = pel; vertical_offsets = new int32_t[dst_height]; vertical_weights = new double[dst_height]; horizontal_offsets = new int32_t[dst_width]; @@ -57,7 +63,14 @@ class SimpleResize { delete[] horizontal_offsets; delete[] horizontal_weights; } - auto Resize(T *dstp, int32_t dst_stride, const T *srcp, int32_t src_stride) { + auto Resize(T *dstp, int32_t dst_stride, const T *srcp, int32_t src_stride, auto horizontal_vectors) { + constexpr auto limit_vectors = requires(T x) { x << 0; }; + auto minimum = static_cast(0); + auto maximum = static_cast(limit_height * pel - 1); + auto horizontal_step = horizontal_vectors ? pel : 0; + auto vertical_step = horizontal_vectors ? 0 : pel; + + const T *srcp1; const T *srcp2; auto workp = new double[src_width]; @@ -68,13 +81,32 @@ class SimpleResize { srcp2 = srcp1 + src_stride; for (auto x = 0; x < src_width; ++x) workp[x] = srcp1[x] * weight_top + srcp2[x] * weight_bottom; + + if (horizontal_vectors) { + minimum = 0; + maximum = limit_width * pel - 1; + } + for (auto x = 0; x < dst_width; ++x) { - double weight_right = horizontal_weights[x]; - double weight_left = 1. - weight_right; - int32_t offset = horizontal_offsets[x]; - dstp[x] = static_cast(workp[offset] * weight_left + workp[offset + 1] * weight_right); + auto weight_right = horizontal_weights[x]; + auto weight_left = 1. - weight_right; + auto offset = horizontal_offsets[x]; + auto result = static_cast(workp[offset] * weight_left + workp[offset + 1] * weight_right); + + if constexpr (limit_vectors) { + result = std::max(minimum, std::min(result, maximum)); + minimum -= horizontal_step; + maximum -= horizontal_step; + } + + dstp[x] = result; } dstp += dst_stride; + + if constexpr (limit_vectors) { + minimum -= vertical_step; + maximum -= vertical_step; + } } delete[] workp; }