From fe114a3752f858063fa9b0a733ecba37501572c7 Mon Sep 17 00:00:00 2001
From: John Doe <nickgray0@gmail.com>
Date: Sat, 16 May 2020 23:24:55 +0800
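Subject: [PATCH] cumulative bug fixes (FFTW locking, SAD type, mask padding)

Summary of the changes in this patch:

- DCTFFTW: make the FFTW plan mutex a static inline class member instead
  of a namespace-scope static in the header, and hold it while calling
  fftw_destroy_plan() in the destructor.
- VectorStructure: change the default of `sad` from -1.f to -1.;
  FakeBlockData::GetSAD() now returns Vector.sad directly instead of
  reinterpret_cast-ing it to float.
- MVFlow / MVFlowFPS / MVFlowInter: compute the padded block counts
  nBlkXP / nBlkYP with a loop that keeps adding blocks until the padded
  size covers the frame, instead of adding at most one block.
- MVBlockFPS / MVFlow / MVFlowFPS / MVFlowInter: replace the duplicated
  "fill right" / "fill bottom" loops with calls to CheckAndPadSmallY()
  and CheckAndPadMaskSmall().
- SimpleResize: the constructor takes three additional arguments (frame
  width, frame height and nPel for vector upsizers; 0, 0, 0 for mask
  upsizers) and Resize() takes a trailing bool, passed as true for
  horizontal vector components and false for vertical components and
  masks. All call sites are updated accordingly.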

---
 src/DCTFFTW.hpp       |   5 +-
 src/FakeBlockData.hpp |   2 +-
 src/MVBlockFPS.hxx    |  28 ++--
 src/MVFlow.hxx        |  31 ++--
 src/MVFlowBlur.hxx    |  20 +--
 src/MVFlowFPS.hxx     | 120 +++++---------
 src/MVFlowInter.hxx   | 122 +++++---------
 src/MVInterface.h     |   2 +-
 src/MVMask.hxx        |  10 +-
 src/MaskFun.hpp       | 360 +++++++++++++++++++++++-------------------
 src/PlaneOfBlocks.h   | 115 ++++++--------
 src/SimpleResize.hpp  |  44 +++++-
 12 files changed, 392 insertions(+), 467 deletions(-)
diff --git a/src/DCTFFTW.hpp b/src/DCTFFTW.hpp
index 3496b96..e0b4a92 100644
--- a/src/DCTFFTW.hpp
+++ b/src/DCTFFTW.hpp
@@ -6,9 +6,8 @@
 #include "DCT.hpp"
 #include "Include/Interface.hxx"
 
-static auto &&g_fftw_plans_mutex = std::mutex{};
-
 class DCTFFTW final :public DCTClass {
+	static inline auto g_fftw_plans_mutex = std::mutex{};
 	self(fSrc, static_cast<double *>(nullptr));
 	self(dctplan, static_cast<fftw_plan>(nullptr));
 	self(fSrcDCT, static_cast<double *>(nullptr));
@@ -38,7 +37,9 @@ class DCTFFTW final :public DCTClass {
 	auto &operator=(DCTFFTW &&) = delete;
 	auto &operator=(const DCTFFTW &) = delete;
 	~DCTFFTW() override {
+		g_fftw_plans_mutex.lock();
 		fftw_destroy_plan(dctplan);
+		g_fftw_plans_mutex.unlock();
 		fftw_free(fSrc);
 		fftw_free(fSrcDCT);
 	}
diff --git a/src/FakeBlockData.hpp b/src/FakeBlockData.hpp
index 21fca05..e026acf 100644
--- a/src/FakeBlockData.hpp
+++ b/src/FakeBlockData.hpp
@@ -30,6 +30,6 @@ class FakeBlockData final {
 		return Vector; 
 	}
 	auto GetSAD() const {
-		return reinterpret_cast<const float &>(Vector.sad);
+		return Vector.sad;
 	}
 };
\ No newline at end of file
diff --git a/src/MVBlockFPS.hxx b/src/MVBlockFPS.hxx
index 7d5c2fa..1b6b37a 100644
--- a/src/MVBlockFPS.hxx
+++ b/src/MVBlockFPS.hxx
@@ -338,28 +338,20 @@ static const VSFrameRef *VS_CC mvblockfpsGetFrame(int32_t n, int32_t activationR
 					MakeSADMaskTime(ballsF, nBlkX, nBlkY, 4.0 / (ml * nBlkSizeX * nBlkSizeY), 1.0, nPel, smallMaskF, nBlkXP, time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
 					MakeSADMaskTime(ballsB, nBlkX, nBlkY, 4.0 / (ml * nBlkSizeX * nBlkSizeY), 1.0, nPel, smallMaskB, nBlkXP, 256 - time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
 				}
-				if (nBlkXP > nBlkX)
-					for (int j = 0; j < nBlkY; ++j) {
-						smallMaskF[j * nBlkXP + nBlkX] = smallMaskF[j * nBlkXP + nBlkX - 1];
-						smallMaskB[j * nBlkXP + nBlkX] = smallMaskB[j * nBlkXP + nBlkX - 1];
-					}
-				if (nBlkYP > nBlkY)
-					for (int i = 0; i < nBlkXP; ++i) {
-						smallMaskF[nBlkXP * nBlkY + i] = smallMaskF[nBlkXP * (nBlkY - 1) + i];
-						smallMaskB[nBlkXP * nBlkY + i] = smallMaskB[nBlkXP * (nBlkY - 1) + i];
-					}
-				upsizer->Resize(MaskFullYF, nPitchY, smallMaskF, nBlkXP);
-				upsizer->Resize(MaskFullYB, nPitchY, smallMaskB, nBlkXP);
+				CheckAndPadMaskSmall(smallMaskF, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				CheckAndPadMaskSmall(smallMaskB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				upsizer->Resize(MaskFullYF, nPitchY, smallMaskF, nBlkXP, false);
+				upsizer->Resize(MaskFullYB, nPitchY, smallMaskB, nBlkXP, false);
 				if (nSuperModeYUV & UVPLANES) {
-					upsizerUV->Resize(MaskFullUVF, nPitchUV, smallMaskF, nBlkXP);
-					upsizerUV->Resize(MaskFullUVB, nPitchUV, smallMaskB, nBlkXP);
+					upsizerUV->Resize(MaskFullUVF, nPitchUV, smallMaskF, nBlkXP, false);
+					upsizerUV->Resize(MaskFullUVB, nPitchUV, smallMaskB, nBlkXP, false);
 				}
 			}
 			if (mode == 4 || mode == 5 || mode == 7 || mode == 8) {
 				MultMasks(smallMaskF, smallMaskB, smallMaskO, nBlkXP, nBlkYP);
-				upsizer->Resize(MaskOccY, nPitchY, smallMaskO, nBlkXP);
+				upsizer->Resize(MaskOccY, nPitchY, smallMaskO, nBlkXP, false);
 				if (nSuperModeYUV & UVPLANES)
-					upsizerUV->Resize(MaskOccUV, nPitchUV, smallMaskO, nBlkXP);
+					upsizerUV->Resize(MaskOccUV, nPitchUV, smallMaskO, nBlkXP, false);
 			}
 			auto pMaskFullYB = MaskFullYB;
 			auto pMaskFullYF = MaskFullYF;
@@ -938,9 +930,9 @@ static void VS_CC mvblockfpsCreate(const VSMap *in, VSMap *out, void *userData,
 	d.nWidthUV = d.bleh->nWidth / d.bleh->xRatioUV;
 	d.nPitchY = (d.nWidthP + 15) & (~15);
 	d.nPitchUV = (d.nWidthPUV + 15) & (~15);
-	d.upsizer = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP);
+	d.upsizer = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 	if (d.nSuperModeYUV & UVPLANES)
-		d.upsizerUV = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP);
+		d.upsizerUV = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 	if (d.bleh->nOverlapX || d.bleh->nOverlapY) {
 		d.OverWins = new OverlapWindows(d.bleh->nBlkSizeX, d.bleh->nBlkSizeY, d.bleh->nOverlapX, d.bleh->nOverlapY);
 		if (d.nSuperModeYUV & UVPLANES)
diff --git a/src/MVFlow.hxx b/src/MVFlow.hxx
index d4c1b37..dde300e 100644
--- a/src/MVFlow.hxx
+++ b/src/MVFlow.hxx
@@ -176,16 +176,7 @@ else if (activationReason == arAllFramesReady) {
 		auto VXSmallY = new int32_t[nBlkYP * nBlkXP];
 		auto VYSmallY = new int32_t[nBlkYP * nBlkXP];
 		MakeVectorSmallMasks(balls, nBlkX, nBlkY, VXSmallY, nBlkXP, VYSmallY, nBlkXP);
-		if (nBlkXP > nBlkX)
-			for (auto j = 0; j < nBlkY; ++j) {
-				VXSmallY[j * nBlkXP + nBlkX] = std::min(VXSmallY[j * nBlkXP + nBlkX - 1], 0);
-				VYSmallY[j * nBlkXP + nBlkX] = VYSmallY[j * nBlkXP + nBlkX - 1];
-			}
-		if (nBlkYP > nBlkY)
-			for (auto i = 0; i < nBlkXP; ++i) {
-				VXSmallY[nBlkXP * nBlkY + i] = VXSmallY[nBlkXP * (nBlkY - 1) + i];
-				VYSmallY[nBlkXP * nBlkY + i] = std::min(VYSmallY[nBlkXP * (nBlkY - 1) + i], 0);
-			}
+		CheckAndPadSmallY(VXSmallY, VYSmallY, nBlkXP, nBlkYP, nBlkX, nBlkY);
 		auto fieldShift = 0;
 		if (d->fields && nPel > 1 && ((nref - n) % 2 != 0)) {
 			auto src = vsapi->getFrameFilter(n, d->finest, frameCtx);
@@ -216,8 +207,8 @@ else if (activationReason == arAllFramesReady) {
 		for (auto j = 0; j < nBlkYP; ++j)
 			for (auto i = 0; i < nBlkXP; ++i)
 				VYSmallY[j * nBlkXP + i] += fieldShift;
-		d->upsizer->Resize(VXFullY, VPitchY, VXSmallY, nBlkXP);
-		d->upsizer->Resize(VYFullY, VPitchY, VYSmallY, nBlkXP);
+		d->upsizer->Resize(VXFullY, VPitchY, VXSmallY, nBlkXP, true);
+		d->upsizer->Resize(VYFullY, VPitchY, VYSmallY, nBlkXP, false);
 		auto nOffsetY = nRefPitches[0] * nVPadding * nPel + nHPadding * bytesPerSample * nPel;
 		auto nOffsetUV = nRefPitches[1] * nVPaddingUV * nPel + nHPaddingUV * bytesPerSample * nPel;
 		if (static_cast<FlowModes>(d->mode) == FlowModes::Shift)
@@ -232,8 +223,8 @@ else if (activationReason == arAllFramesReady) {
 			auto VYSmallUV = new int32_t[nBlkYP * nBlkXP];
 			VectorSmallMaskYToHalfUV(VXSmallY, nBlkXP, nBlkYP, VXSmallUV, xRatioUV);
 			VectorSmallMaskYToHalfUV(VYSmallY, nBlkXP, nBlkYP, VYSmallUV, yRatioUV);
-			d->upsizerUV->Resize(VXFullUV, VPitchUV, VXSmallUV, nBlkXP);
-			d->upsizerUV->Resize(VYFullUV, VPitchUV, VYSmallUV, nBlkXP);
+			d->upsizerUV->Resize(VXFullUV, VPitchUV, VXSmallUV, nBlkXP, true);
+			d->upsizerUV->Resize(VYFullUV, VPitchUV, VYSmallUV, nBlkXP, false);
 			if (static_cast<FlowModes>(d->mode) == FlowModes::Shift) {
 				if (d->vi->format->colorFamily == cmRGB) {
 					flowMemset(pDst[1], 1.f, nHeightUV * nDstPitches[1]);
@@ -408,8 +399,12 @@ auto CreateFlow(auto in, auto out, auto core, auto vsapi) {
 		delete d.bleh;
 		return d;
 	}
-	d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX;
-	d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY;
+	d.nBlkXP = d.bleh->nBlkX;
+	while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth)
+		d.nBlkXP++;
+	d.nBlkYP = d.bleh->nBlkY;
+	while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight)
+		d.nBlkYP++;
 	d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX;
 	d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY;
 	d.nWidthPUV = d.nWidthP / d.bleh->xRatioUV;
@@ -420,9 +415,9 @@ auto CreateFlow(auto in, auto out, auto core, auto vsapi) {
 	d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV;
 	d.VPitchY = (d.nWidthP + 15) & (~15);
 	d.VPitchUV = (d.nWidthPUV + 15) & (~15);
-	d.upsizer = new SimpleResize<int32_t>(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY);
+	d.upsizer = new SimpleResize<int32_t>(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY, d.bleh->nWidth, d.bleh->nHeight, d.bleh->nPel);
 	if (d.vi->format->colorFamily != cmGray)
-		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY);
+		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY, d.nWidthUV, d.nHeightUV, d.bleh->nPel);
 	if (static_cast<FlowModes>(d.mode) == FlowModes::Fetch)
 		d.flow_function = flowFetch;
 	else if (static_cast<FlowModes>(d.mode) == FlowModes::Shift)
diff --git a/src/MVFlowBlur.hxx b/src/MVFlowBlur.hxx
index b84be68..965dd1c 100644
--- a/src/MVFlowBlur.hxx
+++ b/src/MVFlowBlur.hxx
@@ -294,10 +294,10 @@ static const VSFrameRef *VS_CC mvflowblurGetFrame(int32_t n, int32_t activationR
 			MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkX, VYSmallYF, nBlkX);
 
 
-			d->upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkX);
-			d->upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkX);
-			d->upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkX);
-			d->upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkX);
+			d->upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkX, true);
+			d->upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkX, false);
+			d->upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkX, true);
+			d->upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkX, false);
 
 			FlowBlur(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, nRefPitches[0],
 				VXFullYB, VXFullYF, VYFullYB, VYFullYF, VPitchY,
@@ -322,11 +322,11 @@ static const VSFrameRef *VS_CC mvflowblurGetFrame(int32_t n, int32_t activationR
 				VectorSmallMaskYToHalfUV(VXSmallYF, nBlkX, nBlkY, VXSmallUVF, xRatioUV);
 				VectorSmallMaskYToHalfUV(VYSmallYF, nBlkX, nBlkY, VYSmallUVF, yRatioUV);
 
-				d->upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkX);
-				d->upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkX);
+				d->upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkX, true);
+				d->upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkX, false);
 
-				d->upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkX);
-				d->upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkX);
+				d->upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkX, true);
+				d->upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkX, false);
 
 
 				FlowBlur(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, nRefPitches[1],
@@ -614,9 +614,9 @@ static void VS_CC mvflowblurCreate(const VSMap *in, VSMap *out, void *userData,
 	d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV;
 	d.VPitchY = d.bleh->nWidth;
 	d.VPitchUV = d.nWidthUV;
-	d.upsizer = new SimpleResize<int32_t>(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY);
+	d.upsizer = new SimpleResize<int32_t>(d.bleh->nWidth, d.bleh->nHeight, d.bleh->nBlkX, d.bleh->nBlkY, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel);
 	if (d.vi->format->colorFamily != cmGray)
-		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY);
+		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthUV, d.nHeightUV, d.bleh->nBlkX, d.bleh->nBlkY, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel);
 	data = new MVFlowBlurData;
 	*data = d;
 	vsapi->createFilter(in, out, "FlowBlur", mvflowblurInit, mvflowblurGetFrame, mvflowblurFree, fmParallel, 0, data, core);
diff --git a/src/MVFlowFPS.hxx b/src/MVFlowFPS.hxx
index f566ff1..212981b 100644
--- a/src/MVFlowFPS.hxx
+++ b/src/MVFlowFPS.hxx
@@ -261,89 +261,52 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe
 			int32_t nOffsetUV = nRefPitches[1] * nVPaddingUV * nPel + nHPaddingUV * bytesPerSample * nPel;
 
 			if (nright != d->nrightLast) {
-
 				MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYB, nBlkXP, VYSmallYB, nBlkXP);
-				if (nBlkXP > nBlkX) {// fill right
-					for (int32_t j = 0; j<nBlkY; j++) {
-						VXSmallYB[j*nBlkXP + nBlkX] = VSMIN(VXSmallYB[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYB[j*nBlkXP + nBlkX] = VYSmallYB[j*nBlkXP + nBlkX - 1];
-					}
-				}
-				if (nBlkYP > nBlkY) {// fill bottom
-					for (int32_t i = 0; i<nBlkXP; i++) {
-						VXSmallYB[nBlkXP*nBlkY + i] = VXSmallYB[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYB[nBlkXP*nBlkY + i] = VSMIN(VYSmallYB[nBlkXP*(nBlkY - 1) + i], 0);
-					}
-				}
-
-				upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP);
-				upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP);
+				CheckAndPadSmallY(VXSmallYB, VYSmallYB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP, true);
+				upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP, false);
 
 				if (d->vi.format->colorFamily != cmGray) {
 					VectorSmallMaskYToHalfUV(VXSmallYB, nBlkXP, nBlkYP, VXSmallUVB, xRatioUV);
 					VectorSmallMaskYToHalfUV(VYSmallYB, nBlkXP, nBlkYP, VYSmallUVB, yRatioUV);
 
-					upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP);
-					upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP);
+					upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP, false);
 				}
 			}
 			// analyze vectors field to detect occlusion
 			//        double occNormB = (256-time256)/(256*ml);
 			//        MakeVectorOcclusionMask(mvClipB, nBlkX, nBlkY, occNormB, 1.0, nPel, MaskSmallB, nBlkXP);
 			MakeVectorOcclusionMaskTime(ballsB, true, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallB, nBlkXP, (256 - time256), nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
-			if (nBlkXP > nBlkX) // fill right
-				for (int32_t j = 0; j<nBlkY; j++)
-					MaskSmallB[j*nBlkXP + nBlkX] = MaskSmallB[j*nBlkXP + nBlkX - 1];
-			if (nBlkYP > nBlkY) // fill bottom
-				for (int32_t i = 0; i<nBlkXP; i++)
-					MaskSmallB[nBlkXP*nBlkY + i] = MaskSmallB[nBlkXP*(nBlkY - 1) + i];
-
-			upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP);
+			CheckAndPadMaskSmall(MaskSmallB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+			upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP, false);
 			if (d->vi.format->colorFamily != cmGray)
-				upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP);
+				upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP, false);
 
 			d->nrightLast = nright;
 
 			if (nleft != d->nleftLast) {
 				MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkXP, VYSmallYF, nBlkXP);
-				if (nBlkXP > nBlkX) {// fill right
-					for (int32_t j = 0; j<nBlkY; j++) {
-						VXSmallYF[j*nBlkXP + nBlkX] = VSMIN(VXSmallYF[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYF[j*nBlkXP + nBlkX] = VYSmallYF[j*nBlkXP + nBlkX - 1];
-					}
-				}
-				if (nBlkYP > nBlkY) {// fill bottom
-					for (int32_t i = 0; i<nBlkXP; i++) {
-						VXSmallYF[nBlkXP*nBlkY + i] = VXSmallYF[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYF[nBlkXP*nBlkY + i] = VSMIN(VYSmallYF[nBlkXP*(nBlkY - 1) + i], 0);
-					}
-				}
-
-				upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP);
-				upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP);
+				CheckAndPadSmallY(VXSmallYF, VYSmallYF, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP, true);
+				upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP, false);
 
 				if (d->vi.format->colorFamily != cmGray) {
 					VectorSmallMaskYToHalfUV(VXSmallYF, nBlkXP, nBlkYP, VXSmallUVF, xRatioUV);
 					VectorSmallMaskYToHalfUV(VYSmallYF, nBlkXP, nBlkYP, VYSmallUVF, yRatioUV);
 
-					upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP);
-					upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP);
+					upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP, false);
 				}
 			}
 			// analyze vectors field to detect occlusion
 			//        double occNormF = time256/(256*ml);
 			//        MakeVectorOcclusionMask(mvClipF, nBlkX, nBlkY, occNormF, 1.0, nPel, MaskSmallF, nBlkXP);
 			MakeVectorOcclusionMaskTime(ballsF, false, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallF, nBlkXP, time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
-			if (nBlkXP > nBlkX) // fill right
-				for (int32_t j = 0; j<nBlkY; j++)
-					MaskSmallF[j*nBlkXP + nBlkX] = MaskSmallF[j*nBlkXP + nBlkX - 1];
-			if (nBlkYP > nBlkY) // fill bottom
-				for (int32_t i = 0; i<nBlkXP; i++)
-					MaskSmallF[nBlkXP*nBlkY + i] = MaskSmallF[nBlkXP*(nBlkY - 1) + i];
-
-			upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP);
+			CheckAndPadMaskSmall(MaskSmallF, nBlkXP, nBlkYP, nBlkX, nBlkY);
+			upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP, false);
 			if (d->vi.format->colorFamily != cmGray)
-				upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP);
+				upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP, false);
 
 			d->nleftLast = nleft;
 
@@ -364,28 +327,13 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe
 														  // get vector mask from extra frames
 				MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYBB, nBlkXP, VYSmallYBB, nBlkXP);
 				MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYFF, nBlkXP, VYSmallYFF, nBlkXP);
-				if (nBlkXP > nBlkX) {// fill right
-					for (int32_t j = 0; j<nBlkY; j++) {
-						VXSmallYBB[j*nBlkXP + nBlkX] = VSMIN(VXSmallYBB[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYBB[j*nBlkXP + nBlkX] = VYSmallYBB[j*nBlkXP + nBlkX - 1];
-						VXSmallYFF[j*nBlkXP + nBlkX] = VSMIN(VXSmallYFF[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYFF[j*nBlkXP + nBlkX] = VYSmallYFF[j*nBlkXP + nBlkX - 1];
-					}
-				}
-				if (nBlkYP > nBlkY) {// fill bottom
-					for (int32_t i = 0; i<nBlkXP; i++) {
-						VXSmallYBB[nBlkXP*nBlkY + i] = VXSmallYBB[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYBB[nBlkXP*nBlkY + i] = VSMIN(VYSmallYBB[nBlkXP*(nBlkY - 1) + i], 0);
-						VXSmallYFF[nBlkXP*nBlkY + i] = VXSmallYFF[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYFF[nBlkXP*nBlkY + i] = VSMIN(VYSmallYFF[nBlkXP*(nBlkY - 1) + i], 0);
-					}
-				}
-
-				upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP);
-				upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP);
+				CheckAndPadSmallY(VXSmallYBB, VYSmallYBB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				CheckAndPadSmallY(VXSmallYFF, VYSmallYFF, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP, true);
+				upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP, false);
 
-				upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP);
-				upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP);
+				upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP, true);
+				upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP, false);
 
 				FlowInterExtra(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, pSrc[0] + nOffsetY, nRefPitches[0],
 					VXFullYB, VXFullYF, VYFullYB, VYFullYF, MaskFullYB, MaskFullYF, VPitchY,
@@ -396,11 +344,11 @@ static const VSFrameRef *VS_CC mvflowfpsGetFrame(int32_t n, int32_t activationRe
 					VectorSmallMaskYToHalfUV(VXSmallYFF, nBlkXP, nBlkYP, VXSmallUVFF, xRatioUV);
 					VectorSmallMaskYToHalfUV(VYSmallYFF, nBlkXP, nBlkYP, VYSmallUVFF, yRatioUV);
 
-					upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP);
-					upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP);
+					upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP, false);
 
-					upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP);
-					upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP);
+					upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP, false);
 
 					FlowInterExtra(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, pSrc[1] + nOffsetUV, nRefPitches[1],
 						VXFullUVB, VXFullUVF, VYFullUVB, VYFullUVF, MaskFullUVB, MaskFullUVF, VPitchUV,
@@ -858,8 +806,12 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V
 		return;
 	}
 
-	d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX;
-	d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY;
+	d.nBlkXP = d.bleh->nBlkX;
+	while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth)
+		d.nBlkXP++;
+	d.nBlkYP = d.bleh->nBlkY;
+	while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight)
+		d.nBlkYP++;
 	d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX;
 	d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY;
 
@@ -907,8 +859,8 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V
 	d.MaskSmallF = new double[d.nBlkXP * d.nBlkYP];
 	d.MaskFullYF = new double[d.nHeightP * d.VPitchY];
 
-	d.upsizer = new SimpleResize<int32_t>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP);
-	d.upsizer2 = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP);
+	d.upsizer = new SimpleResize<int32_t>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel);
+	d.upsizer2 = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 
 	if (d.vi.format->colorFamily != cmGray) {
 		d.VXFullUVB = new int32_t[d.nHeightPUV * d.VPitchUV];
@@ -934,8 +886,8 @@ static void VS_CC mvflowfpsCreate(const VSMap *in, VSMap *out, void *userData, V
 		d.MaskFullUVB = new double[d.nHeightPUV * d.VPitchUV];
 		d.MaskFullUVF = new double[d.nHeightPUV * d.VPitchUV];
 
-		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP);
-		d.upsizerUV2 = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP);
+		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel);
+		d.upsizerUV2 = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 	}
 
 
diff --git a/src/MVFlowInter.hxx b/src/MVFlowInter.hxx
index f0c6997..ef4a068 100644
--- a/src/MVFlowInter.hxx
+++ b/src/MVFlowInter.hxx
@@ -172,59 +172,25 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation
 			double *MaskFullUVB = nullptr;
 			double *MaskFullUVF = nullptr;
 
-
 			MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYB, nBlkXP, VYSmallYB, nBlkXP);
 			MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYF, nBlkXP, VYSmallYF, nBlkXP);
-			if (nBlkXP > nBlkX) // fill right
-			{
-				for (int32_t j = 0; j<nBlkY; j++)
-				{
-					VXSmallYB[j*nBlkXP + nBlkX] = VSMIN(VXSmallYB[j*nBlkXP + nBlkX - 1], 0);
-					VYSmallYB[j*nBlkXP + nBlkX] = VYSmallYB[j*nBlkXP + nBlkX - 1];
-					VXSmallYF[j*nBlkXP + nBlkX] = VSMIN(VXSmallYF[j*nBlkXP + nBlkX - 1], 0);
-					VYSmallYF[j*nBlkXP + nBlkX] = VYSmallYF[j*nBlkXP + nBlkX - 1];
-				}
-			}
-			if (nBlkYP > nBlkY) // fill bottom
-			{
-				for (int32_t i = 0; i<nBlkXP; i++)
-				{
-					VXSmallYB[nBlkXP*nBlkY + i] = VXSmallYB[nBlkXP*(nBlkY - 1) + i];
-					VYSmallYB[nBlkXP*nBlkY + i] = VSMIN(VYSmallYB[nBlkXP*(nBlkY - 1) + i], 0);
-					VXSmallYF[nBlkXP*nBlkY + i] = VXSmallYF[nBlkXP*(nBlkY - 1) + i];
-					VYSmallYF[nBlkXP*nBlkY + i] = VSMIN(VYSmallYF[nBlkXP*(nBlkY - 1) + i], 0);
-				}
-			}
+			CheckAndPadSmallY(VXSmallYB, VYSmallYB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+			CheckAndPadSmallY(VXSmallYF, VYSmallYF, nBlkXP, nBlkYP, nBlkX, nBlkY);
 			// analyze vectors field to detect occlusion
 			//      double occNormB = (256-time256)/(256*ml);
 			MakeVectorOcclusionMaskTime(ballsB, true, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallB, nBlkXP, (256 - time256), nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
 			//      double occNormF = time256/(256*ml);
 			MakeVectorOcclusionMaskTime(ballsF, false, nBlkX, nBlkY, ml, 1.0, nPel, MaskSmallF, nBlkXP, time256, nBlkSizeX - nOverlapX, nBlkSizeY - nOverlapY);
-			if (nBlkXP > nBlkX) // fill right
-			{
-				for (int32_t j = 0; j<nBlkY; j++)
-				{
-					MaskSmallB[j*nBlkXP + nBlkX] = MaskSmallB[j*nBlkXP + nBlkX - 1];
-					MaskSmallF[j*nBlkXP + nBlkX] = MaskSmallF[j*nBlkXP + nBlkX - 1];
-				}
-			}
-			if (nBlkYP > nBlkY) // fill bottom
-			{
-				for (int32_t i = 0; i<nBlkXP; i++)
-				{
-					MaskSmallB[nBlkXP*nBlkY + i] = MaskSmallB[nBlkXP*(nBlkY - 1) + i];
-					MaskSmallF[nBlkXP*nBlkY + i] = MaskSmallF[nBlkXP*(nBlkY - 1) + i];
-				}
-			}
-			// upsize (bilinear interpolate) vector masks to fullframe size
+			CheckAndPadMaskSmall(MaskSmallB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+			CheckAndPadMaskSmall(MaskSmallF, nBlkXP, nBlkYP, nBlkX, nBlkY);
 
-
-			upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP);
-			upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP);
-			upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP);
-			upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP);
-			upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP);
-			upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP);
+			// upsize (bilinear interpolate) vector masks to fullframe size
+			upsizer->Resize(VXFullYB, VPitchY, VXSmallYB, nBlkXP, true);
+			upsizer->Resize(VYFullYB, VPitchY, VYSmallYB, nBlkXP, false);
+			upsizer->Resize(VXFullYF, VPitchY, VXSmallYF, nBlkXP, true);
+			upsizer->Resize(VYFullYF, VPitchY, VYSmallYF, nBlkXP, false);
+			upsizer2->Resize(MaskFullYB, VPitchY, MaskSmallB, nBlkXP, false);
+			upsizer2->Resize(MaskFullYF, VPitchY, MaskSmallF, nBlkXP, false);
 
 			if (d->vi->format->colorFamily != cmGray) {
 				VXFullUVB = new int32_t[nHeightPUV * VPitchUV];
@@ -243,12 +209,12 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation
 				VectorSmallMaskYToHalfUV(VXSmallYF, nBlkXP, nBlkYP, VXSmallUVF, xRatioUV);
 				VectorSmallMaskYToHalfUV(VYSmallYF, nBlkXP, nBlkYP, VYSmallUVF, yRatioUV);
 
-				upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP);
-				upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP);
-				upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP);
-				upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP);
-				upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP);
-				upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP);
+				upsizerUV->Resize(VXFullUVB, VPitchUV, VXSmallUVB, nBlkXP, true);
+				upsizerUV->Resize(VYFullUVB, VPitchUV, VYSmallUVB, nBlkXP, false);
+				upsizerUV->Resize(VXFullUVF, VPitchUV, VXSmallUVF, nBlkXP, true);
+				upsizerUV->Resize(VYFullUVF, VPitchUV, VYSmallUVF, nBlkXP, false);
+				upsizerUV2->Resize(MaskFullUVB, VPitchUV, MaskSmallB, nBlkXP, false);
+				upsizerUV2->Resize(MaskFullUVF, VPitchUV, MaskSmallF, nBlkXP, false);
 			}
 
 
@@ -275,32 +241,14 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation
 				// get vector mask from extra frames
 				MakeVectorSmallMasks(ballsB, nBlkX, nBlkY, VXSmallYBB, nBlkXP, VYSmallYBB, nBlkXP);
 				MakeVectorSmallMasks(ballsF, nBlkX, nBlkY, VXSmallYFF, nBlkXP, VYSmallYFF, nBlkXP);
-				if (nBlkXP > nBlkX) // fill right
-				{
-					for (int32_t j = 0; j<nBlkY; j++)
-					{
-						VXSmallYBB[j*nBlkXP + nBlkX] = VSMIN(VXSmallYBB[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYBB[j*nBlkXP + nBlkX] = VYSmallYBB[j*nBlkXP + nBlkX - 1];
-						VXSmallYFF[j*nBlkXP + nBlkX] = VSMIN(VXSmallYFF[j*nBlkXP + nBlkX - 1], 0);
-						VYSmallYFF[j*nBlkXP + nBlkX] = VYSmallYFF[j*nBlkXP + nBlkX - 1];
-					}
-				}
-				if (nBlkYP > nBlkY) // fill bottom
-				{
-					for (int32_t i = 0; i<nBlkXP; i++)
-					{
-						VXSmallYBB[nBlkXP*nBlkY + i] = VXSmallYBB[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYBB[nBlkXP*nBlkY + i] = VSMIN(VYSmallYBB[nBlkXP*(nBlkY - 1) + i], 0);
-						VXSmallYFF[nBlkXP*nBlkY + i] = VXSmallYFF[nBlkXP*(nBlkY - 1) + i];
-						VYSmallYFF[nBlkXP*nBlkY + i] = VSMIN(VYSmallYFF[nBlkXP*(nBlkY - 1) + i], 0);
-					}
-				}
+				CheckAndPadSmallY(VXSmallYBB, VYSmallYBB, nBlkXP, nBlkYP, nBlkX, nBlkY);
+				CheckAndPadSmallY(VXSmallYFF, VYSmallYFF, nBlkXP, nBlkYP, nBlkX, nBlkY);
 
 				// upsize vectors to full frame
-				upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP);
-				upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP);
-				upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP);
-				upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP);
+				upsizer->Resize(VXFullYBB, VPitchY, VXSmallYBB, nBlkXP, true);
+				upsizer->Resize(VYFullYBB, VPitchY, VYSmallYBB, nBlkXP, false);
+				upsizer->Resize(VXFullYFF, VPitchY, VXSmallYFF, nBlkXP, true);
+				upsizer->Resize(VYFullYFF, VPitchY, VYSmallYFF, nBlkXP, false);
 
 				FlowInterExtra(pDst[0], nDstPitches[0], pRef[0] + nOffsetY, pSrc[0] + nOffsetY, nRefPitches[0],
 					VXFullYB, VXFullYF, VYFullYB, VYFullYF, MaskFullYB, MaskFullYF, VPitchY,
@@ -321,10 +269,10 @@ static const VSFrameRef *VS_CC mvflowinterGetFrame(int32_t n, int32_t activation
 					VectorSmallMaskYToHalfUV(VXSmallYFF, nBlkXP, nBlkYP, VXSmallUVFF, xRatioUV);
 					VectorSmallMaskYToHalfUV(VYSmallYFF, nBlkXP, nBlkYP, VYSmallUVFF, yRatioUV);
 
-					upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP);
-					upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP);
-					upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP);
-					upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP);
+					upsizerUV->Resize(VXFullUVBB, VPitchUV, VXSmallUVBB, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVBB, VPitchUV, VYSmallUVBB, nBlkXP, false);
+					upsizerUV->Resize(VXFullUVFF, VPitchUV, VXSmallUVFF, nBlkXP, true);
+					upsizerUV->Resize(VYFullUVFF, VPitchUV, VYSmallUVFF, nBlkXP, false);
 
 					FlowInterExtra(pDst[1], nDstPitches[1], pRef[1] + nOffsetUV, pSrc[1] + nOffsetUV, nRefPitches[1],
 						VXFullUVB, VXFullUVF, VYFullUVB, VYFullUVF, MaskFullUVB, MaskFullUVF, VPitchUV,
@@ -675,8 +623,12 @@ static void VS_CC mvflowinterCreate(const VSMap *in, VSMap *out, void *userData,
 		delete d.mvClipF;
 		return;
 	}
-	d.nBlkXP = (d.bleh->nBlkX * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth) ? d.bleh->nBlkX + 1 : d.bleh->nBlkX;
-	d.nBlkYP = (d.bleh->nBlkY * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight) ? d.bleh->nBlkY + 1 : d.bleh->nBlkY;
+	d.nBlkXP = d.bleh->nBlkX;
+	while (d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX < d.bleh->nWidth)
+		d.nBlkXP++;
+	d.nBlkYP = d.bleh->nBlkY;
+	while (d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY < d.bleh->nHeight)
+		d.nBlkYP++;
 	d.nWidthP = d.nBlkXP * (d.bleh->nBlkSizeX - d.bleh->nOverlapX) + d.bleh->nOverlapX;
 	d.nHeightP = d.nBlkYP * (d.bleh->nBlkSizeY - d.bleh->nOverlapY) + d.bleh->nOverlapY;
 	d.nWidthPUV = d.nWidthP / d.bleh->xRatioUV;
@@ -687,11 +639,11 @@ static void VS_CC mvflowinterCreate(const VSMap *in, VSMap *out, void *userData,
 	d.nVPaddingUV = d.bleh->nVPadding / d.bleh->yRatioUV;
 	d.VPitchY = (d.nWidthP + 15) & (~15);
 	d.VPitchUV = (d.nWidthPUV + 15) & (~15);
-	d.upsizer = new SimpleResize<int32_t>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP);
-	d.upsizer2 = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP);
+	d.upsizer = new SimpleResize<int32_t>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, d.mvClipB->nWidth, d.mvClipB->nHeight, d.mvClipB->nPel);
+	d.upsizer2 = new SimpleResize<double>(d.nWidthP, d.nHeightP, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 	if (d.vi->format->colorFamily != cmGray) {
-		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP);
-		d.upsizerUV2 = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP);
+		d.upsizerUV = new SimpleResize<int32_t>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, d.nWidthUV, d.nHeightUV, d.mvClipB->nPel);
+		d.upsizerUV2 = new SimpleResize<double>(d.nWidthPUV, d.nHeightPUV, d.nBlkXP, d.nBlkYP, 0, 0, 0);
 	}
 	data = new MVFlowInterData;
 	*data = d;
diff --git a/src/MVInterface.h b/src/MVInterface.h
index 60cf515..a99a7b9 100644
--- a/src/MVInterface.h
+++ b/src/MVInterface.h
@@ -23,7 +23,7 @@ constexpr auto MVAnalysisDataVersion = 5;
 struct VectorStructure {
 	self(x, 0_i32);
 	self(y, 0_i32);
-	self(sad, -1.f);
+	self(sad, -1.); // double-literal initializer; NOTE(review): if self() deduces the member type from it, sizeof(VectorStructure) and hence N_PER_BLOCK depend on this — confirm
 };
 
 constexpr auto N_PER_BLOCK = sizeof(VectorStructure) / sizeof(std::int32_t);
diff --git a/src/MVMask.hxx b/src/MVMask.hxx
index 8c6d541..972a922 100644
--- a/src/MVMask.hxx
+++ b/src/MVMask.hxx
@@ -110,8 +110,8 @@ struct MVMaskData final {
 		nWidthUV = bleh->nWidth / bleh->xRatioUV;
 		nHeightBUV = nHeightB / bleh->yRatioUV;
 		nWidthBUV = nWidthB / bleh->xRatioUV;
-		upsizer = new SimpleResize<float>(nWidthB, nHeightB, bleh->nBlkX, bleh->nBlkY);
-		upsizerUV = new SimpleResize<float>(nWidthBUV, nHeightBUV, bleh->nBlkX, bleh->nBlkY);
+		upsizer = new SimpleResize<float>(nWidthB, nHeightB, bleh->nBlkX, bleh->nBlkY, 0, 0, 0);
+		upsizerUV = new SimpleResize<float>(nWidthBUV, nHeightBUV, bleh->nBlkX, bleh->nBlkY, 0, 0, 0);
 	}
 	MVMaskData(const MVMaskData &) = delete;
 	MVMaskData(MVMaskData &&) = delete;
@@ -225,7 +225,7 @@ static auto VS_CC mvmaskGetFrame(int n, int activationReason, void **instanceDat
 			if (kind == 5)
 				std::memcpy(pDst[0], pSrc, nSrcPitch * nHeight * sizeof(float));
 			else {
-				upsizer->Resize(pDst[0], nDstPitches[0], smallMask.get(), nBlkX);
+				upsizer->Resize(pDst[0], nDstPitches[0], smallMask.get(), nBlkX, false);
 				if (nWidth > nWidthB)
 					for (auto h = 0; h < nHeight; ++h)
 						for (auto w = nWidthB; w < nWidth; ++w)
@@ -236,9 +236,9 @@ static auto VS_CC mvmaskGetFrame(int n, int activationReason, void **instanceDat
 		};
 		auto SmallMaskToChroma = [&]() {
 			if (!d->IsGray) {
-				upsizerUV->Resize(pDst[1], nDstPitches[1], smallMask.get(), nBlkX);
+				upsizerUV->Resize(pDst[1], nDstPitches[1], smallMask.get(), nBlkX, false);
 				if (kind == 5)
-					upsizerUV->Resize(pDst[2], nDstPitches[2], smallMaskV.get(), nBlkX);
+					upsizerUV->Resize(pDst[2], nDstPitches[2], smallMaskV.get(), nBlkX, false);
 				else
 					std::memcpy(pDst[2], pDst[1], nHeightUV * nDstPitches[1] * sizeof(float));
 				if (nWidthUV > nWidthBUV)
diff --git a/src/MaskFun.hpp b/src/MaskFun.hpp
index 23b2a3e..9589e52 100644
--- a/src/MaskFun.hpp
+++ b/src/MaskFun.hpp
@@ -7,22 +7,50 @@
 #include "MVFrame.h"
 #include "SADFunctions.hpp"
 
+auto CheckAndPadMaskSmall(auto MaskSmall, auto nBlkXP, auto nBlkYP, auto nBlkX, auto nBlkY) { // Extends a mask indexed with row stride nBlkXP from nBlkX x nBlkY entries to nBlkXP x nBlkYP by replicating its last column/row; returns nothing.
+	if (nBlkXP > nBlkX) // only when extra columns exist
+		for (auto j : Range{ nBlkY }) // presumably rows 0..nBlkY-1 — Range is defined elsewhere, confirm its bounds
+			for (auto dx : Range{ nBlkX, nBlkXP }) // presumably the extra columns nBlkX..nBlkXP-1 — confirm
+				MaskSmall[j * nBlkXP + dx] = MaskSmall[j * nBlkXP + nBlkX - 1]; // copy the last original column of this row
+	if (nBlkYP > nBlkY) // only when extra rows exist
+		for (auto i : Range{ nBlkXP }) // walks the padded width, so the columns filled above are replicated too
+			for (auto dy : Range{ nBlkY, nBlkYP }) // presumably the extra rows nBlkY..nBlkYP-1 — confirm
+				MaskSmall[nBlkXP * dy + i] = MaskSmall[nBlkXP * (nBlkY - 1) + i]; // copy the last original row
+}
+
+auto CheckAndPadSmallY(auto VXSmallY, auto VYSmallY, auto nBlkXP, auto nBlkYP, auto nBlkX, auto nBlkY) { // Extends the X/Y vector arrays (row stride nBlkXP) from nBlkX x nBlkY to nBlkXP x nBlkYP entries, clamping the component along the padded direction to <= 0.
+	using PixelType = std::decay_t<decltype(VXSmallY[0])>; // element type of the X array; the same Zero is also used for the Y array
+	constexpr auto Zero = static_cast<PixelType>(0); // upper bound passed to std::min below
+	if (nBlkXP > nBlkX) // only when extra columns exist
+		for (auto j : Range{ nBlkY }) // presumably rows 0..nBlkY-1 — Range is defined elsewhere, confirm its bounds
+			for (auto dx : Range{ nBlkX, nBlkXP }) { // presumably the extra columns nBlkX..nBlkXP-1 — confirm
+				VXSmallY[j * nBlkXP + dx] = std::min(VXSmallY[j * nBlkXP + nBlkX - 1], Zero); // last column's X, never positive; presumably keeps padded vectors from pointing further right — confirm intent
+				VYSmallY[j * nBlkXP + dx] = VYSmallY[j * nBlkXP + nBlkX - 1]; // Y is copied unchanged
+			}
+	if (nBlkYP > nBlkY) // only when extra rows exist
+		for (auto i : Range{ nBlkXP }) // walks the padded width, so the columns filled above are included
+			for (auto dy : Range{ nBlkY, nBlkYP }) { // presumably the extra rows nBlkY..nBlkYP-1 — confirm
+				VXSmallY[nBlkXP * dy + i] = VXSmallY[nBlkXP * (nBlkY - 1) + i]; // X is copied unchanged from the last original row
+				VYSmallY[nBlkXP * dy + i] = std::min(VYSmallY[nBlkXP * (nBlkY - 1) + i], Zero); // last row's Y, never positive
+			}
+}
+
 template <typename PixelType>
-static auto RealMerge4PlanesToBig(uint8_t *pel2Plane_u8, int32_t pel2Pitch, const uint8_t *pPlane0_u8, const uint8_t *pPlane1_u8,
-	const uint8_t *pPlane2_u8, const uint8_t * pPlane3_u8, int32_t width, int32_t height, int32_t pitch) {
-	for (auto h = 0; h<height; ++h) {
-		for (auto w = 0; w<width; ++w) {
-			auto pel2Plane = reinterpret_cast<PixelType *>(pel2Plane_u8);
-			auto pPlane0 = reinterpret_cast<const PixelType *>(pPlane0_u8);
-			auto pPlane1 = reinterpret_cast<const PixelType *>(pPlane1_u8);
+static auto RealMerge4PlanesToBig(uint8_t* pel2Plane_u8, int32_t pel2Pitch, const uint8_t* pPlane0_u8, const uint8_t* pPlane1_u8,
+	const uint8_t* pPlane2_u8, const uint8_t* pPlane3_u8, int32_t width, int32_t height, int32_t pitch) {
+	for (auto h = 0; h < height; ++h) {
+		for (auto w = 0; w < width; ++w) {
+			auto pel2Plane = reinterpret_cast<PixelType*>(pel2Plane_u8);
+			auto pPlane0 = reinterpret_cast<const PixelType*>(pPlane0_u8);
+			auto pPlane1 = reinterpret_cast<const PixelType*>(pPlane1_u8);
 			pel2Plane[w << 1] = pPlane0[w];
 			pel2Plane[(w << 1) + 1] = pPlane1[w];
 		}
 		pel2Plane_u8 += pel2Pitch;
-		for (auto w = 0; w<width; ++w) {
-			auto pel2Plane = reinterpret_cast<PixelType *>(pel2Plane_u8);
-			auto pPlane2 = reinterpret_cast<const PixelType *>(pPlane2_u8);
-			auto pPlane3 = reinterpret_cast<const PixelType *>(pPlane3_u8);
+		for (auto w = 0; w < width; ++w) {
+			auto pel2Plane = reinterpret_cast<PixelType*>(pel2Plane_u8);
+			auto pPlane2 = reinterpret_cast<const PixelType*>(pPlane2_u8);
+			auto pPlane3 = reinterpret_cast<const PixelType*>(pPlane3_u8);
 			pel2Plane[w << 1] = pPlane2[w];
 			pel2Plane[(w << 1) + 1] = pPlane3[w];
 		}
@@ -34,60 +62,60 @@ static auto RealMerge4PlanesToBig(uint8_t *pel2Plane_u8, int32_t pel2Pitch, cons
 	}
 }
 
-static auto Merge4PlanesToBig(uint8_t *pel2Plane, int32_t pel2Pitch, const uint8_t *pPlane0, const uint8_t *pPlane1, const uint8_t *pPlane2, const uint8_t *pPlane3, int32_t width, int32_t height, int32_t pitch) {
+static auto Merge4PlanesToBig(uint8_t* pel2Plane, int32_t pel2Pitch, const uint8_t* pPlane0, const uint8_t* pPlane1, const uint8_t* pPlane2, const uint8_t* pPlane3, int32_t width, int32_t height, int32_t pitch) { // Non-template entry point: forwards every argument unchanged to RealMerge4PlanesToBig<float>.
 	RealMerge4PlanesToBig<float>(pel2Plane, pel2Pitch, pPlane0, pPlane1, pPlane2, pPlane3, width, height, pitch);
 }
 
 template <typename PixelType>
-static void RealMerge16PlanesToBig(uint8_t *pel4Plane_u8, int32_t pel4Pitch,
-	const uint8_t *pPlane0_u8, const uint8_t *pPlane1_u8, const uint8_t *pPlane2_u8, const uint8_t * pPlane3_u8,
-	const uint8_t *pPlane4_u8, const uint8_t *pPlane5_u8, const uint8_t *pPlane6_u8, const uint8_t * pPlane7_u8,
-	const uint8_t *pPlane8_u8, const uint8_t *pPlane9_u8, const uint8_t *pPlane10_u8, const uint8_t * pPlane11_u8,
-	const uint8_t *pPlane12_u8, const uint8_t * pPlane13_u8, const uint8_t *pPlane14_u8, const uint8_t * pPlane15_u8,
+static void RealMerge16PlanesToBig(uint8_t* pel4Plane_u8, int32_t pel4Pitch,
+	const uint8_t* pPlane0_u8, const uint8_t* pPlane1_u8, const uint8_t* pPlane2_u8, const uint8_t* pPlane3_u8,
+	const uint8_t* pPlane4_u8, const uint8_t* pPlane5_u8, const uint8_t* pPlane6_u8, const uint8_t* pPlane7_u8,
+	const uint8_t* pPlane8_u8, const uint8_t* pPlane9_u8, const uint8_t* pPlane10_u8, const uint8_t* pPlane11_u8,
+	const uint8_t* pPlane12_u8, const uint8_t* pPlane13_u8, const uint8_t* pPlane14_u8, const uint8_t* pPlane15_u8,
 	int32_t width, int32_t height, int32_t pitch) {
-	for (auto h = 0; h<height; ++h) {
-		for (auto w = 0; w<width; ++w) {
-			PixelType *pel4Plane = (PixelType *)pel4Plane_u8;
-			const PixelType *pPlane0 = (const PixelType *)pPlane0_u8;
-			const PixelType *pPlane1 = (const PixelType *)pPlane1_u8;
-			const PixelType *pPlane2 = (const PixelType *)pPlane2_u8;
-			const PixelType *pPlane3 = (const PixelType *)pPlane3_u8;
+	for (auto h = 0; h < height; ++h) {
+		for (auto w = 0; w < width; ++w) {
+			PixelType* pel4Plane = (PixelType*)pel4Plane_u8;
+			const PixelType* pPlane0 = (const PixelType*)pPlane0_u8;
+			const PixelType* pPlane1 = (const PixelType*)pPlane1_u8;
+			const PixelType* pPlane2 = (const PixelType*)pPlane2_u8;
+			const PixelType* pPlane3 = (const PixelType*)pPlane3_u8;
 			pel4Plane[w << 2] = pPlane0[w];
 			pel4Plane[(w << 2) + 1] = pPlane1[w];
 			pel4Plane[(w << 2) + 2] = pPlane2[w];
 			pel4Plane[(w << 2) + 3] = pPlane3[w];
 		}
 		pel4Plane_u8 += pel4Pitch;
-		for (auto w = 0; w<width; ++w) {
-			PixelType *pel4Plane = (PixelType *)pel4Plane_u8;
-			const PixelType *pPlane4 = (const PixelType *)pPlane4_u8;
-			const PixelType *pPlane5 = (const PixelType *)pPlane5_u8;
-			const PixelType *pPlane6 = (const PixelType *)pPlane6_u8;
-			const PixelType *pPlane7 = (const PixelType *)pPlane7_u8;
+		for (auto w = 0; w < width; ++w) {
+			PixelType* pel4Plane = (PixelType*)pel4Plane_u8;
+			const PixelType* pPlane4 = (const PixelType*)pPlane4_u8;
+			const PixelType* pPlane5 = (const PixelType*)pPlane5_u8;
+			const PixelType* pPlane6 = (const PixelType*)pPlane6_u8;
+			const PixelType* pPlane7 = (const PixelType*)pPlane7_u8;
 			pel4Plane[w << 2] = pPlane4[w];
 			pel4Plane[(w << 2) + 1] = pPlane5[w];
 			pel4Plane[(w << 2) + 2] = pPlane6[w];
 			pel4Plane[(w << 2) + 3] = pPlane7[w];
 		}
 		pel4Plane_u8 += pel4Pitch;
-		for (auto w = 0; w<width; ++w) {
-			PixelType *pel4Plane = (PixelType *)pel4Plane_u8;
-			const PixelType *pPlane8 = (const PixelType *)pPlane8_u8;
-			const PixelType *pPlane9 = (const PixelType *)pPlane9_u8;
-			const PixelType *pPlane10 = (const PixelType *)pPlane10_u8;
-			const PixelType *pPlane11 = (const PixelType *)pPlane11_u8;
+		for (auto w = 0; w < width; ++w) {
+			PixelType* pel4Plane = (PixelType*)pel4Plane_u8;
+			const PixelType* pPlane8 = (const PixelType*)pPlane8_u8;
+			const PixelType* pPlane9 = (const PixelType*)pPlane9_u8;
+			const PixelType* pPlane10 = (const PixelType*)pPlane10_u8;
+			const PixelType* pPlane11 = (const PixelType*)pPlane11_u8;
 			pel4Plane[w << 2] = pPlane8[w];
 			pel4Plane[(w << 2) + 1] = pPlane9[w];
 			pel4Plane[(w << 2) + 2] = pPlane10[w];
 			pel4Plane[(w << 2) + 3] = pPlane11[w];
 		}
 		pel4Plane_u8 += pel4Pitch;
-		for (auto w = 0; w<width; ++w) {
-			PixelType *pel4Plane = (PixelType *)pel4Plane_u8;
-			const PixelType *pPlane12 = (const PixelType *)pPlane12_u8;
-			const PixelType *pPlane13 = (const PixelType *)pPlane13_u8;
-			const PixelType *pPlane14 = (const PixelType *)pPlane14_u8;
-			const PixelType *pPlane15 = (const PixelType *)pPlane15_u8;
+		for (auto w = 0; w < width; ++w) {
+			PixelType* pel4Plane = (PixelType*)pel4Plane_u8;
+			const PixelType* pPlane12 = (const PixelType*)pPlane12_u8;
+			const PixelType* pPlane13 = (const PixelType*)pPlane13_u8;
+			const PixelType* pPlane14 = (const PixelType*)pPlane14_u8;
+			const PixelType* pPlane15 = (const PixelType*)pPlane15_u8;
 			pel4Plane[w << 2] = pPlane12[w];
 			pel4Plane[(w << 2) + 1] = pPlane13[w];
 			pel4Plane[(w << 2) + 2] = pPlane14[w];
@@ -113,28 +141,28 @@ static void RealMerge16PlanesToBig(uint8_t *pel4Plane_u8, int32_t pel4Pitch,
 	}
 }
 
-static void Merge16PlanesToBig(uint8_t *pel4Plane, int32_t pel4Pitch,
-	const uint8_t *pPlane0, const uint8_t *pPlane1, const uint8_t *pPlane2, const uint8_t * pPlane3,
-	const uint8_t *pPlane4, const uint8_t *pPlane5, const uint8_t *pPlane6, const uint8_t * pPlane7,
-	const uint8_t *pPlane8, const uint8_t *pPlane9, const uint8_t *pPlane10, const uint8_t * pPlane11,
-	const uint8_t *pPlane12, const uint8_t * pPlane13, const uint8_t *pPlane14, const uint8_t * pPlane15,
+static void Merge16PlanesToBig(uint8_t* pel4Plane, int32_t pel4Pitch, // Non-template entry point: forwards every argument unchanged to RealMerge16PlanesToBig<float>.
+	const uint8_t* pPlane0, const uint8_t* pPlane1, const uint8_t* pPlane2, const uint8_t* pPlane3,
+	const uint8_t* pPlane4, const uint8_t* pPlane5, const uint8_t* pPlane6, const uint8_t* pPlane7,
+	const uint8_t* pPlane8, const uint8_t* pPlane9, const uint8_t* pPlane10, const uint8_t* pPlane11,
+	const uint8_t* pPlane12, const uint8_t* pPlane13, const uint8_t* pPlane14, const uint8_t* pPlane15, // the 16 source planes are passed through in the same order
 	int32_t width, int32_t height, int32_t pitch) {
 	RealMerge16PlanesToBig<float>(pel4Plane, pel4Pitch, pPlane0, pPlane1, pPlane2, pPlane3, pPlane4, pPlane5, pPlane6, pPlane7, pPlane8, pPlane9, pPlane10, pPlane11, pPlane12, pPlane13, pPlane14, pPlane15, width, height, pitch);
 }
 
-static void MakeVectorSmallMasks(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, int32_t *VXSmallY, int32_t pitchVXSmallY, int32_t *VYSmallY, int32_t pitchVYSmallY) {
+static void MakeVectorSmallMasks(MVClipBalls& mvClip, int32_t nBlkX, int32_t nBlkY, int32_t* VXSmallY, int32_t pitchVXSmallY, int32_t* VYSmallY, int32_t pitchVYSmallY) { // Copies the x and y motion-vector components of each of the nBlkX * nBlkY blocks in mvClip[0] into VXSmallY and VYSmallY.
 	for (auto by = 0; by < nBlkY; ++by)
 		for (auto bx = 0; bx < nBlkX; ++bx) {
 			auto i = bx + by * nBlkX;
-			auto &block = mvClip[0][i];
+			auto& block = mvClip[0][i]; // source blocks are indexed row-major with width nBlkX
 			int32_t vx = block.GetMV().x;
 			int32_t vy = block.GetMV().y;
-			VXSmallY[bx + by*pitchVXSmallY] = vx;
-			VYSmallY[bx + by*pitchVYSmallY] = vy;
+			VXSmallY[bx + by * pitchVXSmallY] = vx; // destination rows advance by the output's own pitch, not by nBlkX
+			VYSmallY[bx + by * pitchVYSmallY] = vy; // Y output uses its own, separately supplied pitch
 		}
 }
 
-static void VectorSmallMaskYToHalfUV(int32_t *VSmallY, int32_t nBlkX, int32_t nBlkY, int32_t *VSmallUV, int32_t ratioUV) {
+static void VectorSmallMaskYToHalfUV(int32_t* VSmallY, int32_t nBlkX, int32_t nBlkY, int32_t* VSmallUV, int32_t ratioUV) {
 	if (ratioUV == 2) {
 		for (auto by = 0; by < nBlkY; ++by) {
 			for (auto bx = 0; bx < nBlkX; ++bx)
@@ -154,12 +182,12 @@ static void VectorSmallMaskYToHalfUV(int32_t *VSmallY, int32_t nBlkX, int32_t nB
 }
 
 template <typename PixelType>
-static void RealBlend(uint8_t * pdst, const uint8_t * psrc, const uint8_t * pref, int32_t height, int32_t width, int32_t dst_pitch, int32_t src_pitch, int32_t ref_pitch, int32_t time256) {
-	for (auto h = 0; h<height; ++h) {
-		for (auto w = 0; w<width; ++w) {
-			const PixelType *psrc_ = (const PixelType *)psrc;
-			const PixelType *pref_ = (const PixelType *)pref;
-			PixelType *pdst_ = (PixelType *)pdst;
+static void RealBlend(uint8_t* pdst, const uint8_t* psrc, const uint8_t* pref, int32_t height, int32_t width, int32_t dst_pitch, int32_t src_pitch, int32_t ref_pitch, int32_t time256) {
+	for (auto h = 0; h < height; ++h) {
+		for (auto w = 0; w < width; ++w) {
+			const PixelType* psrc_ = (const PixelType*)psrc;
+			const PixelType* pref_ = (const PixelType*)pref;
+			PixelType* pdst_ = (PixelType*)pdst;
 			pdst_[w] = (psrc_[w] * (256 - time256) + pref_[w] * time256) / 256;
 		}
 		pdst += dst_pitch;
@@ -168,18 +196,18 @@ static void RealBlend(uint8_t * pdst, const uint8_t * psrc, const uint8_t * pref
 	}
 }
 
-static void Blend(uint8_t * pdst, const uint8_t * psrc, const uint8_t * pref, int32_t height, int32_t width, int32_t dst_pitch, int32_t src_pitch, int32_t ref_pitch, int32_t time256) {
+static void Blend(uint8_t* pdst, const uint8_t* psrc, const uint8_t* pref, int32_t height, int32_t width, int32_t dst_pitch, int32_t src_pitch, int32_t ref_pitch, int32_t time256) { // Non-template entry point: forwards every argument unchanged to RealBlend<float>.
 	RealBlend<float>(pdst, psrc, pref, height, width, dst_pitch, src_pitch, ref_pitch, time256);
 }
 
-static inline void ByteOccMask(double *occMask, int32_t occlusion, double occnorm, double fGamma) {
+static inline void ByteOccMask(double* occMask, int32_t occlusion, double occnorm, double fGamma) { // Raises *occMask to at least min(255 * occlusion * occnorm, 255); when fGamma != 1.0 the product occlusion * occnorm is first raised to the power fGamma. *occMask is never lowered.
 	if (fGamma == 1.0)
 		*occMask = std::max(*occMask, std::min((255. * occlusion * occnorm), 255.));
 	else
 		*occMask = std::max(*occMask, std::min((255. * pow(occlusion * occnorm, fGamma)), 255.));
 }
 
-static void MakeVectorOcclusionMaskTime(MVClipBalls &mvClip, bool isb, int32_t nBlkX, int32_t nBlkY, double dMaskNormDivider, double fGamma, int32_t nPel, double *occMask, int32_t occMaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) {
+static void MakeVectorOcclusionMaskTime(MVClipBalls& mvClip, bool isb, int32_t nBlkX, int32_t nBlkY, double dMaskNormDivider, double fGamma, int32_t nPel, double* occMask, int32_t occMaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) {
 	for (auto i = 0; i < occMaskPitch * nBlkY; ++i)
 		occMask[i] = 0.;
 	int time4096X = time256 * 16 / (nBlkStepX * nPel);
@@ -187,17 +215,17 @@ static void MakeVectorOcclusionMaskTime(MVClipBalls &mvClip, bool isb, int32_t n
 	double occnormX = 80. / (dMaskNormDivider * nBlkStepX * nPel);
 	double occnormY = 80. / (dMaskNormDivider * nBlkStepY * nPel);
 	int32_t occlusion;
-	for (auto by = 0; by<nBlkY; ++by)
-		for (auto bx = 0; bx<nBlkX; ++bx) {
-			int32_t i = bx + by*nBlkX;
-			auto &block = mvClip[0][i];
+	for (auto by = 0; by < nBlkY; ++by)
+		for (auto bx = 0; bx < nBlkX; ++bx) {
+			int32_t i = bx + by * nBlkX;
+			auto& block = mvClip[0][i];
 			int32_t vx = block.GetMV().x;
 			int32_t vy = block.GetMV().y;
 			if (bx < nBlkX - 1) {
 				int32_t i1 = i + 1;
-				auto &block1 = mvClip[0][i1];
+				auto& block1 = mvClip[0][i1];
 				int32_t vx1 = block1.GetMV().x;
-				if (vx1<vx) {
+				if (vx1 < vx) {
 					occlusion = vx - vx1;
 					int32_t minb = isb ? std::max(0, bx + 1 - occlusion * time4096X / 4096) : bx;
 					int32_t maxb = isb ? bx + 1 : std::min(bx + 1 - occlusion * time4096X / 4096, nBlkX - 1);
@@ -207,9 +235,9 @@ static void MakeVectorOcclusionMaskTime(MVClipBalls &mvClip, bool isb, int32_t n
 			}
 			if (by < nBlkY - 1) {
 				int32_t i1 = i + nBlkX;
-				auto &block1 = mvClip[0][i1];
+				auto& block1 = mvClip[0][i1];
 				int32_t vy1 = block1.GetMV().y;
-				if (vy1<vy) {
+				if (vy1 < vy) {
 					occlusion = vy - vy1;
 					int32_t minb = isb ? std::max(0, by + 1 - occlusion * time4096Y / 4096) : by;
 					int32_t maxb = isb ? by + 1 : std::min(by + 1 - occlusion * time4096Y / 4096, nBlkY - 1);
@@ -225,7 +253,7 @@ static double ByteNorm(double sad, double dSADNormFactor, double fGamma) {
 	return (l > 255.) ? 255. : l;
 }
 
-static void MakeSADMaskTime(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, double dSADNormFactor, double fGamma, int32_t nPel, double *Mask, int32_t MaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) {
+static void MakeSADMaskTime(MVClipBalls& mvClip, int32_t nBlkX, int32_t nBlkY, double dSADNormFactor, double fGamma, int32_t nPel, double* Mask, int32_t MaskPitch, int32_t time256, int32_t nBlkStepX, int32_t nBlkStepY) {
 	for (auto i = 0; i < nBlkY * MaskPitch; ++i)
 		Mask[i] = 0.;
 	int32_t time4096X = (256 - time256) * 16 / (nBlkStepX * nPel);
@@ -233,7 +261,7 @@ static void MakeSADMaskTime(MVClipBalls &mvClip, int32_t nBlkX, int32_t nBlkY, d
 	for (auto by = 0; by < nBlkY; ++by) {
 		for (auto bx = 0; bx < nBlkX; ++bx) {
 			auto i = bx + by * nBlkX;
-			auto &block = mvClip[0][i];
+			auto& block = mvClip[0][i];
 			int32_t vx = block.GetMV().x;
 			int32_t vy = block.GetMV().y;
 			int32_t bxi = bx - vx * time4096X / 4096;
@@ -256,33 +284,33 @@ static inline float Median3r(float a, float b, float c) {
 }
 
 template <typename PixelType>
-static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void RealFlowInterExtra(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch,
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel,
-	int32_t *VXFullBB, int32_t *VXFullFF, int32_t *VYFullBB, int32_t *VYFullFF) {
-	const PixelType *prefB = reinterpret_cast<const PixelType *>(prefB8);
-	const PixelType *prefF = reinterpret_cast<const PixelType *>(prefF8);
-	PixelType *pdst = (PixelType *)pdst8;
+	int32_t* VXFullBB, int32_t* VXFullFF, int32_t* VYFullBB, int32_t* VYFullFF) {
+	const PixelType* prefB = reinterpret_cast<const PixelType*>(prefB8);
+	const PixelType* prefF = reinterpret_cast<const PixelType*>(prefF8);
+	PixelType* pdst = (PixelType*)pdst8;
 	ref_pitch /= sizeof(PixelType);
 	dst_pitch /= sizeof(PixelType);
 	if (nPel == 1) {
-		for (auto h = 0; h<height; ++h) {
-			for (auto w = 0; w<width; ++w) {
+		for (auto h = 0; h < height; ++h) {
+			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				int32_t adrF = vyF*ref_pitch + vxF + w;
+				int32_t adrF = vyF * ref_pitch + vxF + w;
 				float dstF = prefF[adrF];
 				auto vxFF = (VXFullFF[w] * time256) >> 8;
 				auto vyFF = (VYFullFF[w] * time256) >> 8;
-				int32_t adrFF = vyFF*ref_pitch + vxFF + w;
+				int32_t adrFF = vyFF * ref_pitch + vxFF + w;
 				float dstFF = prefF[adrFF];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				int32_t adrB = vyB*ref_pitch + vxB + w;
+				int32_t adrB = vyB * ref_pitch + vxB + w;
 				float dstB = prefB[adrB];
 				auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8;
 				auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8;
-				int32_t adrBB = vyBB*ref_pitch + vxBB + w;
+				int32_t adrBB = vyBB * ref_pitch + vxBB + w;
 				float dstBB = prefB[adrBB];
 				float minfb;
 				float maxfb;
@@ -294,8 +322,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 					maxfb = dstB;
 					minfb = dstF;
 				}
-				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) +
-					((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) +
+					((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch;
@@ -313,23 +341,23 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 		}
 	}
 	else if (nPel == 2) {
-		for (auto h = 0; h<height; ++h) {
-			for (auto w = 0; w<width; ++w) {
+		for (auto h = 0; h < height; ++h) {
+			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				int32_t adrF = vyF*ref_pitch + vxF + (w << 1);
+				int32_t adrF = vyF * ref_pitch + vxF + (w << 1);
 				float dstF = prefF[adrF];
 				auto vxFF = (VXFullFF[w] * time256) >> 8;
 				auto vyFF = (VYFullFF[w] * time256) >> 8;
-				int32_t adrFF = vyFF*ref_pitch + vxFF + (w << 1);
+				int32_t adrFF = vyFF * ref_pitch + vxFF + (w << 1);
 				float dstFF = prefF[adrFF];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				int32_t adrB = vyB*ref_pitch + vxB + (w << 1);
+				int32_t adrB = vyB * ref_pitch + vxB + (w << 1);
 				float dstB = prefB[adrB];
 				auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8;
 				auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8;
-				int32_t adrBB = vyBB*ref_pitch + vxBB + (w << 1);
+				int32_t adrBB = vyBB * ref_pitch + vxBB + (w << 1);
 				float dstBB = prefB[adrBB];
 				float minfb;
 				float maxfb;
@@ -341,8 +369,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 					maxfb = dstB;
 					minfb = dstF;
 				}
-				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) +
-					((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) +
+					((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch << 1;
@@ -360,20 +388,20 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 		}
 	}
 	else if (nPel == 4) {
-		for (auto h = 0; h<height; ++h) {
-			for (auto w = 0; w<width; ++w) {
+		for (auto h = 0; h < height; ++h) {
+			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)];
+				float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)];
 				auto vxFF = (VXFullFF[w] * time256) >> 8;
 				auto vyFF = (VYFullFF[w] * time256) >> 8;
-				float dstFF = prefF[vyFF*ref_pitch + vxFF + (w << 2)];
+				float dstFF = prefF[vyFF * ref_pitch + vxFF + (w << 2)];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)];
+				float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)];
 				auto vxBB = (VXFullBB[w] * (256 - time256)) >> 8;
 				auto vyBB = (VYFullBB[w] * (256 - time256)) >> 8;
-				float dstBB = prefB[vyBB*ref_pitch + vxBB + (w << 2)];
+				float dstBB = prefB[vyBB * ref_pitch + vxBB + (w << 2)];
 				float minfb;
 				float maxfb;
 				if (dstF > dstB) {
@@ -384,8 +412,8 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 					maxfb = dstB;
 					minfb = dstF;
 				}
-				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb)*MaskF[w] + dstF*(255 - MaskF[w])) / 256)*(256 - time256) +
-					((Median3r(minfb, dstFF, maxfb)*MaskB[w] + dstB*(255 - MaskB[w])) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((Median3r(minfb, dstBB, maxfb) * MaskF[w] + dstF * (255 - MaskF[w])) / 256) * (256 - time256) +
+					((Median3r(minfb, dstFF, maxfb) * MaskB[w] + dstB * (255 - MaskB[w])) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch << 2;
@@ -404,20 +432,20 @@ static void RealFlowInterExtra(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 	}
 }
 
-static void FlowInterExtra(uint8_t * pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void FlowInterExtra(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch, // Non-template entry point: forwards every argument unchanged to RealFlowInterExtra<float>.
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel,
-	int32_t *VXFullBB, int32_t *VXFullFF, int32_t *VYFullBB, int32_t *VYFullFF) {
+	int32_t* VXFullBB, int32_t* VXFullFF, int32_t* VYFullBB, int32_t* VYFullFF) { // the four extra vector fields are passed last, in the same order
 	RealFlowInterExtra<float>(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel, VXFullBB, VXFullFF, VYFullBB, VYFullFF);
 }
 
 template <typename PixelType>
-static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void RealFlowInter(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch,
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) {
-	const PixelType *prefB = reinterpret_cast<const PixelType *>(prefB8);
-	const PixelType *prefF = reinterpret_cast<const PixelType *>(prefF8);
-	PixelType *pdst = reinterpret_cast<PixelType *>(pdst8);
+	const PixelType* prefB = reinterpret_cast<const PixelType*>(prefB8);
+	const PixelType* prefF = reinterpret_cast<const PixelType*>(prefF8);
+	PixelType* pdst = reinterpret_cast<PixelType*>(pdst8);
 	ref_pitch /= sizeof(PixelType);
 	dst_pitch /= sizeof(PixelType);
 	if (nPel == 1) {
@@ -425,14 +453,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref
 			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				double dstF = prefF[vyF*ref_pitch + vxF + w];
+				double dstF = prefF[vyF * ref_pitch + vxF + w];
 				float dstF0 = prefF[w];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				double dstB = prefB[vyB*ref_pitch + vxB + w];
+				double dstB = prefB[vyB * ref_pitch + vxB + w];
 				float dstB0 = prefB[w];
-				pdst[w] = static_cast<PixelType>((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) +
-					((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) +
+					((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch;
@@ -450,14 +478,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref
 			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)];
+				float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)];
 				float dstF0 = prefF[(w << 1)];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)];
+				float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)];
 				float dstB0 = prefB[(w << 1)];
-				pdst[w] = static_cast<PixelType>((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) +
-					((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) +
+					((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch << 1;
@@ -475,14 +503,14 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref
 			for (auto w = 0; w < width; ++w) {
 				auto vxF = (VXFullF[w] * time256) >> 8;
 				auto vyF = (VYFullF[w] * time256) >> 8;
-				float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)];
+				float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)];
 				float dstF0 = prefF[(w << 2)];
 				auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 				auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-				float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)];
+				float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)];
 				float dstB0 = prefB[(w << 2)];
-				pdst[w] = static_cast<PixelType>((((dstF*(255 - MaskF[w]) + ((MaskF[w] * (dstB*(255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256)*(256 - time256) +
-					((dstB*(255 - MaskB[w]) + ((MaskB[w] * (dstF*(255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256)*time256) / 256);
+				pdst[w] = static_cast<PixelType>((((dstF * (255 - MaskF[w]) + ((MaskF[w] * (dstB * (255 - MaskB[w]) + MaskB[w] * dstF0)) / 256)) / 256) * (256 - time256) +
+					((dstB * (255 - MaskB[w]) + ((MaskB[w] * (dstF * (255 - MaskF[w]) + MaskF[w] * dstB0)) / 256)) / 256) * time256) / 256);
 			}
 			pdst += dst_pitch;
 			prefB += ref_pitch << 2;
@@ -497,37 +525,37 @@ static void RealFlowInter(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *pref
 	}
 }
 
-static void FlowInter(uint8_t *pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void FlowInter(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch,
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) {
 	RealFlowInter<float>(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel);
 }
 
 template <typename PixelType>
-static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void RealFlowInterSimple(uint8_t* pdst8, int32_t dst_pitch, const uint8_t* prefB8, const uint8_t* prefF8, int32_t ref_pitch,
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) {
-	const PixelType *prefB = reinterpret_cast<const PixelType *>(prefB8);
-	const PixelType *prefF = reinterpret_cast<const PixelType *>(prefF8);
-	PixelType *pdst = (PixelType *)pdst8;
+	const PixelType* prefB = reinterpret_cast<const PixelType*>(prefB8);
+	const PixelType* prefF = reinterpret_cast<const PixelType*>(prefF8);
+	PixelType* pdst = (PixelType*)pdst8;
 	ref_pitch /= sizeof(PixelType);
 	dst_pitch /= sizeof(PixelType);
 	if (time256 == 128) {
 		if (nPel == 1) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 2) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 2) {
 					auto vxF = VXFullF[w] >> 1;
 					auto vyF = VYFullF[w] >> 1;
-					int32_t addrF = vyF*ref_pitch + vxF + w;
+					int32_t addrF = vyF * ref_pitch + vxF + w;
 					float dstF = prefF[addrF];
 					float dstF1 = prefF[addrF + 1];
 					auto vxB = VXFullB[w] >> 1;
 					auto vyB = VYFullB[w] >> 1;
-					int32_t addrB = vyB*ref_pitch + vxB + w;
+					int32_t addrB = vyB * ref_pitch + vxB + w;
 					float dstB = prefB[addrB];
 					float dstB1 = prefB[addrB + 1];
-					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512);
-					pdst[w + 1] = static_cast<PixelType>((((dstF1 + dstB1) * 256) + (dstB1 - dstF1)*(MaskF[w + 1] - MaskB[w + 1])) / 512);
+					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512);
+					pdst[w + 1] = static_cast<PixelType>((((dstF1 + dstB1) * 256) + (dstB1 - dstF1) * (MaskF[w + 1] - MaskB[w + 1])) / 512);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch;
@@ -541,15 +569,15 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 			}
 		}
 		else if (nPel == 2) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 1) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 1) {
 					auto vxF = VXFullF[w] >> 1;
 					auto vyF = VYFullF[w] >> 1;
-					float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)];
+					float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)];
 					auto vxB = VXFullB[w] >> 1;
 					auto vyB = VYFullB[w] >> 1;
-					float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)];
-					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512);
+					float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)];
+					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch << 1;
@@ -563,15 +591,15 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 			}
 		}
 		else if (nPel == 4) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 1) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 1) {
 					auto vxF = VXFullF[w] >> 1;
 					auto vyF = VYFullF[w] >> 1;
-					float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)];
+					float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)];
 					auto vxB = VXFullB[w] >> 1;
 					auto vyB = VYFullB[w] >> 1;
-					float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)];
-					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF)*(MaskF[w] - MaskB[w])) / 512);
+					float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)];
+					pdst[w] = static_cast<PixelType>((((dstF + dstB) * 256) + (dstB - dstF) * (MaskF[w] - MaskB[w])) / 512);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch << 2;
@@ -587,22 +615,22 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 	}
 	else {
 		if (nPel == 1) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 2) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 2) {
 					auto vxF = (VXFullF[w] * time256) >> 8;
 					auto vyF = (VYFullF[w] * time256) >> 8;
-					int32_t addrF = vyF*ref_pitch + vxF + w;
+					int32_t addrF = vyF * ref_pitch + vxF + w;
 					float dstF = prefF[addrF];
 					float dstF1 = prefF[addrF + 1];
 					auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 					auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-					int32_t addrB = vyB*ref_pitch + vxB + w;
+					int32_t addrB = vyB * ref_pitch + vxB + w;
 					float dstB = prefB[addrB];
 					float dstB1 = prefB[addrB + 1];
-					pdst[w] = static_cast<PixelType>((((dstF * 255 + (dstB - dstF)*MaskF[w]))*(256 - time256) +
-						((dstB * 255 - (dstB - dstF)*MaskB[w]))*time256) / 65536);
-					pdst[w + 1] = static_cast<PixelType>((((dstF1 * 255 + (dstB1 - dstF1)*MaskF[w + 1]))*(256 - time256) +
-						((dstB1 * 255 - (dstB1 - dstF1)*MaskB[w + 1]))*time256) / 65536);
+					pdst[w] = static_cast<PixelType>((((dstF * 255 + (dstB - dstF) * MaskF[w])) * (256 - time256) +
+						((dstB * 255 - (dstB - dstF) * MaskB[w])) * time256) / 65536);
+					pdst[w + 1] = static_cast<PixelType>((((dstF1 * 255 + (dstB1 - dstF1) * MaskF[w + 1])) * (256 - time256) +
+						((dstB1 * 255 - (dstB1 - dstF1) * MaskB[w + 1])) * time256) / 65536);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch;
@@ -616,16 +644,16 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 			}
 		}
 		else if (nPel == 2) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 1) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 1) {
 					auto vxF = (VXFullF[w] * time256) >> 8;
 					auto vyF = (VYFullF[w] * time256) >> 8;
-					float dstF = prefF[vyF*ref_pitch + vxF + (w << 1)];
+					float dstF = prefF[vyF * ref_pitch + vxF + (w << 1)];
 					auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 					auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-					float dstB = prefB[vyB*ref_pitch + vxB + (w << 1)];
-					pdst[w] = static_cast<PixelType>((((dstF*(255 - MaskF[w]) + dstB*MaskF[w]) / 256)*(256 - time256) +
-						((dstB*(255 - MaskB[w]) + dstF*MaskB[w]) / 256)*time256) / 256);
+					float dstB = prefB[vyB * ref_pitch + vxB + (w << 1)];
+					pdst[w] = static_cast<PixelType>((((dstF * (255 - MaskF[w]) + dstB * MaskF[w]) / 256) * (256 - time256) +
+						((dstB * (255 - MaskB[w]) + dstF * MaskB[w]) / 256) * time256) / 256);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch << 1;
@@ -639,16 +667,16 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 			}
 		}
 		else if (nPel == 4) {
-			for (auto h = 0; h<height; ++h) {
-				for (auto w = 0; w<width; w += 1) {
+			for (auto h = 0; h < height; ++h) {
+				for (auto w = 0; w < width; w += 1) {
 					auto vxF = (VXFullF[w] * time256) >> 8;
 					auto vyF = (VYFullF[w] * time256) >> 8;
-					float dstF = prefF[vyF*ref_pitch + vxF + (w << 2)];
+					float dstF = prefF[vyF * ref_pitch + vxF + (w << 2)];
 					auto vxB = (VXFullB[w] * (256 - time256)) >> 8;
 					auto vyB = (VYFullB[w] * (256 - time256)) >> 8;
-					float dstB = prefB[vyB*ref_pitch + vxB + (w << 2)];
-					pdst[w] = static_cast<PixelType>((((dstF*(255 - MaskF[w]) + dstB*MaskF[w]) / 256)*(256 - time256) +
-						((dstB*(255 - MaskB[w]) + dstF*MaskB[w]) / 256)*time256) / 256);
+					float dstB = prefB[vyB * ref_pitch + vxB + (w << 2)];
+					pdst[w] = static_cast<PixelType>((((dstF * (255 - MaskF[w]) + dstB * MaskF[w]) / 256) * (256 - time256) +
+						((dstB * (255 - MaskB[w]) + dstF * MaskB[w]) / 256) * time256) / 256);
 				}
 				pdst += dst_pitch;
 				prefB += ref_pitch << 2;
@@ -664,8 +692,8 @@ static void RealFlowInterSimple(uint8_t *pdst8, int32_t dst_pitch, const uint8_t
 	}
 }
 
-static void FlowInterSimple(uint8_t *pdst, int32_t dst_pitch, const uint8_t *prefB, const uint8_t *prefF, int32_t ref_pitch,
-	int32_t *VXFullB, int32_t *VXFullF, int32_t *VYFullB, int32_t *VYFullF, double *MaskB, double *MaskF,
+static void FlowInterSimple(uint8_t* pdst, int32_t dst_pitch, const uint8_t* prefB, const uint8_t* prefF, int32_t ref_pitch,
+	int32_t* VXFullB, int32_t* VXFullF, int32_t* VYFullB, int32_t* VYFullF, double* MaskB, double* MaskF,
 	int32_t VPitch, int32_t width, int32_t height, int32_t time256, int32_t nPel) {
 	RealFlowInterSimple<float>(pdst, dst_pitch, prefB, prefF, ref_pitch, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, VPitch, width, height, time256, nPel);
 }
diff --git a/src/PlaneOfBlocks.h b/src/PlaneOfBlocks.h
index 7bd68b6..c14b372 100644
--- a/src/PlaneOfBlocks.h
+++ b/src/PlaneOfBlocks.h
@@ -129,16 +129,16 @@ class PlaneOfBlocks {
 	inline const uint8_t* GetRefBlockV(int32_t nVx, int32_t nVy) {
 		if (nPel == 2)
 			return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel2(
-				x[1] * 2 + nVx / xRatioUV,
-				y[1] * 2 + nVy / yRatioUV);
+				x[2] * 2 + nVx / xRatioUV,
+				y[2] * 2 + nVy / yRatioUV);
 		else if (nPel == 1)
 			return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel1(
-				x[1] + nVx / xRatioUV,
-				y[1] + nVy / yRatioUV);
+				x[2] + nVx / xRatioUV,
+				y[2] + nVy / yRatioUV);
 		else
 			return pRefFrame->GetPlane(VPLANE)->GetAbsolutePointerPel4(
-				x[1] * 4 + nVx / xRatioUV,
-				y[1] * 4 + nVy / yRatioUV);
+				x[2] * 4 + nVx / xRatioUV,
+				y[2] * 4 + nVy / yRatioUV);
 	}
 	inline const uint8_t* GetSrcBlock(int32_t nX, int32_t nY) {
 		return pSrcFrame->GetPlane(YPLANE)->GetAbsolutePelPointer(nX, nY);
@@ -253,7 +253,7 @@ class PlaneOfBlocks {
 			bestMV.x = vx;
 			bestMV.y = vy;
 			nMinCost = cost;
-			bestMV.sad = static_cast<float>(sad + saduv);
+			bestMV.sad = sad + saduv;
 		}
 	}
 	inline void CheckMV(int32_t vx, int32_t vy) {
@@ -270,7 +270,7 @@ class PlaneOfBlocks {
 			bestMV.x = vx;
 			bestMV.y = vy;
 			nMinCost = cost;
-			bestMV.sad = static_cast<float>(sad + saduv);
+			bestMV.sad = sad + saduv;
 		}
 	}
 	inline void CheckMV2(int32_t vx, int32_t vy, int32_t* dir, int32_t val) {
@@ -287,7 +287,7 @@ class PlaneOfBlocks {
 			bestMV.x = vx;
 			bestMV.y = vy;
 			nMinCost = cost;
-			bestMV.sad = static_cast<float>(sad + saduv);
+			bestMV.sad = sad + saduv;
 			*dir = val;
 		}
 	}
@@ -303,7 +303,7 @@ class PlaneOfBlocks {
 			cost += saduv + ((penaltyNew * saduv) / 256);
 			if (cost >= nMinCost) return;
 			nMinCost = cost;
-			bestMV.sad = static_cast<float>(sad + saduv);
+			bestMV.sad = sad + saduv;
 			*dir = val;
 		}
 	}
@@ -528,7 +528,7 @@ class PlaneOfBlocks {
 		SATD = satds[nBlkSizeX][nBlkSizeY];
 		if (!chroma)
 			SADCHROMA = nullptr;
-		dctpitch = max(nBlkSizeX, 16) * 4;
+		dctpitch = nBlkSizeX * sizeof(float);
 		dctSrc = vs_aligned_malloc<uint8_t>(nBlkSizeY * dctpitch, ALIGN_PLANES);
 		dctRef = vs_aligned_malloc<uint8_t>(nBlkSizeY * dctpitch, ALIGN_PLANES);
 
@@ -542,7 +542,7 @@ class PlaneOfBlocks {
 
 		freqSize = 8192 * nPel * 2;
 		freqArray = new int32_t[freqSize];
-		verybigSAD = static_cast<double>(nBlkSizeX) * nBlkSizeY;
+		verybigSAD = 1. * nBlkSizeX * nBlkSizeY;
 	}
 	~PlaneOfBlocks() {
 		delete[] vectors;
@@ -805,7 +805,7 @@ class PlaneOfBlocks {
 			+ SADCHROMA(pSrc[2], nSrcPitch[2], GetRefBlockV(0, 0), nRefPitch[2]) : 0.f;
 		sad = LumaSAD(GetRefBlock(0, zeroMVfieldShifted.y));
 		sad += saduv;
-		bestMV.sad = static_cast<float>(sad);
+		bestMV.sad = sad;
 		nMinCost = sad + ((penaltyZero * sad) / 256); // v.1.11.0.2
 
 		VectorStructure bestMVMany[8];
@@ -831,7 +831,7 @@ class PlaneOfBlocks {
 		{
 			bestMV.x = globalMVPredictor.x;
 			bestMV.y = globalMVPredictor.y;
-			bestMV.sad = static_cast<float>(sad);
+			bestMV.sad = sad;
 			nMinCost = cost;
 		}
 		if (tryMany)
@@ -851,7 +851,7 @@ class PlaneOfBlocks {
 		{
 			bestMV.x = predictor.x;
 			bestMV.y = predictor.y;
-			bestMV.sad = static_cast<float>(sad);
+			bestMV.sad = sad;
 			nMinCost = cost;
 		}
 		if (tryMany)
@@ -864,11 +864,12 @@ class PlaneOfBlocks {
 
 		// then all the other predictors
 		int32_t npred = (temporal) ? 5 : 4;
+		constexpr auto epsilon = 1e-5;
 
 		for (int32_t i = 0; i < npred; i++)
 		{
 			if (tryMany)
-				nMinCost = verybigSAD;
+				nMinCost = verybigSAD + epsilon;
 			CheckMV0(predictors[i].x, predictors[i].y);
 			if (tryMany)
 			{
@@ -880,9 +881,8 @@ class PlaneOfBlocks {
 		}
 
 
-		if (tryMany)
-		{ // select best of multi best
-			nMinCost = verybigSAD;
+		if (tryMany) { // select best of multi best
+			nMinCost = verybigSAD + epsilon;
 			for (int32_t i = 0; i < npred + 3; i++)
 			{
 				if (nMinCostMany[i] < nMinCost)
@@ -1149,6 +1149,7 @@ class PlaneOfBlocks {
 		planeSAD = 0.0;
 		badcount = 0;
 		tryMany = _tryMany;
+		sumLumaChange = 0.;
 		// Functions using double must not be used here
 
 		for (blky = 0; blky < nBlkY; blky++)
@@ -1226,25 +1227,17 @@ class PlaneOfBlocks {
 				/* search the mv */
 				predictor = ClipMV(vectors[blkIdx]);
 				if (temporal)
-					predictors[4] = ClipMV(*reinterpret_cast<VectorStructure*>(&vecPrev[blkIdx * N_PER_BLOCK])); // temporal predictor
+					predictors[4] = ClipMV(reinterpret_cast<VectorStructure&>(vecPrev[blkIdx * N_PER_BLOCK])); // temporal predictor
 				else
 					predictors[4] = ClipMV(zeroMV);
 
 				PseudoEPZSearch();
 
-				if (outfilebuf != nullptr) // write vector to outfile
-				{
-					outfilebuf[blkx * 4 + 0] = bestMV.x;
-					outfilebuf[blkx * 4 + 1] = bestMV.y;
-					outfilebuf[blkx * 4 + 2] = reinterpret_cast<int32_t&>(bestMV.sad);
-					
-				}
 
 				/* write the results */
-				pBlkData[blkx * N_PER_BLOCK + 0] = bestMV.x;
-				pBlkData[blkx * N_PER_BLOCK + 1] = bestMV.y;
-				pBlkData[blkx * N_PER_BLOCK + 2] = reinterpret_cast<int32_t&>(bestMV.sad);
 
+				auto& BlockData = reinterpret_cast<VectorStructure&>(pBlkData[blkx * N_PER_BLOCK]);
+				BlockData = bestMV;
 
 
 				if (smallestPlane)
@@ -1261,8 +1254,6 @@ class PlaneOfBlocks {
 				}
 			}
 			pBlkData += nBlkX * N_PER_BLOCK;
-			if (outfilebuf != nullptr) // write vector to outfile
-				outfilebuf += nBlkX * 4;// 4 int32_t word per block
 
 			y[0] += (nBlkSizeY - nOverlapY);
 			if (pSrcFrame->GetMode() & UPLANE)
@@ -1327,7 +1318,7 @@ class PlaneOfBlocks {
 				{
 					vectors[index].x = 9 * v1.x + 3 * v2.x + 3 * v3.x + v4.x;
 					vectors[index].y = 9 * v1.y + 3 * v2.y + 3 * v3.y + v4.y;
-					temp_sad = 9 * static_cast<double>(v1.sad) + 3 * v2.sad + 3 * v3.sad + v4.sad;
+					temp_sad = 9 * v1.sad + 3 * v2.sad + 3 * v3.sad + v4.sad;
 				}
 				else if (nOverlapX <= (nBlkSizeX >> 1) && nOverlapY <= (nBlkSizeY >> 1)) // corrected in v1.4.11
 				{
@@ -1346,11 +1337,11 @@ class PlaneOfBlocks {
 					// Dead branch. The overlap is no longer allowed to be more than half the block size.
 					vectors[index].x = (v1.x + v2.x + v3.x + v4.x) << 2;
 					vectors[index].y = (v1.y + v2.y + v3.y + v4.y) << 2;
-					temp_sad = (static_cast<double>(v1.sad) + v2.sad + v3.sad + v4.sad) * 4;
+					temp_sad = (v1.sad + v2.sad + v3.sad + v4.sad) * 4;
 				}
 				vectors[index].x = vectors[index].x ? vectors[index].x / abs(vectors[index].x) * ((abs(vectors[index].x) >> normFactor) << mulFactor) : 0;
 				vectors[index].y = vectors[index].y ? vectors[index].y / abs(vectors[index].y) * ((abs(vectors[index].y) >> normFactor) << mulFactor) : 0;
-				vectors[index].sad = static_cast<float>(temp_sad / 16);
+				vectors[index].sad = temp_sad / 16;
 			}
 		}
 	}
@@ -1358,25 +1349,22 @@ class PlaneOfBlocks {
 		array[0] = nBlkCount * N_PER_BLOCK + 1;
 	}
 	auto WriteDefaultToArray(int32_t* array, int32_t divideMode) {
-		auto verybigSAD_f = static_cast<float>(verybigSAD);
 		array[0] = nBlkCount * N_PER_BLOCK + 1;
-		for (int32_t i = 0; i < nBlkCount * N_PER_BLOCK; i += N_PER_BLOCK)
-		{
-			array[i + 1] = 0;
-			array[i + 2] = 0;
-			array[i + 3] = reinterpret_cast<std::int32_t&>(verybigSAD_f);
+		for (auto i : Range{ 0, nBlkCount * N_PER_BLOCK, N_PER_BLOCK }) {
+			auto& BlockData = reinterpret_cast<VectorStructure&>(array[i + 1]);
+			BlockData.x = 0;
+			BlockData.y = 0;
+			BlockData.sad = verybigSAD;
 		}
-
-		if (nLogScale == 0)
-		{
+		if (nLogScale == 0) {
 			array += array[0];
 			if (divideMode) { // reserve space for divided subblocks extra level
 				array[0] = nBlkCount * N_PER_BLOCK * 4 + 1; // 4 subblocks
-				for (int32_t i = 0; i < nBlkCount * 4 * N_PER_BLOCK; i += N_PER_BLOCK)
-				{
-					array[i + 1] = 0;
-					array[i + 2] = 0;
-					array[i + 3] = reinterpret_cast<std::int32_t&>(verybigSAD_f);
+				for (auto i : Range{ 0, nBlkCount * 4 * N_PER_BLOCK, N_PER_BLOCK }) {
+					auto& BlockData = reinterpret_cast<VectorStructure&>(array[i + 1]);
+					BlockData.x = 0;
+					BlockData.y = 0;
+					BlockData.sad = verybigSAD;
 				}
 				array += array[0];
 			}
@@ -1632,15 +1620,15 @@ class PlaneOfBlocks {
 					// interpolate
 					int32_t vector1_x = vectorOld1.x * nStepXold + deltaX * (vectorOld2.x - vectorOld1.x); // scaled by nStepXold to skip slow division
 					int32_t vector1_y = vectorOld1.y * nStepXold + deltaX * (vectorOld2.y - vectorOld1.y);
-					auto vector1_sad = static_cast<double>(vectorOld1.sad) * nStepXold + deltaX * (vectorOld2.sad - vectorOld1.sad);
+					auto vector1_sad = vectorOld1.sad * nStepXold + deltaX * (vectorOld2.sad - vectorOld1.sad);
 
 					int32_t vector2_x = vectorOld3.x * nStepXold + deltaX * (vectorOld4.x - vectorOld3.x);
 					int32_t vector2_y = vectorOld3.y * nStepXold + deltaX * (vectorOld4.y - vectorOld3.y);
-					auto vector2_sad = static_cast<double>(vectorOld3.sad) * nStepXold + deltaX * (vectorOld4.sad - vectorOld3.sad);
+					auto vector2_sad = vectorOld3.sad * nStepXold + deltaX * (vectorOld4.sad - vectorOld3.sad);
 
 					vectorOld.x = (vector1_x + deltaY * (vector2_x - vector1_x) / nStepYold) / nStepXold;
 					vectorOld.y = (vector1_y + deltaY * (vector2_y - vector1_y) / nStepYold) / nStepXold;
-					vectorOld.sad = static_cast<float>((vector1_sad + deltaY * (vector2_sad - vector1_sad) / nStepYold) / nStepXold);
+					vectorOld.sad = (vector1_sad + deltaY * (vector2_sad - vector1_sad) / nStepYold) / nStepXold;
 
 				}
 				else // nearest
@@ -1660,7 +1648,7 @@ class PlaneOfBlocks {
 				vectorOld.y = vectorOld.y ? vectorOld.y / abs(vectorOld.y) * ((abs(vectorOld.y) << nLogPel) >> nLogPelold) : 0;
 
 				predictor = ClipMV(vectorOld); // predictor
-				predictor.sad = static_cast<float>(static_cast<double>(vectorOld.sad) * (nBlkSizeX * nBlkSizeY) / (nBlkSizeXold * nBlkSizeYold)); // normalized to new block size
+				predictor.sad = vectorOld.sad * (nBlkSizeX * nBlkSizeY) / (nBlkSizeXold * nBlkSizeYold); // normalized to new block size
 
 				bestMV.x = predictor.x;
 				bestMV.y = predictor.y;
@@ -1680,7 +1668,7 @@ class PlaneOfBlocks {
 					+ SADCHROMA(pSrc[2], nSrcPitch[2], GetRefBlockV(predictor.x, predictor.y), nRefPitch[2]) : 0.f;
 				double sad = LumaSAD(GetRefBlock(predictor.x, predictor.y));
 				sad += saduv;
-				bestMV.sad = static_cast<float>(sad);
+				bestMV.sad = sad;
 				nMinCost = sad;
 
 
@@ -1741,23 +1729,10 @@ class PlaneOfBlocks {
 				vectors[blkIdx].sad = bestMV.sad;
 
 
-				if (outfilebuf != nullptr) // write vector to outfile
-				{
-					outfilebuf[blkx * 4 + 0] = bestMV.x;
-					outfilebuf[blkx * 4 + 1] = bestMV.y;
-					outfilebuf[blkx * 4 + 2] = reinterpret_cast<int32_t&>(bestMV.sad);
-					
-				}
-
 				/* write the results */
-				pBlkData[blkx * N_PER_BLOCK + 0] = bestMV.x;
-				pBlkData[blkx * N_PER_BLOCK + 1] = bestMV.y;
-				pBlkData[blkx * N_PER_BLOCK + 2] = reinterpret_cast<int32_t&>(bestMV.sad);
-
-
+				auto& BlockData = reinterpret_cast<VectorStructure&>(pBlkData[blkx * N_PER_BLOCK]);
+				BlockData = bestMV;
 
-				if (smallestPlane)
-					sumLumaChange += LUMA(GetRefBlock(0, 0), nRefPitch[0]) - LUMA(pSrc[0], nSrcPitch[0]);
 
 				if (iblkx < nBlkX - 1)
 				{
@@ -1769,8 +1744,6 @@ class PlaneOfBlocks {
 				}
 			}
 			pBlkData += nBlkX * N_PER_BLOCK;
-			if (outfilebuf != nullptr) // write vector to outfile
-				outfilebuf += nBlkX * 4;// 4 int32_t word per block
 
 			y[0] += (nBlkSizeY - nOverlapY);
 			if (pSrcFrame->GetMode() & UPLANE)
diff --git a/src/SimpleResize.hpp b/src/SimpleResize.hpp
index e7004c4..d331e71 100644
--- a/src/SimpleResize.hpp
+++ b/src/SimpleResize.hpp
@@ -9,6 +9,9 @@ class SimpleResize {
 	int32_t dst_height;
 	int32_t src_width;
 	int32_t src_height;
+	self(limit_width, 0);
+	self(limit_height, 0);
+	self(pel, 0);
 	int32_t *vertical_offsets;
 	double *vertical_weights;
 	int32_t *horizontal_offsets;
@@ -39,11 +42,14 @@ class SimpleResize {
 		}
 	}
 public:
-	SimpleResize(int32_t _dst_width, int32_t _dst_height, int32_t _src_width, int32_t _src_height) {
+	SimpleResize(int32_t _dst_width, int32_t _dst_height, int32_t _src_width, int32_t _src_height, auto limit_width, auto limit_height, auto pel) {
 		src_width = _src_width;
 		src_height = _src_height;
 		dst_width = _dst_width;
 		dst_height = _dst_height;
+		this->limit_width = limit_width;
+		this->limit_height = limit_height;
+		this->pel = pel;
 		vertical_offsets = new int32_t[dst_height];
 		vertical_weights = new double[dst_height];
 		horizontal_offsets = new int32_t[dst_width];
@@ -57,7 +63,14 @@ class SimpleResize {
 		delete[] horizontal_offsets;
 		delete[] horizontal_weights;
 	}
-	auto Resize(T *dstp, int32_t dst_stride, const T *srcp, int32_t src_stride) {
+	auto Resize(T *dstp, int32_t dst_stride, const T *srcp, int32_t src_stride, auto horizontal_vectors) {
+		constexpr auto limit_vectors = requires(T x) { x << 0; };
+		auto minimum = static_cast<T>(0);
+		auto maximum = static_cast<T>(limit_height * pel - 1);
+		auto horizontal_step = horizontal_vectors ? pel : 0;
+		auto vertical_step = horizontal_vectors ? 0 : pel;
+
+
 		const T *srcp1;
 		const T *srcp2;
 		auto workp = new double[src_width];
@@ -68,13 +81,32 @@ class SimpleResize {
 			srcp2 = srcp1 + src_stride;
 			for (auto x = 0; x < src_width; ++x)
 				workp[x] = srcp1[x] * weight_top + srcp2[x] * weight_bottom;
+
+			if (horizontal_vectors) {
+				minimum = 0;
+				maximum = limit_width * pel - 1;
+			}
+
 			for (auto x = 0; x < dst_width; ++x) {
-				double weight_right = horizontal_weights[x];
-				double weight_left = 1. - weight_right;
-				int32_t offset = horizontal_offsets[x];
-				dstp[x] = static_cast<T>(workp[offset] * weight_left + workp[offset + 1] * weight_right);
+				auto weight_right = horizontal_weights[x];
+				auto weight_left = 1. - weight_right;
+				auto offset = horizontal_offsets[x];
+				auto result = static_cast<T>(workp[offset] * weight_left + workp[offset + 1] * weight_right);
+
+				if constexpr (limit_vectors) {
+					result = std::max(minimum, std::min(result, maximum));
+					minimum -= horizontal_step;
+					maximum -= horizontal_step;
+				}
+
+				dstp[x] = result;
 			}
 			dstp += dst_stride;
+
+			if constexpr (limit_vectors) {
+				minimum -= vertical_step;
+				maximum -= vertical_step;
+			}
 		}
 		delete[] workp;
 	}