diff --git a/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c new file mode 100644 index 000000000..8687b9620 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c @@ -0,0 +1,295 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include +#include +#include "../resizer.h" +#include "resizer_neon.h" + +#if OPTIMISED_COEFF +void hresize_neon(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_neon(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + // int first_col_count = 0; + uint8x8_t src1_8x8, src2_8x8, src3_8x8; + int simd_loop = (xmax / 8) * 8; + int num_pix = 8; + +#if OPTIMISED_COEFF + int sx_start = 2; +#else + int sx_start = xofs[1]; +#endif + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; + + int start = sx_start - cn; + src1_8x8 = vld1_u8(S + start); +#if OPTIMISED_COEFF + for (; dx < simd_loop;) + { +#else + for (; dx < simd_loop; alpha += 32) + { +#endif + start += num_pix; + src2_8x8 = vld1_u8(S + start); + start += num_pix; + src3_8x8 = vld1_u8(S + start); + + uint16x8_t movl1_16x8 = vmovl_u8(src1_8x8); + uint16x8_t movl2_16x8 = vmovl_u8(src2_8x8); + uint16x8_t movl3_16x8 = vmovl_u8(src3_8x8); + int16x8_t s_movl1_16x8 = vreinterpretq_s16_u16(movl1_16x8); + int16x8_t s_movl2_16x8 = vreinterpretq_s16_u16(movl2_16x8); + int16x8_t s_movl3_16x8 = vreinterpretq_s16_u16(movl3_16x8); + int16x8x2_t t1 = vuzpq_s16(s_movl1_16x8, s_movl2_16x8); // 0 odd, 1 even + int16x8x2_t t2 = vuzpq_s16(s_movl3_16x8, s_movl3_16x8); + int16x8_t vx1 = vextq_s16(t1.val[0], t2.val[0], 1); // s_movl3_16x8,1); + int16x8_t vx2 = vextq_s16(t1.val[1], t2.val[1], 1); + int32x4_t m1_l = vmull_n_s16(vget_low_s16(t1.val[0]), alpha[0]); + int32x4_t m1_h = vmull_n_s16(vget_high_s16(t1.val[0]), alpha[0]); + int32x4_t m2_l = vmlal_n_s16(m1_l, vget_low_s16(vx1), alpha[1]); + int32x4_t m2_h = vmlal_n_s16(m1_h, vget_high_s16(vx1), alpha[1]); + int32x4_t m3_l = vmlal_n_s16(m2_l, vget_low_s16(t1.val[1]), alpha[2]); + int32x4_t m3_h = vmlal_n_s16(m2_h, vget_high_s16(t1.val[1]), alpha[2]); + int32x4_t out_l = vmlal_n_s16(m3_l, vget_low_s16(vx2), alpha[3]); // final out + int32x4_t out_h = vmlal_n_s16(m3_h, 
vget_high_s16(vx2), alpha[3]); // final out + + vst1q_s32(D + dx, out_l); + dx += 4; + vst1q_s32(D + dx, out_h); + dx += 4; + src1_8x8 = src3_8x8; + } + +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx2 = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx2 = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + D[dx] = S[sx2 - 1] * alpha[0] + S[sx2] * alpha[1] + S[sx2 + 1] * alpha[2] + S[sx2 + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_neon(const int **src, unsigned char *dst, const short *beta, int width) +{ + int32x4_t src_1, src_2, src_3, src_4, src_1_mul; + int32x4_t d4_q; + int32x4_t add_1; + int32x4_t add_delta; + int32x4_t shift_right_32x4; + uint16x4_t shift_right_16x4; + uint16x8_t shift_right_16x8; + int32x4_t dt; + uint8x8_t dt2; + + +#define BITS 22 + int bits = BITS; + + // int32x4_t SHIFT = vdupq_n_s32(bits); + int DELTA = (1 << (bits - 1)); + // b1_vq = vdupq_n_s32(beta[0]); + // b2_vq = vdupq_n_s32(beta[1]); + // b3_vq = vdupq_n_s32(beta[2]); + // b4_vq = vdupq_n_s32(beta[3]); + d4_q = vdupq_n_s32(DELTA); + src_1_mul = vdupq_n_s32(0); + + int32x4_t lower = vdupq_n_s32(0); + int32x4_t higher = vdupq_n_s32(255); + + for (int x = 0; x < width; x += 4) + { + src_1 = vld1q_s32(src[0] + x); + src_2 = vld1q_s32(src[1] + x); + src_3 = vld1q_s32(src[2] + x); + src_4 = vld1q_s32(src[3] + x); + + add_1 = vmlaq_n_s32(src_1_mul, src_1, beta[0]); + add_1 = vmlaq_n_s32(add_1, src_2, beta[1]); + add_1 = vmlaq_n_s32(add_1, src_3, beta[2]); + add_1 = vmlaq_n_s32(add_1, src_4, beta[3]); + + add_delta = vaddq_s32(add_1, d4_q); + + shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 + + dt = vminq_s32(shift_right_32x4, higher); + dt = vmaxq_s32(dt, lower); + + // shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 + + shift_right_16x4 = vqmovun_s32(dt); // 16x4 + shift_right_16x8 = vcombine_u16(shift_right_16x4, shift_right_16x4); // 16x8 + dt2 = vqmovn_u16(shift_right_16x8); // 8x8 + + vst1_lane_u32((unsigned int *)(dst + x), vreinterpret_u32_u8(dt2), 0); + } + +#undef BITS +} + +static int clip_neon(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_neon(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_neon(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("malloc fails\n"); + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip_neon(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_neon((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } +#if USE_C_VRESIZE + vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#elif !USE_C_VRESIZE + vresize_neon((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#endif +#else + if (k0 < ksize) + { + hresize_neon((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } +#if USE_C_VRESIZE + vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#elif !USE_C_VRESIZE + vresize_neon((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif +#endif + + } + + free(_buffer); +} diff --git a/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h new file mode 100644 index 000000000..801a32c25 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h @@ -0,0 +1,30 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#if OPTIMISED_COEFF +void step_neon(const unsigned char *_src, unsigned char *_dst, + const short *_alpha, const short *_beta, + int iwidth, int iheight, int dwidth, int channels, + int ksize, int start, int end, int xmin, int xmax); +#else +void step_neon(const unsigned char *_src, unsigned char *_dst, + const int *xofs, const int *yofs, + const short *_alpha, const short *_beta, + int iwidth, int iheight, int dwidth, int dheight, int channels, + int ksize, int start, int end, int xmin, int xmax); +#endif diff --git a/libvmaf/src/feature/third_party/funque/hbd_resizer.c b/libvmaf/src/feature/third_party/funque/hbd_resizer.c new file mode 100644 index 000000000..209a876b2 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/hbd_resizer.c @@ -0,0 +1,314 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include "resizer.h" + +//const int HBD_INTER_RESIZE_COEF_SCALE = 2048; +//static const int HBD_MAX_ESIZE = 16; + +//#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ +// : X) +//#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +//#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
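+            // 4-tap horizontal cubic filter: D[dx] = alpha[0]*S[sx-1] + alpha[1]*S[sx] + alpha[2]*S[sx+1] + alpha[3]*S[sx+2]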
+#endif + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + for (int x = 0; x < width; x++) + dst[x] = hbd_castOp((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#else +void hbd_step(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. 
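+                // rows[k1] already holds the horizontally filtered output for source row sy, so it is copied instead of refiltered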
+ { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} + +void hbd_resize(ResizerState m, const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + m.hbd_resizer_step(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. / inv_scale_y; + int width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned short *_buffer = (unsigned short *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + } + m.hbd_resizer_step(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#endif + +} diff --git a/libvmaf/src/feature/third_party/funque/resizer.c b/libvmaf/src/feature/third_party/funque/resizer.c new file mode 100644 index 000000000..ef121cfa1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/resizer.c @@ -0,0 +1,307 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer.h" + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif + +{ + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned char castOp(int val) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, 255); +} + +void vresize(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + for (int x = 0; x < width; x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} + +void resize(ResizerState m, const unsigned char *_src, unsigned char *_dst, int iwidth, int iheight, int dwidth, int dheight) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + m.resizer_step(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax); +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. 
/ inv_scale_y; + + int width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned char *_buffer = (unsigned char *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + } + m.resizer_step(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax); +#endif +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/resizer.h b/libvmaf/src/feature/third_party/funque/resizer.h new file mode 100644 index 000000000..13df96ff1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/resizer.h @@ -0,0 +1,63 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#ifndef FEATURE_RESIZER_H_ +#define FEATURE_RESIZER_H_ + +// #include "integer_funque_filters.h" + +#define INTER_RESIZE_COEF_BITS 11 +#define INTER_RESIZE_COEF_SCALE 2048 +#define MAX_ESIZE 16 + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +#define OPTIMISED_COEFF 1 +#define USE_C_VRESIZE 1 + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? 
LEFT : RIGHT) +#define MAX7(A, B, C, D, E, F, G) MAX(MAX(MAX(MAX(MAX(MAX(A, B), C), D), E), F), G) +#define MAX6(A, B, C, D, E, F) MAX(MAX(MAX(MAX(MAX(A, B), C), D), E), F) +#define MAX5(A, B, C, D, E) MAX(MAX(MAX(MAX(A, B), C), D), E) +#define MAX4(A, B, C, D) MAX(MAX(MAX(A, B), C), D) + +typedef struct ResizerState +{ +#if OPTIMISED_COEFF + void (*resizer_step)(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); + void (*hbd_resizer_step)(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else + void (*resizer_step)(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); + void (*hbd_resizer_step)(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +}ResizerState; + +unsigned char castOp(int val); +void vresize(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +void resize(ResizerState m, const unsigned char* _src, unsigned char* _dst, int iwidth, int iheight, int dwidth, int dheight); +void hbd_resize(ResizerState m, const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth); + +#endif /* _FEATURE_RESIZER_H_ */ diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c new file mode 100644 index 000000000..6a109d459 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c @@ -0,0 +1,590 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include + +#include "resizer_avx2.h" + +#define shift22_64b_signExt(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 22) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 22) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFFFC0000000000)));\ +} + +const int HBD_INTER_RESIZE_COEF_SCALE_avx2 = 2048; +static const int HBD_MAX_ESIZE_avx2 = 16; + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + __m256i coef0_256 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m256i coef2_256 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + __m256i zero_256 = _mm256_setzero_si256(); + + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
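+            // AVX2 main loop, 16 outputs per iteration: (S[sx-1],S[sx]) weighted by alpha[0],alpha[1] and (S[sx+1],S[sx+2]) by alpha[2],alpha[3], then pair-summed with hadd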
+#endif + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val0_16 = _mm256_loadu_si256((__m256i*)(S + sx + 15)); + __m256i val2_16 = _mm256_loadu_si256((__m256i*)(S + sx + 17)); + + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); + __m256i val0_16_lo = _mm256_unpacklo_epi16(val0_16, zero_256); + __m256i val0_16_hi = _mm256_unpackhi_epi16(val0_16, zero_256); + __m256i val2_16_lo = _mm256_unpacklo_epi16(val2_16, zero_256); + __m256i val2_16_hi = _mm256_unpackhi_epi16(val2_16, zero_256); + + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); + __m256i mul0_16_lo = _mm256_mullo_epi32(val0_16_lo, coef0_256); + __m256i mul0_16_hi = _mm256_mullo_epi32(val0_16_hi, coef0_256); + __m256i mul2_16_lo = _mm256_mullo_epi32(val2_16_lo, coef2_256); + __m256i mul2_16_hi = _mm256_mullo_epi32(val2_16_hi, coef2_256); + + __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); + __m256i hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); + __m256i hadd0_16 = _mm256_hadd_epi32(mul0_16_lo, mul0_16_hi); + __m256i hadd2_16 = _mm256_hadd_epi32(mul2_16_lo, mul2_16_hi); + + __m256i res_0 = _mm256_add_epi32(hadd0_0, hadd2_0); + __m256i res_16 = _mm256_add_epi32(hadd0_16, hadd2_16); + + _mm256_storeu_si256((__m256i*)(D + dx), res_0); + _mm256_storeu_si256((__m256i*)(D + dx + 8), res_16); + + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); + + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); + + __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); + __m256i hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); + + __m256i res_0 = _mm256_add_epi32(hadd0_0, hadd2_0); + + _mm256_storeu_si256((__m256i*)(D + dx), res_0); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m256i val0_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx - 1))); + __m256i val2_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx + 1))); + __m256i mul0_0 = _mm256_mullo_epi32(val0_0, coef0_256); + __m256i mul2_0 = _mm256_mullo_epi32(val2_0, coef2_256); + + __m256i hadd0 = _mm256_hadd_epi32(mul0_0, mul0_0); + __m256i hadd2 = _mm256_hadd_epi32(mul2_0, mul2_0); + + __m256i res_0 = _mm256_add_epi32(hadd0, hadd2); + res_0 = _mm256_permutevar8x32_epi32(res_0, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 1, 0)); + + _mm_storeu_si128 ((__m128i*)(D + dx), _mm256_castsi256_si128(res_0)); + } + + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if 
!OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp_avx2(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip_avx2(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + int bits = 22; + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); + __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srli_si256(src0_8, 4), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srli_si256(src1_8, 4), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srli_si256(src2_8, 4), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srli_si256(src3_8, 4), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_01_8 = _mm256_add_epi64(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi64(mul2_8, mul3_8); + __m256i accum_01_12 = _mm256_add_epi64(mul0_12, mul1_12); + __m256i accum_23_12 = _mm256_add_epi64(mul2_12, mul3_12); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + 
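+        // sum the four weighted rows in 64-bit lanes; the kernel is symmetric, so S2/S3 reuse beta[1]/beta[0]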
__m256i accum_0123_8 = _mm256_add_epi64(accum_01_8, accum_23_8); + __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); + + shift22_64b_signExt(accum_0123_0, accum_0123_0); + shift22_64b_signExt(accum_0123_4, accum_0123_4); + shift22_64b_signExt(accum_0123_8, accum_0123_8); + shift22_64b_signExt(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi32(accum_0123_12, zero_256); + + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi32(accum_0123_12, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); + + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); + + _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + + shift22_64b_signExt(accum_0123_0, accum_0123_0); + shift22_64b_signExt(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); + + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); + + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + 
x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); + + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); + + __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); + __m128i accum_01_4 = _mm_add_epi64(mul0_4, mul1_4); + __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); + __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); + __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); + accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); + accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); + + shift22_64b_signExt_128(accum_0123_0, accum_0123_0); + shift22_64b_signExt_128(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi32(accum_0123_4, zero_128); + + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi32(accum_0123_4, max_char_128); + + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); + + _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); + } + for (; x < width; x++) + dst[x] = hbd_castOp_avx2((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#else +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[HBD_MAX_ESIZE_avx2]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip_avx2(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < HBD_MAX_ESIZE_avx2 && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. 
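+                // prev_sy[k1] tracks which source row the scratch buffer rows[k1] currently holds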
+ { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize_avx2((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx2((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize_avx2((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx2((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} +/* +void hbd_resize_avx2(const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + hbd_step_avx2(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax, bitdepth); + +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. / inv_scale_y; + width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned short *_buffer = (unsigned short *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * HBD_INTER_RESIZE_COEF_SCALE_avx2); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * HBD_INTER_RESIZE_COEF_SCALE_avx2); + } + hbd_step_avx2(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#endif + +} +*/ \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c new file mode 100644 index 000000000..2fe2073b1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c @@ -0,0 +1,581 @@ +/** + * + * Copyright (C) 2022 Intel 
Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include + +#include "resizer_avx512.h" + + +#define shift22_64b_signExt_512(a, r)\ +{ \ + r = _mm512_add_epi64( _mm512_srli_epi64(a, 22) , _mm512_and_si512(a, _mm512_set1_epi64(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_256(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 22) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 22) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFFFC0000000000)));\ +} + +const int HBD_INTER_RESIZE_COEF_SCALE_avx512 = 2048; +static const int HBD_MAX_ESIZE_avx512 = 16; + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + __m512i idx_extract_ab_512 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_extract_cd_512 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i coef0_512 = _mm512_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m512i coef2_512 = _mm512_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= 
swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val32 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 32)); + __m512i val34 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 32)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + __m512i val32_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val32)); + __m512i val32_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val32, 1)); + __m512i val34_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val34)); + __m512i val34_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val34, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i mul32_lo = _mm512_mullo_epi32(val32_lo, coef0_512); + __m512i mul32_hi = _mm512_mullo_epi32(val32_hi, coef0_512); + __m512i mul34_lo = _mm512_mullo_epi32(val34_lo, coef2_512); + __m512i mul34_hi = _mm512_mullo_epi32(val34_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + __m512i ac_bd_32_lo = _mm512_add_epi32(mul32_lo, mul34_lo); + __m512i ac_bd_32_hi = _mm512_add_epi32(mul32_hi, mul34_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + __m512i ac_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_ab_512, ac_bd_32_hi); + __m512i bd_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_cd_512, ac_bd_32_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + __m512i res_32 = _mm512_add_epi32(ac_32, bd_32); + + _mm512_storeu_si512((__m512i*)(D + dx), res_0); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_32); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + 
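+            // store 16 horizontally filtered 32-bit results for this iteration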
_mm512_storeu_si512((__m512i*)(D + dx), res_0); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m512i val0 = _mm512_cvtepu16_epi32(val0_0); + __m512i val2 = _mm512_cvtepu16_epi32(val2_0); + + __m512i mul0 = _mm512_mullo_epi32(val0, coef0_512); + __m512i mul2 = _mm512_mullo_epi32(val2, coef2_512); + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0, mul2); + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_lo); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_lo); + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + _mm256_storeu_si256((__m256i*)(D + dx), _mm512_castsi512_si256(res_0)); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp_avx512(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip_avx512(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize_avx512(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i delta_512 = _mm512_set1_epi64(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi64(((1 << bitdepth) - 1)); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i perm_512 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); + __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = _mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mul_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mul_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mul_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mul_epi32(src3_0, coef0_512); + + __m512i mul0_4 = _mm512_mul_epi32(_mm512_srai_epi64(src0_0, 32), 
coef0_512); + __m512i mul1_4 = _mm512_mul_epi32(_mm512_srai_epi64(src1_0, 32), coef1_512); + __m512i mul2_4 = _mm512_mul_epi32(_mm512_srai_epi64(src2_0, 32), coef1_512); + __m512i mul3_4 = _mm512_mul_epi32(_mm512_srai_epi64(src3_0, 32), coef0_512); + + __m512i mul0_8 = _mm512_mul_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mul_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mul_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mul_epi32(src3_16, coef0_512); + + __m512i mul0_12 = _mm512_mul_epi32(_mm512_srai_epi64(src0_16, 32), coef0_512); + __m512i mul1_12 = _mm512_mul_epi32(_mm512_srai_epi64(src1_16, 32), coef1_512); + __m512i mul2_12 = _mm512_mul_epi32(_mm512_srai_epi64(src2_16, 32), coef1_512); + __m512i mul3_12 = _mm512_mul_epi32(_mm512_srai_epi64(src3_16, 32), coef0_512); + + __m512i accum_01_0 = _mm512_add_epi64(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi64(mul2_0, mul3_0); + __m512i accum_01_4 = _mm512_add_epi64(mul0_4, mul1_4); + __m512i accum_23_4 = _mm512_add_epi64(mul2_4, mul3_4); + __m512i accum_01_8 = _mm512_add_epi64(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi64(mul2_8, mul3_8); + __m512i accum_01_12 = _mm512_add_epi64(mul0_12, mul1_12); + __m512i accum_23_12 = _mm512_add_epi64(mul2_12, mul3_12); + + __m512i accum_0123_0 = _mm512_add_epi64(accum_01_0, accum_23_0); + __m512i accum_0123_4 = _mm512_add_epi64(accum_01_4, accum_23_4); + __m512i accum_0123_8 = _mm512_add_epi64(accum_01_8, accum_23_8); + __m512i accum_0123_12 = _mm512_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm512_add_epi64(accum_0123_0, delta_512); + accum_0123_4 = _mm512_add_epi64(accum_0123_4, delta_512); + accum_0123_8 = _mm512_add_epi64(accum_0123_8, delta_512); + accum_0123_12 = _mm512_add_epi64(accum_0123_12, delta_512); + + shift22_64b_signExt_512(accum_0123_0, accum_0123_0); + shift22_64b_signExt_512(accum_0123_4, accum_0123_4); + shift22_64b_signExt_512(accum_0123_8, accum_0123_8); + shift22_64b_signExt_512(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm512_max_epi64(accum_0123_0, zero_512); + accum_0123_4 = _mm512_max_epi64(accum_0123_4, zero_512); + accum_0123_8 = _mm512_max_epi64(accum_0123_8, zero_512); + accum_0123_12 = _mm512_max_epi64(accum_0123_12, zero_512); + + accum_0123_0 = _mm512_min_epi64(accum_0123_0, max_char_512); + accum_0123_4 = _mm512_min_epi64(accum_0123_4, max_char_512); + accum_0123_8 = _mm512_min_epi64(accum_0123_8, max_char_512); + accum_0123_12 = _mm512_min_epi64(accum_0123_12, max_char_512); + + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm512_or_si512(accum_0123_8, _mm512_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm512_permutexvar_epi32(perm_512, accum_0123_0); + + _mm512_storeu_si512((__m512i*)(dst + x), accum_0123_0); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = 
_mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srai_epi64(src0_8, 32), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srai_epi64(src1_8, 32), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srai_epi64(src2_8, 32), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srai_epi64(src3_8, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_01_8 = _mm256_add_epi64(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi64(mul2_8, mul3_8); + __m256i accum_01_12 = _mm256_add_epi64(mul0_12, mul1_12); + __m256i accum_23_12 = _mm256_add_epi64(mul2_12, mul3_12); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + __m256i accum_0123_8 = _mm256_add_epi64(accum_01_8, accum_23_8); + __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + shift22_64b_signExt_256(accum_0123_8, accum_0123_8); + shift22_64b_signExt_256(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi64(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi64(accum_0123_12, zero_256); + + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi64(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi64(accum_0123_12, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); + + _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + 
__m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); + + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); + + __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); + __m128i accum_01_4 = _mm_add_epi64(mul0_4, mul1_4); + __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); + __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); + __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); + accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); + + shift22_64b_signExt_128(accum_0123_0, accum_0123_0); + shift22_64b_signExt_128(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi64(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi64(accum_0123_4, zero_128); + accum_0123_0 = _mm_min_epi64(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi64(accum_0123_4, max_char_128); + + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); + + _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); + } + for (; x < width; x++) + dst[x] = hbd_castOp_avx512((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, 
int xmin, int xmax, int bitdepth) +#else +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[HBD_MAX_ESIZE_avx512]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip_avx512(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < HBD_MAX_ESIZE_avx512 && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + // printf("%d ", dy); + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c new file mode 100644 index 000000000..f5564d9d5 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c @@ -0,0 +1,518 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer_avx2.h" +#include +#include + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize_avx2(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_avx2(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif + +{ + int xmax_64 = xmax - (xmax % 64); + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + __m256i coef0_256 = _mm256_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m256i coef2_256 = _mm256_set1_epi32(alpha[2] + (alpha[3] << 16)); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i coef0_128 = _mm_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m128i coef2_128 = _mm_set1_epi32(alpha[2] + (alpha[3] << 16)); + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_64; dx+=64) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
+#endif + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val32 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 32)); + __m256i val34 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 32)); + __m256i val64 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 64)); + __m256i val66 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 64)); + __m256i val96 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 96)); + __m256i val98 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 96)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i val32_lo = _mm256_unpacklo_epi8(val32, zero_256); + __m256i val32_hi = _mm256_unpackhi_epi8(val32, zero_256); + __m256i val34_lo = _mm256_unpacklo_epi8(val34, zero_256); + __m256i val34_hi = _mm256_unpackhi_epi8(val34, zero_256); + + __m256i val64_lo = _mm256_unpacklo_epi8(val64, zero_256); + __m256i val64_hi = _mm256_unpackhi_epi8(val64, zero_256); + __m256i val66_lo = _mm256_unpacklo_epi8(val66, zero_256); + __m256i val66_hi = _mm256_unpackhi_epi8(val66, zero_256); + + __m256i val96_lo = _mm256_unpacklo_epi8(val96, zero_256); + __m256i val96_hi = _mm256_unpackhi_epi8(val96, zero_256); + __m256i val98_lo = _mm256_unpacklo_epi8(val98, zero_256); + __m256i val98_hi = _mm256_unpackhi_epi8(val98, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + __m256i res32_lo = _mm256_madd_epi16(val32_lo, coef0_256); + __m256i res32_hi = _mm256_madd_epi16(val32_hi, coef0_256); + __m256i res34_lo = _mm256_madd_epi16(val34_lo, coef2_256); + __m256i res34_hi = _mm256_madd_epi16(val34_hi, coef2_256); + + __m256i res64_lo = _mm256_madd_epi16(val64_lo, coef0_256); + __m256i res64_hi = _mm256_madd_epi16(val64_hi, coef0_256); + __m256i res66_lo = _mm256_madd_epi16(val66_lo, coef2_256); + __m256i res66_hi = _mm256_madd_epi16(val66_hi, coef2_256); + __m256i res96_lo = _mm256_madd_epi16(val96_lo, coef0_256); + __m256i res96_hi = _mm256_madd_epi16(val96_hi, coef0_256); + __m256i res98_lo = _mm256_madd_epi16(val98_lo, coef2_256); + __m256i res98_hi = _mm256_madd_epi16(val98_hi, coef2_256); + + __m256i acc0_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i acc0_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i acc32_lo = _mm256_add_epi32(res32_lo, res34_lo); + __m256i acc32_hi = _mm256_add_epi32(res32_hi, res34_hi); + __m256i acc64_lo = _mm256_add_epi32(res64_lo, res66_lo); + __m256i acc64_hi = _mm256_add_epi32(res64_hi, res66_hi); + __m256i acc96_lo = _mm256_add_epi32(res96_lo, res98_lo); + __m256i acc96_hi = _mm256_add_epi32(res96_hi, res98_hi); + + __m256i tmp0 = acc0_lo; + __m256i tmp32 = acc32_lo; + __m256i tmp64 = acc64_lo; + __m256i tmp96 = acc96_lo; + + acc0_lo = _mm256_inserti128_si256(acc0_lo, _mm256_castsi256_si128(acc0_hi), 1); + acc0_hi = _mm256_inserti128_si256(acc0_hi, _mm256_extracti128_si256(tmp0, 1), 0); + acc32_lo = _mm256_inserti128_si256(acc32_lo, _mm256_castsi256_si128(acc32_hi), 1); + acc32_hi = _mm256_inserti128_si256(acc32_hi, _mm256_extracti128_si256(tmp32, 1), 0); + acc64_lo = _mm256_inserti128_si256(acc64_lo, _mm256_castsi256_si128(acc64_hi), 1); + acc64_hi = _mm256_inserti128_si256(acc64_hi, _mm256_extracti128_si256(tmp64, 1), 
0); + acc96_lo = _mm256_inserti128_si256(acc96_lo, _mm256_castsi256_si128(acc96_hi), 1); + acc96_hi = _mm256_inserti128_si256(acc96_hi, _mm256_extracti128_si256(tmp96, 1), 0); + + _mm256_storeu_si256((__m256i*)(D + dx), acc0_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), acc0_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 16), acc32_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 24), acc32_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 32), acc64_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 40), acc64_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 48), acc96_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 56), acc96_hi); + } + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; + + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val32 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 32)); + __m256i val34 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 32)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i val32_lo = _mm256_unpacklo_epi8(val32, zero_256); + __m256i val32_hi = _mm256_unpackhi_epi8(val32, zero_256); + __m256i val34_lo = _mm256_unpacklo_epi8(val34, zero_256); + __m256i val34_hi = _mm256_unpackhi_epi8(val34, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + __m256i res32_lo = _mm256_madd_epi16(val32_lo, coef0_256); + __m256i res32_hi = _mm256_madd_epi16(val32_hi, coef0_256); + __m256i res34_lo = _mm256_madd_epi16(val34_lo, coef2_256); + __m256i res34_hi = _mm256_madd_epi16(val34_hi, coef2_256); + + __m256i acc0_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i acc0_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i acc32_lo = _mm256_add_epi32(res32_lo, res34_lo); + __m256i acc32_hi = _mm256_add_epi32(res32_hi, res34_hi); + __m256i tmp0 = acc0_lo; + __m256i tmp32 = acc32_lo; + + acc0_lo = _mm256_inserti128_si256(acc0_lo, _mm256_castsi256_si128(acc0_hi), 1); + acc0_hi = _mm256_inserti128_si256(acc0_hi, _mm256_extracti128_si256(tmp0, 1), 0); + acc32_lo = _mm256_inserti128_si256(acc32_lo, _mm256_castsi256_si128(acc32_hi), 1); + acc32_hi = _mm256_inserti128_si256(acc32_hi, _mm256_extracti128_si256(tmp32, 1), 0); + + _mm256_storeu_si256((__m256i*)(D + dx), acc0_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), acc0_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 16), acc32_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 24), acc32_hi); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + + __m256i res_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i res_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i tmp = res_lo; + + 
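+            // The unpacklo/unpackhi + madd results are ordered per 128-bit lane; the inserti128/extracti128 pairs below reshuffle them so the two stores write D[dx..] contiguously.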
res_lo = _mm256_inserti128_si256(res_lo, _mm256_castsi256_si128(res_hi), 1); + res_hi = _mm256_inserti128_si256(res_hi, _mm256_extracti128_si256(tmp, 1), 0); + _mm256_storeu_si256((__m256i*)(D + dx), res_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), res_hi); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m256i val0_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(S + sx - 1))); + __m256i val2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(S + sx + 1))); + + __m256i res0 = _mm256_madd_epi16(val0_16, coef0_256); + __m256i res2 = _mm256_madd_epi16(val2_16, coef2_256); + + __m256i res = _mm256_add_epi32(res0, res2); + _mm256_storeu_si256((__m256i*)(D + dx), res); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m128i val0_16 = _mm_cvtepu8_epi16(val0); + __m128i val2_16 = _mm_cvtepu8_epi16(val2); + + __m128i res0 = _mm_madd_epi16(val0_16, coef0_128); + __m128i res2 = _mm_madd_epi16(val2_16, coef2_128); + + __m128i res = _mm_add_epi32(res0, res2); + _mm_storeu_si128((__m128i*)(D + dx), res); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_avx2(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m256i perm0_256 = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 4, 0); + __m256i perm8_256 = _mm256_set_epi32(1, 1, 1, 1, 4, 0, 1, 1); + __m256i sh_32_to_8_256 = _mm256_set_epi64x(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i zero_256 = _mm256_setzero_si256(); + __m256i delta_256 = _mm256_set1_epi32(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi32(255); + + __m128i sh_32_to_8_128 = _mm_set_epi64x(0x8080808080808080, 0x808080800C080400); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + __m128i delta_128 = _mm_set1_epi32(1 << (bits - 1)); + __m128i max_char_128 = _mm_set1_epi32(255); + + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i mul0_8 = _mm256_mullo_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mullo_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mullo_epi32(src2_8, coef1_256); + __m256i mul3_8 = 
_mm256_mullo_epi32(src3_8, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_01_8 = _mm256_add_epi32(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi32(mul2_8, mul3_8); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + __m256i accum_0123_8 = _mm256_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_8 = _mm256_add_epi32(accum_0123_8, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm256_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_8 = _mm256_shuffle_epi8(accum_0123_8, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + accum_0123_8 = _mm256_permutevar8x32_epi32(accum_0123_8, perm8_256); + + __m128i accum = _mm256_extracti128_si256(_mm256_or_si256(accum_0123_0, accum_0123_8), 0); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + + __m128i accum = _mm256_castsi256_si128(accum_0123_0); + _mm_storel_epi64((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mullo_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mullo_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mullo_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mullo_epi32(src3_0, coef0_128); + + __m128i accum_01_0 = _mm_add_epi32(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi32(mul2_0, mul3_0); + __m128i accum_0123_0 = _mm_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm_add_epi32(accum_0123_0, delta_128); + accum_0123_0 = _mm_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + + accum_0123_0 = _mm_shuffle_epi8(accum_0123_0, sh_32_to_8_128); + _mm_maskstore_epi32((int*)(dst + x), _mm_set_epi32(0, 0, 0, 0x80000000), accum_0123_0); + } + + for (; x < width; 
x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_avx2(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_avx2(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_avx2((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx2((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize_avx2((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx2((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h new file mode 100644 index 000000000..45676e7cf --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h @@ -0,0 +1,31 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +// #include "integer_funque_filters.h" +#include "../resizer.h" + +void vresize_avx2(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step_avx2(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step_avx2(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +//void hbd_resize(const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth); \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c new file mode 100644 index 000000000..782c91ba6 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c @@ -0,0 +1,519 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer_avx512.h" +#include +#include + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize_avx512(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_avx512(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + int xmax_64 = xmax - (xmax % 64); + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + + __m512i coef0_512 = _mm512_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m512i coef2_512 = _mm512_set1_epi32(alpha[2] + (alpha[3] << 16)); + __m512i permlo_512 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + __m512i permhi_512 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i coef0_256 = _mm256_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m256i coef2_256 = _mm256_set1_epi32(alpha[2] + (alpha[3] << 16)); + + __m128i coef0_128 = _mm_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m128i coef2_128 = _mm_set1_epi32(alpha[2] + (alpha[3] << 16)); + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_64; dx+=64) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
+#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val64 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 64)); + __m512i val66 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 64)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i val64_lo = _mm512_unpacklo_epi8(val64, zero_512); + __m512i val64_hi = _mm512_unpackhi_epi8(val64, zero_512); + __m512i val66_lo = _mm512_unpacklo_epi8(val66, zero_512); + __m512i val66_hi = _mm512_unpackhi_epi8(val66, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res64_lo = _mm512_madd_epi16(val64_lo, coef0_512); + __m512i res64_hi = _mm512_madd_epi16(val64_hi, coef0_512); + __m512i res66_lo = _mm512_madd_epi16(val66_lo, coef2_512); + __m512i res66_hi = _mm512_madd_epi16(val66_hi, coef2_512); + + __m512i r0_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i r0_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i r1_lo = _mm512_add_epi32(res64_lo, res66_lo); + __m512i r1_hi = _mm512_add_epi32(res64_hi, res66_hi); + __m512i tmp0 = r0_lo; + __m512i tmp1 = r1_lo; + + r0_lo = _mm512_permutex2var_epi64(r0_lo, permlo_512, r0_hi); + r0_hi = _mm512_permutex2var_epi64(tmp0, permhi_512, r0_hi); + r1_lo = _mm512_permutex2var_epi64(r1_lo, permlo_512, r1_hi); + r1_hi = _mm512_permutex2var_epi64(tmp1, permhi_512, r1_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), r0_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), r0_hi); + _mm512_storeu_si512((__m512i*)(D + dx + 32), r1_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 48), r1_hi); + } + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i tmp = res_lo; + + res_lo = _mm512_permutex2var_epi64(res_lo, permlo_512, res_hi); + res_hi = _mm512_permutex2var_epi64(tmp, permhi_512, res_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx - 1))); + __m512i val2 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx + 1))); + + __m512i res0_lo = _mm512_madd_epi16(val0, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); 
+ + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m256i val0_16 = _mm256_cvtepu8_epi16(val0); + __m256i val2_16 = _mm256_cvtepu8_epi16(val2); + + __m256i res0 = _mm256_madd_epi16(val0_16, coef0_256); + __m256i res2 = _mm256_madd_epi16(val2_16, coef2_256); + + __m256i res = _mm256_add_epi32(res0, res2); + _mm256_storeu_si256((__m256i*)(D + dx), res); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m128i val0_16 = _mm_cvtepu8_epi16(val0); + __m128i val2_16 = _mm_cvtepu8_epi16(val2); + + __m128i res0 = _mm_madd_epi16(val0_16, coef0_128); + __m128i res2 = _mm_madd_epi16(val2_16, coef2_128); + + __m128i res = _mm_add_epi32(res0, res2); + _mm_storeu_si128((__m128i*)(D + dx), res); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i sh_32_to_8_512 = _mm512_set_epi64(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m512i perm0_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0); + __m512i perm8_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0, 1, 1, 1, 1); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i delta_512 = _mm512_set1_epi32(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi32(255); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i sh_32_to_8_256 = _mm256_set_epi64x(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m256i perm0_256 = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 4, 0); + __m256i perm8_256 = _mm256_set_epi32(1, 1, 1, 1, 4, 0, 1, 1); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i delta_256 = _mm256_set1_epi32(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi32(255); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i sh_32_to_8_128 = _mm_set_epi64x(0x8080808080808080, 0x808080800C080400); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i delta_128 = _mm_set1_epi32(1 << (bits - 1)); + __m128i max_char_128 = _mm_set1_epi32(255); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = 
_mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mullo_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mullo_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mullo_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mullo_epi32(src3_0, coef0_512); + + __m512i mul0_8 = _mm512_mullo_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mullo_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mullo_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mullo_epi32(src3_16, coef0_512); + + __m512i accum_01_0 = _mm512_add_epi32(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi32(mul2_0, mul3_0); + __m512i accum_01_8 = _mm512_add_epi32(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi32(mul2_8, mul3_8); + __m512i accum_0123_0 = _mm512_add_epi32(accum_01_0, accum_23_0); + __m512i accum_0123_8 = _mm512_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm512_add_epi32(accum_0123_0, delta_512); + accum_0123_8 = _mm512_add_epi32(accum_0123_8, delta_512); + accum_0123_0 = _mm512_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm512_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm512_max_epi32(accum_0123_0, zero_512); + accum_0123_8 = _mm512_max_epi32(accum_0123_8, zero_512); + accum_0123_0 = _mm512_min_epi32(accum_0123_0, max_char_512); + accum_0123_8 = _mm512_min_epi32(accum_0123_8,max_char_512); + + accum_0123_0 = _mm512_shuffle_epi8(accum_0123_0, sh_32_to_8_512); + accum_0123_8 = _mm512_shuffle_epi8(accum_0123_8, sh_32_to_8_512); + + accum_0123_0 = _mm512_permutexvar_epi32(perm0_512, accum_0123_0); + accum_0123_8 = _mm512_permutexvar_epi32(perm8_512, accum_0123_8); + __m256i accum = _mm512_extracti32x8_epi32(_mm512_or_si512(accum_0123_0, accum_0123_8), 0); + _mm256_storeu_si256((__m256i*)(dst + x), accum); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i mul0_8 = _mm256_mullo_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mullo_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mullo_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mullo_epi32(src3_8, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_01_8 = _mm256_add_epi32(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi32(mul2_8, mul3_8); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + __m256i accum_0123_8 = _mm256_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_8 = _mm256_add_epi32(accum_0123_8, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm256_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, 
zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_8 = _mm256_shuffle_epi8(accum_0123_8, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + accum_0123_8 = _mm256_permutevar8x32_epi32(accum_0123_8, perm8_256); + + __m128i accum = _mm256_extracti128_si256(_mm256_or_si256(accum_0123_0, accum_0123_8), 0); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + + __m128i accum = _mm256_castsi256_si128(accum_0123_0); + _mm_storel_epi64((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mullo_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mullo_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mullo_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mullo_epi32(src3_0, coef0_128); + + __m128i accum_01_0 = _mm_add_epi32(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi32(mul2_0, mul3_0); + __m128i accum_0123_0 = _mm_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm_add_epi32(accum_0123_0, delta_128); + accum_0123_0 = _mm_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + + accum_0123_0 = _mm_shuffle_epi8(accum_0123_0, sh_32_to_8_128); + _mm_maskstore_epi32((int*)(dst + x), _mm_set_epi32(0, 0, 0, 0x80000000), accum_0123_0); + } + + for (; x < width; x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h new file mode 100644 index 000000000..a3adc148a --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h @@ -0,0 +1,29 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "../resizer.h" + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif \ No newline at end of file
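
Reviewer note (not part of the patch): every hresize_*/vresize_* kernel above implements the same fixed-coefficient 2:1 bicubic downscale, and the sketch below is a minimal scalar reference for what the OPTIMISED_COEFF path computes. The tap values (-192, 1216, 1216, -192) are an assumption derived from interpolateCubic(0.5) with A = -0.75 scaled by the 2048 coefficient scale, and the helper names (kTaps, ref_cast, ref_hresize_row, ref_vresize_row) are illustrative only; the real alpha/beta arrays come from the caller in resizer.c.

#include <stdint.h>

/* Assumed 4-tap filter in Q11 (sums to 2048); see the hedging note above. */
static const short kTaps[4] = { -192, 1216, 1216, -192 };

/* Mirrors castOp(): add the 2^21 rounding term, shift right by 22, clamp to 8 bits. */
static unsigned char ref_cast(int64_t v)
{
    const int bits = 22;
    int64_t r = (v + (1 << (bits - 1))) >> bits;
    return (unsigned char)(r < 0 ? 0 : (r > 255 ? 255 : r));
}

/* Horizontal pass for interior pixels of one row (xmin/xmax border handling omitted). */
static void ref_hresize_row(const unsigned char *S, int *D, int dwidth)
{
    for (int dx = 1; dx + 1 < dwidth; dx++) {
        int sx = dx * 2;                        /* the OPTIMISED_COEFF source mapping */
        D[dx] = S[sx - 1] * kTaps[0] + S[sx] * kTaps[1]
              + S[sx + 1] * kTaps[2] + S[sx + 2] * kTaps[3];
    }
}

/* Vertical pass: combine four horizontally filtered rows into one output row. */
static void ref_vresize_row(const int *const rows[4], unsigned char *dst, int width)
{
    for (int x = 0; x < width; x++)
        dst[x] = ref_cast((int64_t)rows[0][x] * kTaps[0] + (int64_t)rows[1][x] * kTaps[1]
                        + (int64_t)rows[2][x] * kTaps[2] + (int64_t)rows[3][x] * kTaps[3]);
}

The high-bit-depth (hbd_*) variants follow the same structure, differing only in the unsigned short input/output type and in clamping to (1 << bitdepth) - 1 instead of 255.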