diff --git a/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c new file mode 100644 index 000000000..8687b9620 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c @@ -0,0 +1,295 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include +#include +#include "../resizer.h" +#include "resizer_neon.h" + +#if OPTIMISED_COEFF +void hresize_neon(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_neon(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + // int first_col_count = 0; + uint8x8_t src1_8x8, src2_8x8, src3_8x8; + int simd_loop = (xmax / 8) * 8; + int num_pix = 8; + +#if OPTIMISED_COEFF + int sx_start = 2; +#else + int sx_start = xofs[1]; +#endif + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; + + int start = sx_start - cn; + src1_8x8 = vld1_u8(S + start); +#if OPTIMISED_COEFF + for (; dx < simd_loop;) + { +#else + for (; dx < simd_loop; alpha += 32) + { +#endif + start += num_pix; + src2_8x8 = vld1_u8(S + start); + start += num_pix; + src3_8x8 = vld1_u8(S + start); + + uint16x8_t movl1_16x8 = vmovl_u8(src1_8x8); + uint16x8_t movl2_16x8 = vmovl_u8(src2_8x8); + uint16x8_t movl3_16x8 = vmovl_u8(src3_8x8); + int16x8_t s_movl1_16x8 = vreinterpretq_s16_u16(movl1_16x8); + int16x8_t s_movl2_16x8 = vreinterpretq_s16_u16(movl2_16x8); + int16x8_t s_movl3_16x8 = vreinterpretq_s16_u16(movl3_16x8); + int16x8x2_t t1 = vuzpq_s16(s_movl1_16x8, s_movl2_16x8); // 0 odd, 1 even + int16x8x2_t t2 = vuzpq_s16(s_movl3_16x8, s_movl3_16x8); + int16x8_t vx1 = vextq_s16(t1.val[0], t2.val[0], 1); // s_movl3_16x8,1); + int16x8_t vx2 = vextq_s16(t1.val[1], t2.val[1], 1); + int32x4_t m1_l = vmull_n_s16(vget_low_s16(t1.val[0]), alpha[0]); + int32x4_t m1_h = vmull_n_s16(vget_high_s16(t1.val[0]), alpha[0]); + int32x4_t m2_l = vmlal_n_s16(m1_l, vget_low_s16(vx1), alpha[1]); + int32x4_t m2_h = vmlal_n_s16(m1_h, vget_high_s16(vx1), alpha[1]); + int32x4_t m3_l = vmlal_n_s16(m2_l, vget_low_s16(t1.val[1]), alpha[2]); + int32x4_t m3_h = vmlal_n_s16(m2_h, vget_high_s16(t1.val[1]), alpha[2]); + int32x4_t out_l = vmlal_n_s16(m3_l, vget_low_s16(vx2), alpha[3]); // final out + int32x4_t out_h = vmlal_n_s16(m3_h, 
vget_high_s16(vx2), alpha[3]); // final out + + vst1q_s32(D + dx, out_l); + dx += 4; + vst1q_s32(D + dx, out_h); + dx += 4; + src1_8x8 = src3_8x8; + } + +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx2 = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx2 = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + D[dx] = S[sx2 - 1] * alpha[0] + S[sx2] * alpha[1] + S[sx2 + 1] * alpha[2] + S[sx2 + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_neon(const int **src, unsigned char *dst, const short *beta, int width) +{ + int32x4_t src_1, src_2, src_3, src_4, src_1_mul; + int32x4_t d4_q; + int32x4_t add_1; + int32x4_t add_delta; + int32x4_t shift_right_32x4; + uint16x4_t shift_right_16x4; + uint16x8_t shift_right_16x8; + int32x4_t dt; + uint8x8_t dt2; + + +#define BITS 22 + int bits = BITS; + + // int32x4_t SHIFT = vdupq_n_s32(bits); + int DELTA = (1 << (bits - 1)); + // b1_vq = vdupq_n_s32(beta[0]); + // b2_vq = vdupq_n_s32(beta[1]); + // b3_vq = vdupq_n_s32(beta[2]); + // b4_vq = vdupq_n_s32(beta[3]); + d4_q = vdupq_n_s32(DELTA); + src_1_mul = vdupq_n_s32(0); + + int32x4_t lower = vdupq_n_s32(0); + int32x4_t higher = vdupq_n_s32(255); + + for (int x = 0; x < width; x += 4) + { + src_1 = vld1q_s32(src[0] + x); + src_2 = vld1q_s32(src[1] + x); + src_3 = vld1q_s32(src[2] + x); + src_4 = vld1q_s32(src[3] + x); + + add_1 = vmlaq_n_s32(src_1_mul, src_1, beta[0]); + add_1 = vmlaq_n_s32(add_1, src_2, beta[1]); + add_1 = vmlaq_n_s32(add_1, src_3, beta[2]); + add_1 = vmlaq_n_s32(add_1, src_4, beta[3]); + + add_delta = vaddq_s32(add_1, d4_q); + + shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 + + dt = vminq_s32(shift_right_32x4, higher); + dt = vmaxq_s32(dt, lower); + + // shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 + + shift_right_16x4 = vqmovun_s32(dt); // 16x4 + shift_right_16x8 = vcombine_u16(shift_right_16x4, shift_right_16x4); // 16x8 + dt2 = vqmovn_u16(shift_right_16x8); // 8x8 + + vst1_lane_u32((unsigned int *)(dst + x), vreinterpret_u32_u8(dt2), 0); + } + +#undef BITS +} + +static int clip_neon(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_neon(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_neon(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("malloc fails\n"); + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip_neon(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_neon((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } +#if USE_C_VRESIZE + vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#elif !USE_C_VRESIZE + vresize_neon((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#endif +#else + if (k0 < ksize) + { + hresize_neon((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } +#if USE_C_VRESIZE + vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#elif !USE_C_VRESIZE + vresize_neon((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif +#endif + + } + + free(_buffer); +} diff --git a/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h new file mode 100644 index 000000000..801a32c25 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h @@ -0,0 +1,30 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#if OPTIMISED_COEFF +void step_neon(const unsigned char *_src, unsigned char *_dst, + const short *_alpha, const short *_beta, + int iwidth, int iheight, int dwidth, int channels, + int ksize, int start, int end, int xmin, int xmax); +#else +void step_neon(const unsigned char *_src, unsigned char *_dst, + const int *xofs, const int *yofs, + const short *_alpha, const short *_beta, + int iwidth, int iheight, int dwidth, int dheight, int channels, + int ksize, int start, int end, int xmin, int xmax); +#endif diff --git a/libvmaf/src/feature/third_party/funque/hbd_resizer.c b/libvmaf/src/feature/third_party/funque/hbd_resizer.c new file mode 100644 index 000000000..209a876b2 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/hbd_resizer.c @@ -0,0 +1,314 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include "resizer.h" + +//const int HBD_INTER_RESIZE_COEF_SCALE = 2048; +//static const int HBD_MAX_ESIZE = 16; + +//#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ +// : X) +//#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +//#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
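+            // 4-tap horizontal cubic filter: D[dx] = alpha[0]*S[sx-1] + alpha[1]*S[sx] + alpha[2]*S[sx+1] + alpha[3]*S[sx+2]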
+#endif + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + for (int x = 0; x < width; x++) + dst[x] = hbd_castOp((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#else +void hbd_step(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. 
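+                // rows[k1] already holds the horizontally filtered output for source row sy, so it is copied instead of refiltered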
+ { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} + +void hbd_resize(ResizerState m, const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + m.hbd_resizer_step(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. / inv_scale_y; + int width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned short *_buffer = (unsigned short *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + } + m.hbd_resizer_step(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#endif + +} diff --git a/libvmaf/src/feature/third_party/funque/resizer.c b/libvmaf/src/feature/third_party/funque/resizer.c new file mode 100644 index 000000000..ef121cfa1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/resizer.c @@ -0,0 +1,307 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer.h" + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif + +{ + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax; dx++) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned char castOp(int val) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, 255); +} + +void vresize(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + for (int x = 0; x < width; x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} + +void resize(ResizerState m, const unsigned char *_src, unsigned char *_dst, int iwidth, int iheight, int dwidth, int dheight) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + m.resizer_step(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax); +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. 
/ inv_scale_y; + + int width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned char *_buffer = (unsigned char *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * INTER_RESIZE_COEF_SCALE); + } + m.resizer_step(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax); +#endif +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/resizer.h b/libvmaf/src/feature/third_party/funque/resizer.h new file mode 100644 index 000000000..13df96ff1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/resizer.h @@ -0,0 +1,63 @@ +/** + * + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#ifndef FEATURE_RESIZER_H_ +#define FEATURE_RESIZER_H_ + +// #include "integer_funque_filters.h" + +#define INTER_RESIZE_COEF_BITS 11 +#define INTER_RESIZE_COEF_SCALE 2048 +#define MAX_ESIZE 16 + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +#define OPTIMISED_COEFF 1 +#define USE_C_VRESIZE 1 + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? 
LEFT : RIGHT) +#define MAX7(A, B, C, D, E, F, G) MAX(MAX(MAX(MAX(MAX(MAX(A, B), C), D), E), F), G) +#define MAX6(A, B, C, D, E, F) MAX(MAX(MAX(MAX(MAX(A, B), C), D), E), F) +#define MAX5(A, B, C, D, E) MAX(MAX(MAX(MAX(A, B), C), D), E) +#define MAX4(A, B, C, D) MAX(MAX(MAX(A, B), C), D) + +typedef struct ResizerState +{ +#if OPTIMISED_COEFF + void (*resizer_step)(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); + void (*hbd_resizer_step)(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else + void (*resizer_step)(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); + void (*hbd_resizer_step)(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +}ResizerState; + +unsigned char castOp(int val); +void vresize(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +void resize(ResizerState m, const unsigned char* _src, unsigned char* _dst, int iwidth, int iheight, int dwidth, int dheight); +void hbd_resize(ResizerState m, const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth); + +#endif /* _FEATURE_RESIZER_H_ */ diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c new file mode 100644 index 000000000..6a109d459 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c @@ -0,0 +1,590 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include + +#include "resizer_avx2.h" + +#define shift22_64b_signExt(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 22) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 22) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFFFC0000000000)));\ +} + +const int HBD_INTER_RESIZE_COEF_SCALE_avx2 = 2048; +static const int HBD_MAX_ESIZE_avx2 = 16; + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + __m256i coef0_256 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m256i coef2_256 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + __m256i zero_256 = _mm256_setzero_si256(); + + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
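+            // AVX2 main loop, 16 outputs per iteration: (S[sx-1],S[sx]) weighted by alpha[0],alpha[1] and (S[sx+1],S[sx+2]) by alpha[2],alpha[3], then pair-summed with hadd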
+#endif + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val0_16 = _mm256_loadu_si256((__m256i*)(S + sx + 15)); + __m256i val2_16 = _mm256_loadu_si256((__m256i*)(S + sx + 17)); + + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); + __m256i val0_16_lo = _mm256_unpacklo_epi16(val0_16, zero_256); + __m256i val0_16_hi = _mm256_unpackhi_epi16(val0_16, zero_256); + __m256i val2_16_lo = _mm256_unpacklo_epi16(val2_16, zero_256); + __m256i val2_16_hi = _mm256_unpackhi_epi16(val2_16, zero_256); + + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); + __m256i mul0_16_lo = _mm256_mullo_epi32(val0_16_lo, coef0_256); + __m256i mul0_16_hi = _mm256_mullo_epi32(val0_16_hi, coef0_256); + __m256i mul2_16_lo = _mm256_mullo_epi32(val2_16_lo, coef2_256); + __m256i mul2_16_hi = _mm256_mullo_epi32(val2_16_hi, coef2_256); + + __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); + __m256i hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); + __m256i hadd0_16 = _mm256_hadd_epi32(mul0_16_lo, mul0_16_hi); + __m256i hadd2_16 = _mm256_hadd_epi32(mul2_16_lo, mul2_16_hi); + + __m256i res_0 = _mm256_add_epi32(hadd0_0, hadd2_0); + __m256i res_16 = _mm256_add_epi32(hadd0_16, hadd2_16); + + _mm256_storeu_si256((__m256i*)(D + dx), res_0); + _mm256_storeu_si256((__m256i*)(D + dx + 8), res_16); + + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); + + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); + + __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); + __m256i hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); + + __m256i res_0 = _mm256_add_epi32(hadd0_0, hadd2_0); + + _mm256_storeu_si256((__m256i*)(D + dx), res_0); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m256i val0_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx - 1))); + __m256i val2_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx + 1))); + __m256i mul0_0 = _mm256_mullo_epi32(val0_0, coef0_256); + __m256i mul2_0 = _mm256_mullo_epi32(val2_0, coef2_256); + + __m256i hadd0 = _mm256_hadd_epi32(mul0_0, mul0_0); + __m256i hadd2 = _mm256_hadd_epi32(mul2_0, mul2_0); + + __m256i res_0 = _mm256_add_epi32(hadd0, hadd2); + res_0 = _mm256_permutevar8x32_epi32(res_0, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 1, 0)); + + _mm_storeu_si128 ((__m128i*)(D + dx), _mm256_castsi256_si128(res_0)); + } + + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if 
!OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp_avx2(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip_avx2(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + + int bits = 22; + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); + __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srli_si256(src0_8, 4), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srli_si256(src1_8, 4), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srli_si256(src2_8, 4), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srli_si256(src3_8, 4), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_01_8 = _mm256_add_epi64(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi64(mul2_8, mul3_8); + __m256i accum_01_12 = _mm256_add_epi64(mul0_12, mul1_12); + __m256i accum_23_12 = _mm256_add_epi64(mul2_12, mul3_12); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + 
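+        // sum the four weighted rows in 64-bit lanes; the kernel is symmetric, so S2/S3 reuse beta[1]/beta[0]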
__m256i accum_0123_8 = _mm256_add_epi64(accum_01_8, accum_23_8); + __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); + + shift22_64b_signExt(accum_0123_0, accum_0123_0); + shift22_64b_signExt(accum_0123_4, accum_0123_4); + shift22_64b_signExt(accum_0123_8, accum_0123_8); + shift22_64b_signExt(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi32(accum_0123_12, zero_256); + + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi32(accum_0123_12, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); + + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); + + _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + + shift22_64b_signExt(accum_0123_0, accum_0123_0); + shift22_64b_signExt(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); + + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); + + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + 
x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); + + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); + + __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); + __m128i accum_01_4 = _mm_add_epi64(mul0_4, mul1_4); + __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); + __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); + __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); + accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); + accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); + + shift22_64b_signExt_128(accum_0123_0, accum_0123_0); + shift22_64b_signExt_128(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi32(accum_0123_4, zero_128); + + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi32(accum_0123_4, max_char_128); + + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); + + _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); + } + for (; x < width; x++) + dst[x] = hbd_castOp_avx2((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#else +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[HBD_MAX_ESIZE_avx2]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip_avx2(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < HBD_MAX_ESIZE_avx2 && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. 
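+                // prev_sy[k1] tracks which source row the scratch buffer rows[k1] currently holds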
+ { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize_avx2((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx2((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize_avx2((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx2((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} +/* +void hbd_resize_avx2(const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth) +{ + // int depth = 0; + int cn = 1; + double inv_scale_x = (double)dwidth / iwidth; + + int ksize = 4, ksize2; + ksize2 = ksize / 2; + + int xmin = 0, xmax = dwidth; + +#if OPTIMISED_COEFF + const short ibeta[] = {-192, 1216, 1216, -192}; + const short ialpha[] = {-192, 1216, 1216, -192}; + double scale_x = 1. / inv_scale_x; + float fx; + int sx; + + for (int dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + } + hbd_step_avx2(_src, _dst, ialpha, ibeta, iwidth, iheight, dwidth, cn, ksize, 0, dheight, xmin, xmax, bitdepth); + +#else + double inv_scale_y = (double)dheight / iheight; + double scale_x = 1. / inv_scale_x, scale_y = 1. / inv_scale_y; + width = dwidth * cn; + + int iscale_x = (int)scale_x; + int iscale_y = (int)scale_y; + + int k, sx, sy, dx, dy; + + float fx, fy; + + unsigned short *_buffer = (unsigned short *)malloc((width + dheight) * (sizeof(int) + sizeof(float) * ksize)); + + int *xofs = (int *)_buffer; + int *yofs = xofs + width; + float *alpha = (float *)(yofs + dheight); + short *ialpha = (short *)alpha; + float *beta = alpha + width * ksize; + short *ibeta = ialpha + width * ksize; + float cbuf[4] = {0}; + + for (dx = 0; dx < dwidth; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = (int)floor(fx); + fx -= sx; + + if (sx < ksize2 - 1) + { + xmin = dx + 1; + } + + if (sx + ksize2 >= iwidth) + { + xmax = MIN(xmax, dx); + } + + for (k = 0, sx *= cn; k < cn; k++) + xofs[dx * cn + k] = sx + k; + + interpolateCubic(fx, cbuf); + for (k = 0; k < ksize; k++) + ialpha[dx * cn * ksize + k] = (short)(cbuf[k] * HBD_INTER_RESIZE_COEF_SCALE_avx2); + for (; k < cn * ksize; k++) + ialpha[dx * cn * ksize + k] = ialpha[dx * cn * ksize + k - ksize]; + } + + for (dy = 0; dy < dheight; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = (int)floor(fy); + fy -= sy; + + yofs[dy] = sy; + + interpolateCubic(fy, cbuf); + for (k = 0; k < ksize; k++) + ibeta[dy * ksize + k] = (short)(cbuf[k] * HBD_INTER_RESIZE_COEF_SCALE_avx2); + } + hbd_step_avx2(_src, _dst, xofs, yofs, ialpha, ibeta, iwidth, iheight, dwidth, dheight, cn, ksize, 0, dheight, xmin, xmax, bitdepth); +#endif + +} +*/ \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c new file mode 100644 index 000000000..2fe2073b1 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c @@ -0,0 +1,581 @@ +/** + * + * Copyright (C) 2022 Intel 
Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include + +#include "resizer_avx512.h" + + +#define shift22_64b_signExt_512(a, r)\ +{ \ + r = _mm512_add_epi64( _mm512_srli_epi64(a, 22) , _mm512_and_si512(a, _mm512_set1_epi64(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_256(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 22) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 22) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFFFC0000000000)));\ +} + +const int HBD_INTER_RESIZE_COEF_SCALE_avx512 = 2048; +static const int HBD_MAX_ESIZE_avx512 = 16; + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + __m512i idx_extract_ab_512 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_extract_cd_512 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i coef0_512 = _mm512_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m512i coef2_512 = _mm512_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= 
swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val32 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 32)); + __m512i val34 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 32)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + __m512i val32_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val32)); + __m512i val32_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val32, 1)); + __m512i val34_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val34)); + __m512i val34_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val34, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i mul32_lo = _mm512_mullo_epi32(val32_lo, coef0_512); + __m512i mul32_hi = _mm512_mullo_epi32(val32_hi, coef0_512); + __m512i mul34_lo = _mm512_mullo_epi32(val34_lo, coef2_512); + __m512i mul34_hi = _mm512_mullo_epi32(val34_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + __m512i ac_bd_32_lo = _mm512_add_epi32(mul32_lo, mul34_lo); + __m512i ac_bd_32_hi = _mm512_add_epi32(mul32_hi, mul34_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + __m512i ac_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_ab_512, ac_bd_32_hi); + __m512i bd_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_cd_512, ac_bd_32_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + __m512i res_32 = _mm512_add_epi32(ac_32, bd_32); + + _mm512_storeu_si512((__m512i*)(D + dx), res_0); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_32); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + 
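+            // store 16 horizontally filtered 32-bit results for this iteration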
_mm512_storeu_si512((__m512i*)(D + dx), res_0); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m512i val0 = _mm512_cvtepu16_epi32(val0_0); + __m512i val2 = _mm512_cvtepu16_epi32(val2_0); + + __m512i mul0 = _mm512_mullo_epi32(val0, coef0_512); + __m512i mul2 = _mm512_mullo_epi32(val2, coef2_512); + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0, mul2); + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_lo); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_lo); + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + _mm256_storeu_si256((__m256i*)(D + dx), _mm512_castsi512_si256(res_0)); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp_avx512(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip_avx512(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize_avx512(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i delta_512 = _mm512_set1_epi64(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi64(((1 << bitdepth) - 1)); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i perm_512 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); + __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = _mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mul_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mul_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mul_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mul_epi32(src3_0, coef0_512); + + __m512i mul0_4 = _mm512_mul_epi32(_mm512_srai_epi64(src0_0, 32), 
coef0_512); + __m512i mul1_4 = _mm512_mul_epi32(_mm512_srai_epi64(src1_0, 32), coef1_512); + __m512i mul2_4 = _mm512_mul_epi32(_mm512_srai_epi64(src2_0, 32), coef1_512); + __m512i mul3_4 = _mm512_mul_epi32(_mm512_srai_epi64(src3_0, 32), coef0_512); + + __m512i mul0_8 = _mm512_mul_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mul_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mul_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mul_epi32(src3_16, coef0_512); + + __m512i mul0_12 = _mm512_mul_epi32(_mm512_srai_epi64(src0_16, 32), coef0_512); + __m512i mul1_12 = _mm512_mul_epi32(_mm512_srai_epi64(src1_16, 32), coef1_512); + __m512i mul2_12 = _mm512_mul_epi32(_mm512_srai_epi64(src2_16, 32), coef1_512); + __m512i mul3_12 = _mm512_mul_epi32(_mm512_srai_epi64(src3_16, 32), coef0_512); + + __m512i accum_01_0 = _mm512_add_epi64(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi64(mul2_0, mul3_0); + __m512i accum_01_4 = _mm512_add_epi64(mul0_4, mul1_4); + __m512i accum_23_4 = _mm512_add_epi64(mul2_4, mul3_4); + __m512i accum_01_8 = _mm512_add_epi64(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi64(mul2_8, mul3_8); + __m512i accum_01_12 = _mm512_add_epi64(mul0_12, mul1_12); + __m512i accum_23_12 = _mm512_add_epi64(mul2_12, mul3_12); + + __m512i accum_0123_0 = _mm512_add_epi64(accum_01_0, accum_23_0); + __m512i accum_0123_4 = _mm512_add_epi64(accum_01_4, accum_23_4); + __m512i accum_0123_8 = _mm512_add_epi64(accum_01_8, accum_23_8); + __m512i accum_0123_12 = _mm512_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm512_add_epi64(accum_0123_0, delta_512); + accum_0123_4 = _mm512_add_epi64(accum_0123_4, delta_512); + accum_0123_8 = _mm512_add_epi64(accum_0123_8, delta_512); + accum_0123_12 = _mm512_add_epi64(accum_0123_12, delta_512); + + shift22_64b_signExt_512(accum_0123_0, accum_0123_0); + shift22_64b_signExt_512(accum_0123_4, accum_0123_4); + shift22_64b_signExt_512(accum_0123_8, accum_0123_8); + shift22_64b_signExt_512(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm512_max_epi64(accum_0123_0, zero_512); + accum_0123_4 = _mm512_max_epi64(accum_0123_4, zero_512); + accum_0123_8 = _mm512_max_epi64(accum_0123_8, zero_512); + accum_0123_12 = _mm512_max_epi64(accum_0123_12, zero_512); + + accum_0123_0 = _mm512_min_epi64(accum_0123_0, max_char_512); + accum_0123_4 = _mm512_min_epi64(accum_0123_4, max_char_512); + accum_0123_8 = _mm512_min_epi64(accum_0123_8, max_char_512); + accum_0123_12 = _mm512_min_epi64(accum_0123_12, max_char_512); + + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm512_or_si512(accum_0123_8, _mm512_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm512_permutexvar_epi32(perm_512, accum_0123_0); + + _mm512_storeu_si512((__m512i*)(dst + x), accum_0123_0); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = 
_mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srai_epi64(src0_8, 32), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srai_epi64(src1_8, 32), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srai_epi64(src2_8, 32), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srai_epi64(src3_8, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_01_8 = _mm256_add_epi64(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi64(mul2_8, mul3_8); + __m256i accum_01_12 = _mm256_add_epi64(mul0_12, mul1_12); + __m256i accum_23_12 = _mm256_add_epi64(mul2_12, mul3_12); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + __m256i accum_0123_8 = _mm256_add_epi64(accum_01_8, accum_23_8); + __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + shift22_64b_signExt_256(accum_0123_8, accum_0123_8); + shift22_64b_signExt_256(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi64(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi64(accum_0123_12, zero_256); + + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi64(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi64(accum_0123_12, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); + + _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + 
__m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); + + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); + + __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); + __m128i accum_01_4 = _mm_add_epi64(mul0_4, mul1_4); + __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); + __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); + __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); + accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); + + shift22_64b_signExt_128(accum_0123_0, accum_0123_0); + shift22_64b_signExt_128(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi64(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi64(accum_0123_4, zero_128); + accum_0123_0 = _mm_min_epi64(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi64(accum_0123_4, max_char_128); + + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); + + _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); + } + for (; x < width; x++) + dst[x] = hbd_castOp_avx512((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, 
int xmin, int xmax, int bitdepth) +#else +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[HBD_MAX_ESIZE_avx512]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip_avx512(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < HBD_MAX_ESIZE_avx512 && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + // printf("%d ", dy); + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c new file mode 100644 index 000000000..f5564d9d5 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.c @@ -0,0 +1,518 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer_avx2.h" +#include +#include + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize_avx2(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_avx2(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif + +{ + int xmax_64 = xmax - (xmax % 64); + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + __m256i coef0_256 = _mm256_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m256i coef2_256 = _mm256_set1_epi32(alpha[2] + (alpha[3] << 16)); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i coef0_128 = _mm_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m128i coef2_128 = _mm_set1_epi32(alpha[2] + (alpha[3] << 16)); + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_64; dx+=64) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
+#endif + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val32 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 32)); + __m256i val34 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 32)); + __m256i val64 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 64)); + __m256i val66 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 64)); + __m256i val96 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 96)); + __m256i val98 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 96)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i val32_lo = _mm256_unpacklo_epi8(val32, zero_256); + __m256i val32_hi = _mm256_unpackhi_epi8(val32, zero_256); + __m256i val34_lo = _mm256_unpacklo_epi8(val34, zero_256); + __m256i val34_hi = _mm256_unpackhi_epi8(val34, zero_256); + + __m256i val64_lo = _mm256_unpacklo_epi8(val64, zero_256); + __m256i val64_hi = _mm256_unpackhi_epi8(val64, zero_256); + __m256i val66_lo = _mm256_unpacklo_epi8(val66, zero_256); + __m256i val66_hi = _mm256_unpackhi_epi8(val66, zero_256); + + __m256i val96_lo = _mm256_unpacklo_epi8(val96, zero_256); + __m256i val96_hi = _mm256_unpackhi_epi8(val96, zero_256); + __m256i val98_lo = _mm256_unpacklo_epi8(val98, zero_256); + __m256i val98_hi = _mm256_unpackhi_epi8(val98, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + __m256i res32_lo = _mm256_madd_epi16(val32_lo, coef0_256); + __m256i res32_hi = _mm256_madd_epi16(val32_hi, coef0_256); + __m256i res34_lo = _mm256_madd_epi16(val34_lo, coef2_256); + __m256i res34_hi = _mm256_madd_epi16(val34_hi, coef2_256); + + __m256i res64_lo = _mm256_madd_epi16(val64_lo, coef0_256); + __m256i res64_hi = _mm256_madd_epi16(val64_hi, coef0_256); + __m256i res66_lo = _mm256_madd_epi16(val66_lo, coef2_256); + __m256i res66_hi = _mm256_madd_epi16(val66_hi, coef2_256); + __m256i res96_lo = _mm256_madd_epi16(val96_lo, coef0_256); + __m256i res96_hi = _mm256_madd_epi16(val96_hi, coef0_256); + __m256i res98_lo = _mm256_madd_epi16(val98_lo, coef2_256); + __m256i res98_hi = _mm256_madd_epi16(val98_hi, coef2_256); + + __m256i acc0_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i acc0_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i acc32_lo = _mm256_add_epi32(res32_lo, res34_lo); + __m256i acc32_hi = _mm256_add_epi32(res32_hi, res34_hi); + __m256i acc64_lo = _mm256_add_epi32(res64_lo, res66_lo); + __m256i acc64_hi = _mm256_add_epi32(res64_hi, res66_hi); + __m256i acc96_lo = _mm256_add_epi32(res96_lo, res98_lo); + __m256i acc96_hi = _mm256_add_epi32(res96_hi, res98_hi); + + __m256i tmp0 = acc0_lo; + __m256i tmp32 = acc32_lo; + __m256i tmp64 = acc64_lo; + __m256i tmp96 = acc96_lo; + + acc0_lo = _mm256_inserti128_si256(acc0_lo, _mm256_castsi256_si128(acc0_hi), 1); + acc0_hi = _mm256_inserti128_si256(acc0_hi, _mm256_extracti128_si256(tmp0, 1), 0); + acc32_lo = _mm256_inserti128_si256(acc32_lo, _mm256_castsi256_si128(acc32_hi), 1); + acc32_hi = _mm256_inserti128_si256(acc32_hi, _mm256_extracti128_si256(tmp32, 1), 0); + acc64_lo = _mm256_inserti128_si256(acc64_lo, _mm256_castsi256_si128(acc64_hi), 1); + acc64_hi = _mm256_inserti128_si256(acc64_hi, _mm256_extracti128_si256(tmp64, 1), 
0); + acc96_lo = _mm256_inserti128_si256(acc96_lo, _mm256_castsi256_si128(acc96_hi), 1); + acc96_hi = _mm256_inserti128_si256(acc96_hi, _mm256_extracti128_si256(tmp96, 1), 0); + + _mm256_storeu_si256((__m256i*)(D + dx), acc0_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), acc0_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 16), acc32_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 24), acc32_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 32), acc64_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 40), acc64_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 48), acc96_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 56), acc96_hi); + } + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; + + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m256i val32 = _mm256_loadu_si256((__m256i*)(S + sx - 1 + 32)); + __m256i val34 = _mm256_loadu_si256((__m256i*)(S + sx + 1 + 32)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i val32_lo = _mm256_unpacklo_epi8(val32, zero_256); + __m256i val32_hi = _mm256_unpackhi_epi8(val32, zero_256); + __m256i val34_lo = _mm256_unpacklo_epi8(val34, zero_256); + __m256i val34_hi = _mm256_unpackhi_epi8(val34, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + __m256i res32_lo = _mm256_madd_epi16(val32_lo, coef0_256); + __m256i res32_hi = _mm256_madd_epi16(val32_hi, coef0_256); + __m256i res34_lo = _mm256_madd_epi16(val34_lo, coef2_256); + __m256i res34_hi = _mm256_madd_epi16(val34_hi, coef2_256); + + __m256i acc0_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i acc0_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i acc32_lo = _mm256_add_epi32(res32_lo, res34_lo); + __m256i acc32_hi = _mm256_add_epi32(res32_hi, res34_hi); + __m256i tmp0 = acc0_lo; + __m256i tmp32 = acc32_lo; + + acc0_lo = _mm256_inserti128_si256(acc0_lo, _mm256_castsi256_si128(acc0_hi), 1); + acc0_hi = _mm256_inserti128_si256(acc0_hi, _mm256_extracti128_si256(tmp0, 1), 0); + acc32_lo = _mm256_inserti128_si256(acc32_lo, _mm256_castsi256_si128(acc32_hi), 1); + acc32_hi = _mm256_inserti128_si256(acc32_hi, _mm256_extracti128_si256(tmp32, 1), 0); + + _mm256_storeu_si256((__m256i*)(D + dx), acc0_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), acc0_hi); + _mm256_storeu_si256((__m256i*)(D + dx + 16), acc32_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 24), acc32_hi); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + + __m256i val0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); + + __m256i val0_lo = _mm256_unpacklo_epi8(val0, zero_256); + __m256i val0_hi = _mm256_unpackhi_epi8(val0, zero_256); + __m256i val2_lo = _mm256_unpacklo_epi8(val2, zero_256); + __m256i val2_hi = _mm256_unpackhi_epi8(val2, zero_256); + + __m256i res0_lo = _mm256_madd_epi16(val0_lo, coef0_256); + __m256i res0_hi = _mm256_madd_epi16(val0_hi, coef0_256); + __m256i res2_lo = _mm256_madd_epi16(val2_lo, coef2_256); + __m256i res2_hi = _mm256_madd_epi16(val2_hi, coef2_256); + + __m256i res_lo = _mm256_add_epi32(res0_lo, res2_lo); + __m256i res_hi = _mm256_add_epi32(res0_hi, res2_hi); + __m256i tmp = res_lo; + + 
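+            // The unpacklo/unpackhi + madd results are ordered per 128-bit lane; the inserti128/extracti128 pairs below reshuffle them so the two stores write D[dx..] contiguously.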
res_lo = _mm256_inserti128_si256(res_lo, _mm256_castsi256_si128(res_hi), 1); + res_hi = _mm256_inserti128_si256(res_hi, _mm256_extracti128_si256(tmp, 1), 0); + _mm256_storeu_si256((__m256i*)(D + dx), res_lo); + _mm256_storeu_si256((__m256i*)(D + dx + 8), res_hi); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m256i val0_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(S + sx - 1))); + __m256i val2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(S + sx + 1))); + + __m256i res0 = _mm256_madd_epi16(val0_16, coef0_256); + __m256i res2 = _mm256_madd_epi16(val2_16, coef2_256); + + __m256i res = _mm256_add_epi32(res0, res2); + _mm256_storeu_si256((__m256i*)(D + dx), res); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m128i val0_16 = _mm_cvtepu8_epi16(val0); + __m128i val2_16 = _mm_cvtepu8_epi16(val2); + + __m128i res0 = _mm_madd_epi16(val0_16, coef0_128); + __m128i res2 = _mm_madd_epi16(val2_16, coef2_128); + + __m128i res = _mm_add_epi32(res0, res2); + _mm_storeu_si128((__m128i*)(D + dx), res); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_avx2(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m256i perm0_256 = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 4, 0); + __m256i perm8_256 = _mm256_set_epi32(1, 1, 1, 1, 4, 0, 1, 1); + __m256i sh_32_to_8_256 = _mm256_set_epi64x(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i zero_256 = _mm256_setzero_si256(); + __m256i delta_256 = _mm256_set1_epi32(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi32(255); + + __m128i sh_32_to_8_128 = _mm_set_epi64x(0x8080808080808080, 0x808080800C080400); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + __m128i delta_128 = _mm_set1_epi32(1 << (bits - 1)); + __m128i max_char_128 = _mm_set1_epi32(255); + + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i mul0_8 = _mm256_mullo_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mullo_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mullo_epi32(src2_8, coef1_256); + __m256i mul3_8 = 
_mm256_mullo_epi32(src3_8, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_01_8 = _mm256_add_epi32(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi32(mul2_8, mul3_8); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + __m256i accum_0123_8 = _mm256_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_8 = _mm256_add_epi32(accum_0123_8, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm256_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_8 = _mm256_shuffle_epi8(accum_0123_8, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + accum_0123_8 = _mm256_permutevar8x32_epi32(accum_0123_8, perm8_256); + + __m128i accum = _mm256_extracti128_si256(_mm256_or_si256(accum_0123_0, accum_0123_8), 0); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + + __m128i accum = _mm256_castsi256_si128(accum_0123_0); + _mm_storel_epi64((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mullo_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mullo_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mullo_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mullo_epi32(src3_0, coef0_128); + + __m128i accum_01_0 = _mm_add_epi32(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi32(mul2_0, mul3_0); + __m128i accum_0123_0 = _mm_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm_add_epi32(accum_0123_0, delta_128); + accum_0123_0 = _mm_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + + accum_0123_0 = _mm_shuffle_epi8(accum_0123_0, sh_32_to_8_128); + _mm_maskstore_epi32((int*)(dst + x), _mm_set_epi32(0, 0, 0, 0x80000000), accum_0123_0); + } + + for (; x < width; 
x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_avx2(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_avx2(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_avx2((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx2((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize_avx2((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx2((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h new file mode 100644 index 000000000..45676e7cf --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx2.h @@ -0,0 +1,31 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +// #include "integer_funque_filters.h" +#include "../resizer.h" + +void vresize_avx2(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step_avx2(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step_avx2(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx2(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif +//void hbd_resize(const unsigned short *_src, unsigned short *_dst, int iwidth, int iheight, int dwidth, int dheight, int bitdepth); \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c new file mode 100644 index 000000000..782c91ba6 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c @@ -0,0 +1,519 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer_avx512.h" +#include +#include + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize_avx512(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_avx512(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + int xmax_64 = xmax - (xmax % 64); + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + + __m512i coef0_512 = _mm512_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m512i coef2_512 = _mm512_set1_epi32(alpha[2] + (alpha[3] << 16)); + __m512i permlo_512 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + __m512i permhi_512 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i coef0_256 = _mm256_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m256i coef2_256 = _mm256_set1_epi32(alpha[2] + (alpha[3] << 16)); + + __m128i coef0_128 = _mm_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m128i coef2_128 = _mm_set1_epi32(alpha[2] + (alpha[3] << 16)); + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_64; dx+=64) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
+#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val64 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 64)); + __m512i val66 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 64)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i val64_lo = _mm512_unpacklo_epi8(val64, zero_512); + __m512i val64_hi = _mm512_unpackhi_epi8(val64, zero_512); + __m512i val66_lo = _mm512_unpacklo_epi8(val66, zero_512); + __m512i val66_hi = _mm512_unpackhi_epi8(val66, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res64_lo = _mm512_madd_epi16(val64_lo, coef0_512); + __m512i res64_hi = _mm512_madd_epi16(val64_hi, coef0_512); + __m512i res66_lo = _mm512_madd_epi16(val66_lo, coef2_512); + __m512i res66_hi = _mm512_madd_epi16(val66_hi, coef2_512); + + __m512i r0_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i r0_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i r1_lo = _mm512_add_epi32(res64_lo, res66_lo); + __m512i r1_hi = _mm512_add_epi32(res64_hi, res66_hi); + __m512i tmp0 = r0_lo; + __m512i tmp1 = r1_lo; + + r0_lo = _mm512_permutex2var_epi64(r0_lo, permlo_512, r0_hi); + r0_hi = _mm512_permutex2var_epi64(tmp0, permhi_512, r0_hi); + r1_lo = _mm512_permutex2var_epi64(r1_lo, permlo_512, r1_hi); + r1_hi = _mm512_permutex2var_epi64(tmp1, permhi_512, r1_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), r0_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), r0_hi); + _mm512_storeu_si512((__m512i*)(D + dx + 32), r1_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 48), r1_hi); + } + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i tmp = res_lo; + + res_lo = _mm512_permutex2var_epi64(res_lo, permlo_512, res_hi); + res_hi = _mm512_permutex2var_epi64(tmp, permhi_512, res_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx - 1))); + __m512i val2 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx + 1))); + + __m512i res0_lo = _mm512_madd_epi16(val0, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); 
+ + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m256i val0_16 = _mm256_cvtepu8_epi16(val0); + __m256i val2_16 = _mm256_cvtepu8_epi16(val2); + + __m256i res0 = _mm256_madd_epi16(val0_16, coef0_256); + __m256i res2 = _mm256_madd_epi16(val2_16, coef2_256); + + __m256i res = _mm256_add_epi32(res0, res2); + _mm256_storeu_si256((__m256i*)(D + dx), res); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m128i val0_16 = _mm_cvtepu8_epi16(val0); + __m128i val2_16 = _mm_cvtepu8_epi16(val2); + + __m128i res0 = _mm_madd_epi16(val0_16, coef0_128); + __m128i res2 = _mm_madd_epi16(val2_16, coef2_128); + + __m128i res = _mm_add_epi32(res0, res2); + _mm_storeu_si128((__m128i*)(D + dx), res); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i sh_32_to_8_512 = _mm512_set_epi64(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m512i perm0_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0); + __m512i perm8_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0, 1, 1, 1, 1); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i delta_512 = _mm512_set1_epi32(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi32(255); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i sh_32_to_8_256 = _mm256_set_epi64x(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m256i perm0_256 = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 4, 0); + __m256i perm8_256 = _mm256_set_epi32(1, 1, 1, 1, 4, 0, 1, 1); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i delta_256 = _mm256_set1_epi32(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi32(255); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i sh_32_to_8_128 = _mm_set_epi64x(0x8080808080808080, 0x808080800C080400); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i delta_128 = _mm_set1_epi32(1 << (bits - 1)); + __m128i max_char_128 = _mm_set1_epi32(255); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = 
_mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mullo_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mullo_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mullo_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mullo_epi32(src3_0, coef0_512); + + __m512i mul0_8 = _mm512_mullo_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mullo_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mullo_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mullo_epi32(src3_16, coef0_512); + + __m512i accum_01_0 = _mm512_add_epi32(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi32(mul2_0, mul3_0); + __m512i accum_01_8 = _mm512_add_epi32(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi32(mul2_8, mul3_8); + __m512i accum_0123_0 = _mm512_add_epi32(accum_01_0, accum_23_0); + __m512i accum_0123_8 = _mm512_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm512_add_epi32(accum_0123_0, delta_512); + accum_0123_8 = _mm512_add_epi32(accum_0123_8, delta_512); + accum_0123_0 = _mm512_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm512_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm512_max_epi32(accum_0123_0, zero_512); + accum_0123_8 = _mm512_max_epi32(accum_0123_8, zero_512); + accum_0123_0 = _mm512_min_epi32(accum_0123_0, max_char_512); + accum_0123_8 = _mm512_min_epi32(accum_0123_8,max_char_512); + + accum_0123_0 = _mm512_shuffle_epi8(accum_0123_0, sh_32_to_8_512); + accum_0123_8 = _mm512_shuffle_epi8(accum_0123_8, sh_32_to_8_512); + + accum_0123_0 = _mm512_permutexvar_epi32(perm0_512, accum_0123_0); + accum_0123_8 = _mm512_permutexvar_epi32(perm8_512, accum_0123_8); + __m256i accum = _mm512_extracti32x8_epi32(_mm512_or_si512(accum_0123_0, accum_0123_8), 0); + _mm256_storeu_si256((__m256i*)(dst + x), accum); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i mul0_8 = _mm256_mullo_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mullo_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mullo_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mullo_epi32(src3_8, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_01_8 = _mm256_add_epi32(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi32(mul2_8, mul3_8); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + __m256i accum_0123_8 = _mm256_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_8 = _mm256_add_epi32(accum_0123_8, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm256_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, 
zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_8 = _mm256_shuffle_epi8(accum_0123_8, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + accum_0123_8 = _mm256_permutevar8x32_epi32(accum_0123_8, perm8_256); + + __m128i accum = _mm256_extracti128_si256(_mm256_or_si256(accum_0123_0, accum_0123_8), 0); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + + __m128i accum = _mm256_castsi256_si128(accum_0123_0); + _mm_storel_epi64((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mullo_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mullo_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mullo_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mullo_epi32(src3_0, coef0_128); + + __m128i accum_01_0 = _mm_add_epi32(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi32(mul2_0, mul3_0); + __m128i accum_0123_0 = _mm_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm_add_epi32(accum_0123_0, delta_128); + accum_0123_0 = _mm_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + + accum_0123_0 = _mm_shuffle_epi8(accum_0123_0, sh_32_to_8_128); + _mm_maskstore_epi32((int*)(dst + x), _mm_set_epi32(0, 0, 0, 0x80000000), accum_0123_0); + } + + for (; x < width; x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? 
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h new file mode 100644 index 000000000..a3adc148a --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h @@ -0,0 +1,29 @@ +/** + * + * Copyright (C) 2022 Intel Corporation. + * Copyright (c) 2022-2024 Meta, Inc. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/license/bsd-3-clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "../resizer.h" + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif \ No newline at end of file
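
Reviewer note (not part of the patch): every hresize_*/vresize_* kernel above implements the same fixed-coefficient 2:1 bicubic downscale, and the sketch below is a minimal scalar reference for what the OPTIMISED_COEFF path computes. The tap values (-192, 1216, 1216, -192) are an assumption derived from interpolateCubic(0.5) with A = -0.75 scaled by the 2048 coefficient scale, and the helper names (kTaps, ref_cast, ref_hresize_row, ref_vresize_row) are illustrative only; the real alpha/beta arrays come from the caller in resizer.c.

#include <stdint.h>

/* Assumed 4-tap filter in Q11 (sums to 2048); see the hedging note above. */
static const short kTaps[4] = { -192, 1216, 1216, -192 };

/* Mirrors castOp(): add the 2^21 rounding term, shift right by 22, clamp to 8 bits. */
static unsigned char ref_cast(int64_t v)
{
    const int bits = 22;
    int64_t r = (v + (1 << (bits - 1))) >> bits;
    return (unsigned char)(r < 0 ? 0 : (r > 255 ? 255 : r));
}

/* Horizontal pass for interior pixels of one row (xmin/xmax border handling omitted). */
static void ref_hresize_row(const unsigned char *S, int *D, int dwidth)
{
    for (int dx = 1; dx + 1 < dwidth; dx++) {
        int sx = dx * 2;                        /* the OPTIMISED_COEFF source mapping */
        D[dx] = S[sx - 1] * kTaps[0] + S[sx] * kTaps[1]
              + S[sx + 1] * kTaps[2] + S[sx + 2] * kTaps[3];
    }
}

/* Vertical pass: combine four horizontally filtered rows into one output row. */
static void ref_vresize_row(const int *const rows[4], unsigned char *dst, int width)
{
    for (int x = 0; x < width; x++)
        dst[x] = ref_cast((int64_t)rows[0][x] * kTaps[0] + (int64_t)rows[1][x] * kTaps[1]
                        + (int64_t)rows[2][x] * kTaps[2] + (int64_t)rows[3][x] * kTaps[3]);
}

The high-bit-depth (hbd_*) variants follow the same structure, differing only in the unsigned short input/output type and in clamping to (1 << bitdepth) - 1 instead of 255.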