From b2f99d967c3b4eeaaacebb6681aa6a664a3aad7c Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Thu, 11 Apr 2024 10:43:41 -0700 Subject: [PATCH] simd rfx yuv --- module/amd64/Makefile.am | 1 + .../a8r8g8b8_to_yuvalp_box_amd64_sse2.asm | 178 ++++++++++++++++++ module/amd64/funcs_amd64.h | 4 + module/rdp.h | 1 + module/rdpCapture.c | 95 ++++++---- module/rdpCapture.h | 4 + module/rdpSimd.c | 87 +++++++++ module/x86/Makefile.am | 1 + .../x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm | 171 +++++++++++++++++ module/x86/funcs_x86.h | 4 + 10 files changed, 505 insertions(+), 41 deletions(-) create mode 100644 module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm create mode 100644 module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am index cd2a0204..be4d928b 100644 --- a/module/amd64/Makefile.am +++ b/module/amd64/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \ a8r8g8b8_to_nv12_box_amd64_sse2.asm \ + a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \ cpuid_amd64.asm \ i420_to_rgb32_amd64_sse2.asm \ uyvy_to_rgb32_amd64_sse2.asm \ diff --git a/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm new file mode 100644 index 00000000..cfe9d6af --- /dev/null +++ b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm @@ -0,0 +1,178 @@ +; +;Copyright 2024 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. 
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;amd64 SSE2
+;output planes: Y at d8, then U, V and alpha, each 64 * 64 bytes further on
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255
+    cw128  times 8 dw 128
+    cw77   times 8 dw 77
+    cw150  times 8 dw 150
+    cw29   times 8 dw 29
+    cw43   times 8 dw 43
+    cw85   times 8 dw 85
+    cw107  times 8 dw 107
+    cw21   times 8 dw 21
+
+%define LS8             [rsp + 0]  ; s8
+%define LSRC_STRIDE     [rsp + 8]  ; src_stride
+%define LD8             [rsp + 16] ; d8
+%define LDST_STRIDE     [rsp + 24] ; dst_stride
+%define LWIDTH          [rsp + 32] ; width
+%define LHEIGHT         [rsp + 40] ; height
+
+;The first six integer or pointer arguments are passed in registers
+; RDI, RSI, RDX, RCX, R8, and R9; an int only defines the low 32 bits
+
+;int
+;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+;                                  uint8_t *d8, int dst_stride,
+;                                  int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_amd64_sse2
+    push rbx
+    push rbp
+    sub rsp, 48                     ; local vars, 48 bytes
+
+    mov LS8, rdi                    ; s8
+    mov LSRC_STRIDE, rsi            ; src_stride, only the low dword is used
+    mov LD8, rdx                    ; d8
+    mov LDST_STRIDE, rcx            ; dst_stride, only the low dword is used
+    mov LWIDTH, r8                  ; width
+    mov LHEIGHT, r9                 ; height
+
+    pxor xmm7, xmm7                 ; xmm7 = 0, second operand for packuswb
+
+    mov ebx, LHEIGHT                ; ebx = height
+
+row_loop1:
+    mov rsi, LS8                    ; s8
+    mov rdi, LD8                    ; d8
+
+    mov ecx, LWIDTH                 ; ecx = width
+    shr ecx, 3                      ; doing 8 pixels at a time
+
+loop1:
+    movdqu xmm0, [rsi]              ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0               ; blue
+    pand xmm1, [lsym(cd255)]        ; blue
+    movdqa xmm2, xmm0               ; green
+    psrld xmm2, 8                   ; green
+    pand xmm2, [lsym(cd255)]        ; green
+    movdqa xmm3, xmm0               ; red
+    psrld xmm3, 16                  ; red
+    pand xmm3, [lsym(cd255)]        ; red
+    movdqa xmm4, xmm0               ; alpha
+    psrld xmm4, 24                  ; alpha
+    pand xmm4, [lsym(cd255)]        ; alpha
+
+    movdqu xmm0, [rsi + 16]         ; 4 pixels, 16 bytes
+    movdqa xmm5, xmm0               ; alpha
+    psrld xmm5, 24                  ; alpha
+    pand xmm5, [lsym(cd255)]        ; alpha
+    packssdw xmm4, xmm5             ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes
+    movq [rdi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa
+    movdqa xmm4, xmm0               ; blue
+    pand xmm4, [lsym(cd255)]        ; blue
+    movdqa xmm5, xmm0               ; green
+    psrld xmm5, 8                   ; green
+    pand xmm5, [lsym(cd255)]        ; green
+    movdqa xmm6, xmm0               ; red
+    psrld xmm6, 16                  ; red
+    pand xmm6, [lsym(cd255)]        ; red
+
+    packssdw xmm1, xmm4             ; xmm1 = 8 blues
+    packssdw xmm2, xmm5             ; xmm2 = 8 greens
+    packssdw xmm3, xmm6             ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8;
+    movdqa xmm4, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm6, xmm3               ; red
+    pmullw xmm4, [lsym(cw29)]       ; 29 * blue
+    pmullw xmm5, [lsym(cw150)]      ; 150 * green
+    pmullw xmm6, [lsym(cw77)]       ; 77 * red
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6                ; at most 256 * 255, fits 16 bits unsigned
+    psrlw xmm4, 8                   ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes
+    movq [rdi], xmm4                ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm6, xmm3               ; red
+    pmullw xmm4, [lsym(cw128)]      ; 128 * blue
+    pmullw xmm5, [lsym(cw85)]       ; 85 * green
+    pmullw xmm6, [lsym(cw43)]       ; 43 * red
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6                ; within +/- 128 * 255, fits 16 bits signed
+    psraw xmm4, 8                   ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]       ; + 128
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes, saturated to 0..255
+    movq [rdi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu
+
+    ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm4, xmm3               ; red
+    pmullw xmm4, [lsym(cw128)]      ; 128 * red
+    pmullw xmm5, [lsym(cw107)]      ; 107 * green
+    pmullw xmm6, [lsym(cw21)]       ; 21 * blue
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6                ; within +/- 128 * 255, fits 16 bits signed
+    psraw xmm4, 8                   ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]       ; + 128
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes, saturated to 0..255
+    movq [rdi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv
+
+    ; move right
+    lea rsi, [rsi + 32]             ; 8 source pixels, 4 bytes each
+    lea rdi, [rdi + 8]              ; 8 bytes in every output plane
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    movsxd rax, dword LSRC_STRIDE   ; src_stride is an int, sign extend it
+    add rax, LS8                    ; s8 += src_stride
+    mov LS8, rax
+
+    ; update d8
+    movsxd rax, dword LDST_STRIDE   ; dst_stride is an int, sign extend it
+    add rax, LD8                    ; d8 += dst_stride
+    mov LD8, rax
+
+    dec ebx
+    jnz row_loop1
+
+    xor eax, eax                    ; return value 0
+    add rsp, 48                     ; local vars, 48 bytes
+    pop rbp
+    pop rbx
+    ret
+END_OF_FILE
diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h
index ae38c53b..3b54e2b2 100644
--- a/module/amd64/funcs_amd64.h
+++ b/module/amd64/funcs_amd64.h
@@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride,
                                 uint8_t *d8_y, int dst_stride_y,
                                 uint8_t *d8_uv, int dst_stride_uv,
                                 int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+                                  uint8_t *d8, int dst_stride,
+                                  int width, int height);
 
 #endif
 
diff --git a/module/rdp.h b/module/rdp.h
index 844db424..f9e4ac61 100644
--- a/module/rdp.h
+++ b/module/rdp.h
@@ -297,6 +297,7 @@ struct _rdpRec
 
     copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
     copy_box_dst2_proc a8r8g8b8_to_nv12_box;
+    copy_box_proc a8r8g8b8_to_yuvalp_box;
 
     /* multimon */
     struct monitor_info minfo[16]; /* client monitor data */
diff --git a/module/rdpCapture.c b/module/rdpCapture.c
index 334a4880..893917ce 100644
--- a/module/rdpCapture.c
+++ b/module/rdpCapture.c
@@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay,
 /* 19595  38470   7471
   -11071 -21736  32807
    32756 -27429  -5327 */
-static int
-rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
-                              const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              BoxPtr rects, int num_rects)
+int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height)
 {
-    const uint8_t *s8;
-    uint8_t *d8;
     uint8_t *yptr;
     uint8_t *uptr;
     uint8_t *vptr;
     uint8_t *aptr;
     const uint32_t *s32;
-    int index;
     int jndex;
     int kndex;
-    int width;
-    int height;
     uint32_t pixel;
     uint8_t a;
     int r;
@@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
     int y;
     int u;
     int v;
+
+    for (jndex = 0; jndex < height; jndex++)
+    {
+        s32 = (const uint32_t *) s8;
+        yptr = d8;
+        uptr = yptr + 64 * 64; /* the planes are 64 * 64 bytes apart */
+        vptr = uptr + 64 * 64;
+        aptr = vptr + 64 * 64;
+        kndex = 0;
+        while (kndex < width)
+        {
+            pixel = *(s32++);
+            RGB_SPLIT(a, r, g, b, pixel);
+            y = (r *  19595 + g *  38470 + b *   7471) >> 16;
+            u = (r * -11071 + g * -21736 + b *  32807) >> 16;
+            v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
+            u = u + 128;
+            v = v + 128;
+            y = RDPCLAMP(y, 0, UCHAR_MAX);
+            u = RDPCLAMP(u, 0, UCHAR_MAX);
+            v = RDPCLAMP(v, 0, UCHAR_MAX);
+            *(yptr++) = y;
+            *(uptr++) = u;
+            *(vptr++) = v;
+            *(aptr++) = a;
+            kndex++;
+        }
+        d8 += dst_stride;
+        s8 += src_stride;
+    }
+    return 0;
+}
+
+/******************************************************************************/
+static int
+rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay,
+                              const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              BoxPtr rects, int num_rects)
+{
+    const uint8_t *s8;
+    uint8_t *d8;
+    int index;
+    int width;
+    int height;
     BoxPtr box;
 
     dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8);
@@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
         d8 += box->x1 - ax;
         width = box->x2 - box->x1;
         height = box->y2 - box->y1;
-        for (jndex = 0; jndex < height; jndex++)
-        {
-            s32 = (const uint32_t *) s8;
-            yptr = d8;
-            uptr = yptr + 64 * 64;
-            vptr = uptr + 64 * 64;
-            aptr = vptr + 64 * 64;
-            kndex = 0;
-            while (kndex < width)
-            {
-                pixel = *(s32++);
-                RGB_SPLIT(a, r, g, b, pixel);
-                y = (r *  19595 + g *  38470 + b *   7471) >> 16;
-                u = (r * -11071 + g * -21736 + b *  32807) >> 16;
-                v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
-                u = u + 128;
-                v = v + 128;
-                y = RDPCLAMP(y, 0, UCHAR_MAX);
-                u = RDPCLAMP(u, 0, UCHAR_MAX);
-                v = RDPCLAMP(v, 0, UCHAR_MAX);
-                *(yptr++) = y;
-                *(uptr++) = u;
-                *(vptr++) = v;
-                *(aptr++) = a;
-                kndex++;
-            }
-            d8 += 64;
-            s8 += src_stride;
-        }
+        clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride,
+                                               d8, 64,
+                                               width, height);
     }
     return 0;
 }
@@ -946,7 +959,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr
in_reg, BoxPtr *out_rects,
                 rects = REGION_RECTS(&tile_reg);
                 num_rects = REGION_NUM_RECTS(&tile_reg);
                 crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp);
-                rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                               src, src_stride,
                                               dst, dst_stride,
                                               rects, num_rects);
@@ -975,7 +988,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                 /* lazily only do this if hash wasn't identical */
                 if (rcode != rgnPART)
                 {
-                    rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                    rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                                   src, src_stride,
                                                   dst, dst_stride,
                                                   &rect, 1);
diff --git a/module/rdpCapture.h b/module/rdpCapture.h
index c4dd04a6..5bf3fc76 100644
--- a/module/rdpCapture.h
+++ b/module/rdpCapture.h
@@ -49,5 +49,9 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride,
                      uint8_t *d8_y, int dst_stride_y,
                      uint8_t *d8_uv, int dst_stride_uv,
                      int width, int height);
+extern _X_EXPORT int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height);
 
 #endif
diff --git a/module/rdpSimd.c b/module/rdpSimd.c
index 49a3653e..05dd6a03 100644
--- a/module/rdpSimd.c
+++ b/module/rdpSimd.c
@@ -62,6 +62,90 @@ int g_simd_use_accel = 1;
 #define LLOGLN(_level, _args) \
   do { if (_level < LOG_LEVEL) { ErrorF _args ; ErrorF("\n"); } } while (0)
 
+#if SIMD_USE_ACCEL
+
+#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
+/*****************************************************************************/
+/* Convert a box of a8r8g8b8 pixels to planar yuv plus alpha.  The SSE2
+   routine handles the columns that fit in groups of 8, the portable C
+   routine the remaining 0..7.  Returns 0 or the first non zero result. */
+int
+a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                       uint8_t *d8, int dst_stride,
+                                       int width, int height)
+{
+    int simd_width; /* leading columns, a multiple of 8 */
+    int tail_width; /* columns left after simd_width */
+    int rv;
+
+    if (height <= 0)
+    {
+        /* nothing to convert */
+        return 0;
+    }
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        /* the assembly converts 8 pixels per loop iteration */
+        rv = a8r8g8b8_to_yuvalp_box_amd64_sse2(s8, src_stride,
+                                               d8, dst_stride,
+                                               simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes and 1 byte in each plane per pixel */
+        rv = a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                    d8 + simd_width, dst_stride,
+                                    tail_width, height);
+    }
+    return rv;
+}
+#endif
+
+#if defined(__x86__) || defined(_M_IX86) || defined(__i386__)
+/*****************************************************************************/
+/* Convert a box of a8r8g8b8 pixels to planar yuv plus alpha.  The SSE2
+   routine handles the columns that fit in groups of 8, the portable C
+   routine the remaining 0..7.  Returns 0 or the first non zero result. */
+int
+a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                     uint8_t *d8, int dst_stride,
+                                     int width, int height)
+{
+    int simd_width; /* leading columns, a multiple of 8 */
+    int tail_width; /* columns left after simd_width */
+    int rv;
+
+    if (height <= 0)
+    {
+        /* nothing to convert */
+        return 0;
+    }
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        /* the assembly converts 8 pixels per loop iteration */
+        rv = a8r8g8b8_to_yuvalp_box_x86_sse2(s8, src_stride,
+                                             d8, dst_stride,
+                                             simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* 4 source bytes and 1 byte in each plane per pixel */
+        rv = a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                    d8 + simd_width, dst_stride,
+                                    tail_width, height);
+    }
+    return rv;
+}
+#endif
+
+#endif
+
 /*****************************************************************************/
 Bool
 rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
@@ -77,6 +161,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
     dev->uyvy_to_rgb32 = UYVY_to_RGB32;
     dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box;
     dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box;
+    dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box;
 #if SIMD_USE_ACCEL
     if (g_simd_use_accel)
     {
@@ -93,6 +178,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2;
             dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned"));
         }
 #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
@@ -108,6 +194,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2;
             dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned"));
         }
 #endif
diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am
index ed106863..9539f8c0 100644
--- a/module/x86/Makefile.am
+++ b/module/x86/Makefile.am
@@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_I386
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \
   a8r8g8b8_to_nv12_box_x86_sse2.asm \
+  a8r8g8b8_to_yuvalp_box_x86_sse2.asm \
   cpuid_x86.asm \
   i420_to_rgb32_x86_sse2.asm \
   uyvy_to_rgb32_x86_sse2.asm \
diff --git a/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
new file mode 100644
index 00000000..cec02043
--- /dev/null
+++ b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
@@ -0,0 +1,171 @@
+;
+;Copyright 2024 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;x86 SSE2
+;output planes: Y at d8, then U, V and alpha, each 64 * 64 bytes further on
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255
+    cw128  times 8 dw 128
+    cw77   times 8 dw 77
+    cw150  times 8 dw 150
+    cw29   times 8 dw 29
+    cw43   times 8 dw 43
+    cw85   times 8 dw 85
+    cw107  times 8 dw 107
+    cw21   times 8 dw 21
+
+%define LS8             [esp + 20] ; s8
+%define LSRC_STRIDE     [esp + 24] ; src_stride
+%define LD8             [esp + 28] ; d8
+%define LDST_STRIDE     [esp + 32] ; dst_stride
+%define LWIDTH          [esp + 36] ; width
+%define LHEIGHT         [esp + 40] ; height
+;offsets above skip the four registers pushed below and the return address
+;int
+;a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+;                                uint8_t *d8, int dst_stride,
+;                                int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_x86_sse2
+    push ebx
+    RETRIEVE_RODATA
+    push esi
+    push edi
+    push ebp
+
+    pxor xmm7, xmm7                 ; xmm7 = 0, second operand for packuswb
+
+    mov ebp, LHEIGHT                ; ebp = height
+
+row_loop1:
+    mov esi, LS8                    ; s8
+    mov edi, LD8                    ; d8
+
+    mov ecx, LWIDTH                 ; ecx = width
+    shr ecx, 3                      ; doing 8 pixels at a time
+
+loop1:
+    movdqu xmm0, [esi]              ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0               ; blue
+    pand xmm1, [lsym(cd255)]        ; blue
+    movdqa xmm2, xmm0               ; green
+    psrld xmm2, 8                   ; green
+    pand xmm2, [lsym(cd255)]        ; green
+    movdqa xmm3, xmm0               ; red
+    psrld xmm3, 16                  ; red
+    pand xmm3, [lsym(cd255)]        ; red
+    movdqa xmm4, xmm0               ; alpha
+    psrld xmm4, 24                  ; alpha
+    pand xmm4, [lsym(cd255)]        ; alpha
+
+    movdqu xmm0, [esi + 16]         ; 4 pixels, 16 bytes
+    movdqa xmm5, xmm0               ; alpha
+    psrld xmm5, 24                  ; alpha
+    pand xmm5, [lsym(cd255)]        ; alpha
+    packssdw xmm4, xmm5             ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes
+    movq [edi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa
+    movdqa xmm4, xmm0               ; blue
+    pand xmm4, [lsym(cd255)]        ; blue
+    movdqa xmm5, xmm0               ; green
+    psrld xmm5, 8                   ; green
+    pand xmm5, [lsym(cd255)]        ; green
+    movdqa xmm6, xmm0               ; red
+    psrld xmm6, 16                  ; red
+    pand xmm6, [lsym(cd255)]        ; red
+
+    packssdw xmm1, xmm4             ; xmm1 = 8 blues
+    packssdw xmm2, xmm5             ; xmm2 = 8 greens
+    packssdw xmm3, xmm6             ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8;
+    movdqa xmm4, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm6, xmm3               ; red
+    pmullw xmm4, [lsym(cw29)]       ; 29 * blue
+    pmullw xmm5, [lsym(cw150)]      ; 150 * green
+    pmullw xmm6, [lsym(cw77)]       ; 77 * red
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6                ; at most 256 * 255, fits 16 bits unsigned
+    psrlw xmm4, 8                   ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes
+    movq [edi], xmm4                ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm6, xmm3               ; red
+    pmullw xmm4, [lsym(cw128)]      ; 128 * blue
+    pmullw xmm5, [lsym(cw85)]       ; 85 * green
+    pmullw xmm6, [lsym(cw43)]       ; 43 * red
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6                ; within +/- 128 * 255, fits 16 bits signed
+    psraw xmm4, 8                   ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]       ; + 128
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes, saturated to 0..255
+    movq [edi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu
+
+    ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1               ; blue
+    movdqa xmm5, xmm2               ; green
+    movdqa xmm4, xmm3               ; red
+    pmullw xmm4, [lsym(cw128)]      ; 128 * red
+    pmullw xmm5, [lsym(cw107)]      ; 107 * green
+    pmullw xmm6, [lsym(cw21)]       ; 21 * blue
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6                ; within +/- 128 * 255, fits 16 bits signed
+    psraw xmm4, 8                   ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]       ; + 128
+    packuswb xmm4, xmm7             ; 8 words to 8 bytes, saturated to 0..255
+    movq [edi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv
+
+    ; move right
+    lea esi, [esi + 32]             ; 8 source pixels, 4 bytes each
+    lea edi, [edi + 8]              ; 8 bytes in every output plane
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov eax, LS8                    ; s8
+    add eax, LSRC_STRIDE            ; s8 += src_stride
+    mov LS8, eax                    ; kept in the argument slot on the stack
+
+    ; update d8
+    mov eax, LD8                    ; d8
+    add eax, LDST_STRIDE            ; d8 += dst_stride
+    mov LD8, eax                    ; kept in the argument slot on the stack
+
+    dec ebp
+    jnz row_loop1
+
+    xor eax, eax                    ; return value 0
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ret
+END_OF_FILE
diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h
index c70cc8cf..d1f3357d 100644
--- a/module/x86/funcs_x86.h
+++ b/module/x86/funcs_x86.h
@@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride,
                               uint8_t *d8_y, int dst_stride_y,
                               uint8_t *d8_uv, int dst_stride_uv,
                               int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+                                uint8_t *d8, int dst_stride,
+                                int width, int height);
 
 #endif
 