From b2f99d967c3b4eeaaacebb6681aa6a664a3aad7c Mon Sep 17 00:00:00 2001
From: Jay Sorg <jay.sorg@gmail.com>
Date: Thu, 11 Apr 2024 10:43:41 -0700
Subject: [PATCH] simd rfx yuv

---
 module/amd64/Makefile.am                      |   1 +
 .../a8r8g8b8_to_yuvalp_box_amd64_sse2.asm     | 178 ++++++++++++++++++
 module/amd64/funcs_amd64.h                    |   4 +
 module/rdp.h                                  |   1 +
 module/rdpCapture.c                           |  95 ++++++----
 module/rdpCapture.h                           |   4 +
 module/rdpSimd.c                              |  87 +++++++++
 module/x86/Makefile.am                        |   1 +
 .../x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm   | 171 +++++++++++++++++
 module/x86/funcs_x86.h                        |   4 +
 10 files changed, 505 insertions(+), 41 deletions(-)
 create mode 100644 module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
 create mode 100644 module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm

diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am
index cd2a0204..be4d928b 100644
--- a/module/amd64/Makefile.am
+++ b/module/amd64/Makefile.am
@@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \
   a8r8g8b8_to_nv12_box_amd64_sse2.asm \
+  a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \
   cpuid_amd64.asm \
   i420_to_rgb32_amd64_sse2.asm \
   uyvy_to_rgb32_amd64_sse2.asm \
diff --git a/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
new file mode 100644
index 00000000..cfe9d6af
--- /dev/null
+++ b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
@@ -0,0 +1,178 @@
+;
+;Copyright 2024 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;amd64 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255       ; per dword mask, keeps the low byte
+    cw128  times 8 dw 128       ; 128 * B in U, 128 * R in V, +128 bias
+    cw77   times 8 dw 77        ; Y weight of red
+    cw150  times 8 dw 150       ; Y weight of green
+    cw29   times 8 dw 29        ; Y weight of blue
+    cw43   times 8 dw 43        ; U weight of red, subtracted
+    cw85   times 8 dw 85        ; U weight of green, subtracted
+    cw107  times 8 dw 107       ; V weight of green, subtracted
+    cw21   times 8 dw 21        ; V weight of blue, subtracted
+; NOTE(review): used as SSE2 memory operands, confirm 16 byte alignment
+%define LS8            [rsp +   0] ; s8, start of the current source row
+%define LSRC_STRIDE    [rsp +   8] ; src_stride, added to s8 after a row
+%define LD8            [rsp +  16] ; d8, start of the current dest row
+%define LDST_STRIDE    [rsp +  24] ; dst_stride, added to d8 after a row
+%define LWIDTH         [rsp +  32] ; width, read back as a dword
+%define LHEIGHT        [rsp +  40] ; height, read back as a dword
+
+;The first six integer or pointer arguments are passed in registers
+; RDI, RSI, RDX, RCX, R8, and R9
+
+;int
+;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+;                                  uint8_t *d8, int dst_stride,
+;                                  int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_amd64_sse2
+    push rbx                    ; only callee-saved register used
+    sub rsp, 48                 ; local vars, 48 bytes
+    ; SysV passes an int in the low 32 bits only, upper bits are undefined
+    movsxd rsi, esi             ; src_stride as 64 bit for pointer math
+    movsxd rcx, ecx             ; dst_stride as 64 bit for pointer math
+    mov LS8, rdi                ; s8
+    mov LSRC_STRIDE, rsi        ; src_stride
+    mov LD8, rdx                ; d8
+    mov LDST_STRIDE, rcx        ; dst_stride
+    mov LWIDTH, r8              ; width, only the low dword is read
+    mov LHEIGHT, r9             ; height, only the low dword is read
+
+    pxor xmm7, xmm7             ; xmm7 = 0, high half for packuswb
+
+    mov ebx, LHEIGHT            ; ebx = rows left, must be > 0
+
+row_loop1:
+    mov rsi, LS8                ; rsi = current source row
+    mov rdi, LD8                ; rdi = current Y row
+
+    mov ecx, LWIDTH             ; ecx = width
+    shr ecx, 3                  ; doing 8 pixels at a time, must be > 0
+
+loop1:
+    movdqu xmm0, [rsi]          ; pixels 0-3, 16 bytes, may be unaligned
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue = pixel & 255
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green = (pixel >> 8) & 255
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red = (pixel >> 16) & 255
+    movdqa xmm4, xmm0           ; alpha
+    psrld xmm4, 24              ; alpha
+    pand xmm4, [lsym(cd255)]    ; alpha = pixel >> 24
+
+    movdqu xmm0, [rsi + 16]     ; pixels 4-7, 16 bytes
+    movdqa xmm5, xmm0           ; alpha
+    psrld xmm5, 24              ; alpha
+    pand xmm5, [lsym(cd255)]    ; alpha
+    packssdw xmm4, xmm5         ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [rdi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa
+    movdqa xmm4, xmm0           ; blue, xmm4 is free again
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw29)]   ; 29 * B
+    pmullw xmm5, [lsym(cw150)]  ; 150 * G
+    pmullw xmm6, [lsym(cw77)]   ; 77 * R
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6            ; sum <= 256 * 255, fits an unsigned word
+    psrlw xmm4, 8               ; logical shift, sum can exceed 32767
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [rdi], xmm4            ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]  ; 128 * B
+    pmullw xmm5, [lsym(cw85)]   ; 85 * G
+    pmullw xmm6, [lsym(cw43)]   ; 43 * R
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6            ; signed word, magnitude <= 128 * 255
+    psraw xmm4, 8               ; arithmetic shift keeps the sign
+    paddw xmm4, [lsym(cw128)]   ; + 128
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [rdi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu
+
+    ; _V = ((128 * _R - 107 * _G -  21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red, accumulator this time
+    pmullw xmm4, [lsym(cw128)]  ; 128 * R
+    pmullw xmm5, [lsym(cw107)]  ; 107 * G
+    pmullw xmm6, [lsym(cw21)]   ; 21 * B
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6            ; signed word, magnitude <= 128 * 255
+    psraw xmm4, 8               ; arithmetic shift keeps the sign
+    paddw xmm4, [lsym(cw128)]   ; + 128
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [rdi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv
+
+    ; move right
+    lea rsi, [rsi + 32]         ; 8 source pixels, 4 bytes each
+    lea rdi, [rdi + 8]          ; 8 bytes in each output plane
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov rax, LS8                ; s8
+    add rax, LSRC_STRIDE        ; s8 += src_stride
+    mov LS8, rax
+
+    ; update d8
+    mov rax, LD8                ; d8
+    add rax, LDST_STRIDE        ; d8 += dst_stride
+    mov LD8, rax
+
+    dec ebx
+    jnz row_loop1
+
+    xor eax, eax                ; return value 0
+    add rsp, 48                 ; local vars, 48 bytes
+    pop rbx
+    ret
+END_OF_FILE
diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h
index ae38c53b..3b54e2b2 100644
--- a/module/amd64/funcs_amd64.h
+++ b/module/amd64/funcs_amd64.h
@@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride,
                                 uint8_t *d8_y, int dst_stride_y,
                                 uint8_t *d8_uv, int dst_stride_uv,
                                 int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
+                                  uint8_t *d8, int dst_stride,
+                                  int width, int height);
 
 #endif
 
diff --git a/module/rdp.h b/module/rdp.h
index 844db424..f9e4ac61 100644
--- a/module/rdp.h
+++ b/module/rdp.h
@@ -297,6 +297,7 @@ struct _rdpRec
 
     copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
     copy_box_dst2_proc a8r8g8b8_to_nv12_box;
+    copy_box_proc a8r8g8b8_to_yuvalp_box;
 
     /* multimon */
     struct monitor_info minfo[16]; /* client monitor data */
diff --git a/module/rdpCapture.c b/module/rdpCapture.c
index 334a4880..893917ce 100644
--- a/module/rdpCapture.c
+++ b/module/rdpCapture.c
@@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay,
 /* 19595  38470   7471
   -11071 -21736  32807
    32756 -27429  -5327 */
-static int
-rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
-                              const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              BoxPtr rects, int num_rects)
+int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height)
 {
-    const uint8_t *s8;
-    uint8_t *d8;
     uint8_t *yptr;
     uint8_t *uptr;
     uint8_t *vptr;
     uint8_t *aptr;
     const uint32_t *s32;
-    int index;
     int jndex;
     int kndex;
-    int width;
-    int height;
     uint32_t pixel;
     uint8_t a;
     int r;
@@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
     int y;
     int u;
     int v;
+
+    for (jndex = 0; jndex < height; jndex++)
+    {
+        s32 = (const uint32_t *) s8;
+        yptr = d8;
+        uptr = yptr + 64 * 64;
+        vptr = uptr + 64 * 64;
+        aptr = vptr + 64 * 64;
+        kndex = 0;
+        while (kndex < width)
+        {
+            pixel = *(s32++);
+            RGB_SPLIT(a, r, g, b, pixel);
+            y = (r *  19595 + g *  38470 + b *   7471) >> 16;
+            u = (r * -11071 + g * -21736 + b *  32807) >> 16;
+            v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
+            u = u + 128;
+            v = v + 128;
+            y = RDPCLAMP(y, 0, UCHAR_MAX);
+            u = RDPCLAMP(u, 0, UCHAR_MAX);
+            v = RDPCLAMP(v, 0, UCHAR_MAX);
+            *(yptr++) = y;
+            *(uptr++) = u;
+            *(vptr++) = v;
+            *(aptr++) = a;
+            kndex++;
+        }
+        d8 += dst_stride;
+        s8 += src_stride;
+    }
+    return 0;
+}
+
+/******************************************************************************/
+static int
+rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay,
+                              const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              BoxPtr rects, int num_rects)
+{
+    const uint8_t *s8;
+    uint8_t *d8;
+    int index;
+    int width;
+    int height;
     BoxPtr box;
 
     dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8);
@@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
         d8 += box->x1 - ax;
         width = box->x2 - box->x1;
         height = box->y2 - box->y1;
-        for (jndex = 0; jndex < height; jndex++)
-        {
-            s32 = (const uint32_t *) s8;
-            yptr = d8;
-            uptr = yptr + 64 * 64;
-            vptr = uptr + 64 * 64;
-            aptr = vptr + 64 * 64;
-            kndex = 0;
-            while (kndex < width)
-            {
-                pixel = *(s32++);
-                RGB_SPLIT(a, r, g, b, pixel);
-                y = (r *  19595 + g *  38470 + b *   7471) >> 16;
-                u = (r * -11071 + g * -21736 + b *  32807) >> 16;
-                v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
-                u = u + 128;
-                v = v + 128;
-                y = RDPCLAMP(y, 0, UCHAR_MAX);
-                u = RDPCLAMP(u, 0, UCHAR_MAX);
-                v = RDPCLAMP(v, 0, UCHAR_MAX);
-                *(yptr++) = y;
-                *(uptr++) = u;
-                *(vptr++) = v;
-                *(aptr++) = a;
-                kndex++;
-            }
-            d8 += 64;
-            s8 += src_stride;
-        }
+        clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride,
+                                               d8, 64,
+                                               width, height);
     }
     return 0;
 }
@@ -946,7 +959,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                     rects = REGION_RECTS(&tile_reg);
                     num_rects = REGION_NUM_RECTS(&tile_reg);
                     crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp);
-                    rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                    rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                                   src, src_stride,
                                                   dst, dst_stride,
                                                   rects, num_rects);
@@ -975,7 +988,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
                     /* lazily only do this if hash wasn't identical */
                     if (rcode != rgnPART)
                     {
-                        rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
+                        rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
                                 src, src_stride,
                                 dst, dst_stride,
                                 &rect, 1);
diff --git a/module/rdpCapture.h b/module/rdpCapture.h
index c4dd04a6..5bf3fc76 100644
--- a/module/rdpCapture.h
+++ b/module/rdpCapture.h
@@ -49,5 +49,9 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride,
                      uint8_t *d8_y, int dst_stride_y,
                      uint8_t *d8_uv, int dst_stride_uv,
                      int width, int height);
+extern _X_EXPORT int
+a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
+                       uint8_t *d8, int dst_stride,
+                       int width, int height);
 
 #endif
diff --git a/module/rdpSimd.c b/module/rdpSimd.c
index 49a3653e..05dd6a03 100644
--- a/module/rdpSimd.c
+++ b/module/rdpSimd.c
@@ -62,6 +62,90 @@ int g_simd_use_accel = 1;
 #define LLOGLN(_level, _args) \
     do { if (_level < LOG_LEVEL) { ErrorF _args ; ErrorF("\n"); } } while (0)
 
+#if SIMD_USE_ACCEL
+
+#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
+/*****************************************************************************/
+int
+a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                       uint8_t *d8, int dst_stride,
+                                       int width, int height)
+{
+    /* the SSE2 routine needs a width that is a multiple of 8, so it gets
+       the leading columns and the C routine converts what is left */
+    const int simd_width = width & ~7;
+    const int tail_width = width - simd_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_yuvalp_box_amd64_sse2(s8, src_stride,
+                                               d8, dst_stride,
+                                               simd_width, height);
+        if (rv != 0)
+        {
+            return rv;
+        }
+    }
+    if (tail_width > 0)
+    {
+        /* skip the columns already done, 4 bytes per pixel in the source
+           and 1 byte per pixel in the destination */
+        return a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                      d8 + simd_width, dst_stride,
+                                      tail_width, height);
+    }
+    return 0;
+}
+#endif
+
+#if defined(__x86__) || defined(_M_IX86) || defined(__i386__)
+/*****************************************************************************/
+int
+a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                     uint8_t *d8, int dst_stride,
+                                     int width, int height)
+{
+    /* the SSE2 routine needs a width that is a multiple of 8, so it gets
+       the leading columns and the C routine converts what is left */
+    const int simd_width = width & ~7;
+    const int tail_width = width - simd_width;
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0; /* nothing to convert */
+    }
+
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_yuvalp_box_x86_sse2(s8, src_stride,
+                                             d8, dst_stride,
+                                             simd_width, height);
+        if (rv != 0)
+        {
+            return rv;
+        }
+    }
+    if (tail_width > 0)
+    {
+        /* skip the columns already done, 4 bytes per pixel in the source
+           and 1 byte per pixel in the destination */
+        return a8r8g8b8_to_yuvalp_box(s8 + simd_width * 4, src_stride,
+                                      d8 + simd_width, dst_stride,
+                                      tail_width, height);
+    }
+    return 0;
+}
+#endif
+
+#endif
+
 /*****************************************************************************/
 Bool
 rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
@@ -77,6 +161,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
     dev->uyvy_to_rgb32 = UYVY_to_RGB32;
     dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box;
     dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box;
+    dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box;
 #if SIMD_USE_ACCEL
     if (g_simd_use_accel)
     {
@@ -93,6 +178,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2;
             dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned"));
         }
 #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
@@ -108,6 +194,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2;
             dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2;
+            dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned"));
         }
 #endif
diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am
index ed106863..9539f8c0 100644
--- a/module/x86/Makefile.am
+++ b/module/x86/Makefile.am
@@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_I386
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \
   a8r8g8b8_to_nv12_box_x86_sse2.asm \
+  a8r8g8b8_to_yuvalp_box_x86_sse2.asm \
   cpuid_x86.asm \
   i420_to_rgb32_x86_sse2.asm \
   uyvy_to_rgb32_x86_sse2.asm \
diff --git a/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
new file mode 100644
index 00000000..cec02043
--- /dev/null
+++ b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm
@@ -0,0 +1,171 @@
+;
+;Copyright 2024 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to YUVALP
+;x86 SSE2
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width must be multiple of 8 and > 0
+;   height must be > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255       ; per dword mask, keeps the low byte
+    cw128  times 8 dw 128       ; 128 * B in U, 128 * R in V, +128 bias
+    cw77   times 8 dw 77        ; Y weight of red
+    cw150  times 8 dw 150       ; Y weight of green
+    cw29   times 8 dw 29        ; Y weight of blue
+    cw43   times 8 dw 43        ; U weight of red, subtracted
+    cw85   times 8 dw 85        ; U weight of green, subtracted
+    cw107  times 8 dw 107       ; V weight of green, subtracted
+    cw21   times 8 dw 21        ; V weight of blue, subtracted
+; NOTE(review): used as SSE2 memory operands, confirm 16 byte alignment
+%define LS8            [esp + 20]   ; s8, updated in place after each row
+%define LSRC_STRIDE    [esp + 24]   ; src_stride, added to s8 after a row
+%define LD8            [esp + 28]   ; d8, updated in place after each row
+%define LDST_STRIDE    [esp + 32]   ; dst_stride, added to d8 after a row
+%define LWIDTH         [esp + 36]   ; width
+%define LHEIGHT        [esp + 40]   ; height
+
+;int
+;a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+;                                uint8_t *d8, int dst_stride,
+;                                int width, int height);
+PROC a8r8g8b8_to_yuvalp_box_x86_sse2
+    push ebx
+    RETRIEVE_RODATA
+    push esi
+    push edi
+    push ebp
+    ; 4 pushes + return address = 20 bytes, so arg 1 is at [esp + 20]
+    pxor xmm7, xmm7             ; xmm7 = 0, high half for packuswb
+
+    mov ebp, LHEIGHT            ; ebp = rows left, must be > 0
+
+row_loop1:
+    mov esi, LS8                ; esi = current source row
+    mov edi, LD8                ; edi = current Y row
+
+    mov ecx, LWIDTH             ; ecx = width
+    shr ecx, 3                  ; doing 8 pixels at a time, must be > 0
+
+loop1:
+    movdqu xmm0, [esi]          ; pixels 0-3, 16 bytes, may be unaligned
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue = pixel & 255
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green = (pixel >> 8) & 255
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red = (pixel >> 16) & 255
+    movdqa xmm4, xmm0           ; alpha
+    psrld xmm4, 24              ; alpha
+    pand xmm4, [lsym(cd255)]    ; alpha = pixel >> 24
+
+    movdqu xmm0, [esi + 16]     ; pixels 4-7, 16 bytes
+    movdqa xmm5, xmm0           ; alpha
+    psrld xmm5, 24              ; alpha
+    pand xmm5, [lsym(cd255)]    ; alpha
+    packssdw xmm4, xmm5         ; xmm4 = 8 alphas
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [edi + 3 * 64 * 64], xmm4  ; out 8 bytes aaaaaaaa
+    movdqa xmm4, xmm0           ; blue, xmm4 is free again
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (77 * _R + 150 * _G +  29 * _B) >> 8;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw29)]   ; 29 * B
+    pmullw xmm5, [lsym(cw150)]  ; 150 * G
+    pmullw xmm6, [lsym(cw77)]   ; 77 * R
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6            ; sum <= 256 * 255, fits an unsigned word
+    psrlw xmm4, 8               ; logical shift, sum can exceed 32767
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [edi], xmm4            ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-43 * _R -  85 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]  ; 128 * B
+    pmullw xmm5, [lsym(cw85)]   ; 85 * G
+    pmullw xmm6, [lsym(cw43)]   ; 43 * R
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6            ; signed word, magnitude <= 128 * 255
+    psraw xmm4, 8               ; arithmetic shift keeps the sign
+    paddw xmm4, [lsym(cw128)]   ; + 128
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [edi + 1 * 64 * 64], xmm4  ; out 8 bytes uuuuuuuu
+
+    ; _V = ((128 * _R - 107 * _G -  21 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red, accumulator this time
+    pmullw xmm4, [lsym(cw128)]  ; 128 * R
+    pmullw xmm5, [lsym(cw107)]  ; 107 * G
+    pmullw xmm6, [lsym(cw21)]   ; 21 * B
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6            ; signed word, magnitude <= 128 * 255
+    psraw xmm4, 8               ; arithmetic shift keeps the sign
+    paddw xmm4, [lsym(cw128)]   ; + 128
+    packuswb xmm4, xmm7         ; 8 words -> 8 bytes in the low half
+    movq [edi + 2 * 64 * 64], xmm4  ; out 8 bytes vvvvvvvv
+
+    ; move right
+    lea esi, [esi + 32]         ; 8 source pixels, 4 bytes each
+    lea edi, [edi + 8]          ; 8 bytes in each output plane
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov eax, LS8                ; s8
+    add eax, LSRC_STRIDE        ; s8 += src_stride
+    mov LS8, eax                ; kept in the caller's argument slot
+
+    ; update d8
+    mov eax, LD8                ; d8
+    add eax, LDST_STRIDE        ; d8 += dst_stride
+    mov LD8, eax                ; kept in the caller's argument slot
+
+    dec ebp
+    jnz row_loop1
+
+    mov eax, 0                  ; return value
+    pop ebp                     ; restore in reverse push order
+    pop edi
+    pop esi
+    pop ebx
+    ret
+END_OF_FILE
diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h
index c70cc8cf..d1f3357d 100644
--- a/module/x86/funcs_x86.h
+++ b/module/x86/funcs_x86.h
@@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride,
                               uint8_t *d8_y, int dst_stride_y,
                               uint8_t *d8_uv, int dst_stride_uv,
                               int width, int height);
+int
+a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
+                                uint8_t *d8, int dst_stride,
+                                int width, int height);
 
 #endif