Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xorgxrdp capture changes for GFX h264 #312

Merged
merged 5 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions module/amd64/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64
ASMSOURCES = \
a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \
a8r8g8b8_to_nv12_box_amd64_sse2.asm \
a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm \
a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \
cpuid_amd64.asm \
i420_to_rgb32_amd64_sse2.asm \
Expand Down
304 changes: 304 additions & 0 deletions module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
;
;Copyright 2015 Jay Sorg
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
;the above copyright notice appear in all copies and that both that
;copyright notice and this permission notice appear in supporting
;documentation.
;
;The above copyright notice and this permission notice shall be included in
;all copies or substantial portions of the Software.
;
;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;
;ARGB to NV12 709 full range
;amd64 SSE2
;
; notes
; address s8 should be aligned on 16 bytes, will be slower if not
; width should be multiple of 8 and > 0
; height should be even and > 0

%include "common.asm"

PREPARE_RODATA
; dword lane mask: isolates the low byte of each 32-bit pixel lane
; (one color channel after the psrld shifts below)
cd255 times 4 dd 255

; word lane mask (byte isolation when unpacking saved U/V rows)
cw255 times 8 dw 255
; 128 serves double duty: the B coefficient in U / R coefficient in V,
; and the +128 chroma bias added after the >>8
cw128 times 8 dw 128
; BT.709 full-range RGB->YCbCr coefficients scaled by 256, matching the
; formulas used in the code:
;   Y = ( 54*R + 183*G +  18*B) >> 8
;   U = ((-29*R -  99*G + 128*B) >> 8) + 128
;   V = ((128*R - 116*G -  12*B) >> 8) + 128
cw54 times 8 dw 54
cw183 times 8 dw 183
cw18 times 8 dw 18
cw29 times 8 dw 29
cw99 times 8 dw 99
cw116 times 8 dw 116
cw12 times 8 dw 12
; rounding term for the 2x2 chroma average: (a + b + c + d + 2) >> 2
cw2 times 8 dw 2

; Stack frame after "push rbx; push rbp; sub rsp, 80":
;   [rsp +  0 .. rsp + 79]  local spill slots (defined below)
;   [rsp + 80] saved rbp, [rsp + 88] saved rbx, [rsp + 96] return addr
;   [rsp +104] 7th arg (width), [rsp +112] 8th arg (height)
%define LS8 [rsp + 0] ; s8
%define LSRC_STRIDE [rsp + 8] ; src_stride
%define LD8_Y [rsp + 16] ; d8_y
%define LDST_Y_STRIDE [rsp + 24] ; dst_stride_y
%define LD8_UV [rsp + 32] ; d8_uv
%define LDST_UV_STRIDE [rsp + 40] ; dst_stride_uv
%define LU1 [rsp + 48] ; first line U, 8 bytes
%define LV1 [rsp + 56] ; first line V, 8 bytes
%define LU2 [rsp + 64] ; second line U, 8 bytes
%define LV2 [rsp + 72] ; second line V, 8 bytes

; stack-passed arguments (7th and 8th int args, SysV AMD64)
%define LWIDTH [rsp + 104] ; width
%define LHEIGHT [rsp + 112] ; height

;The first six integer or pointer arguments are passed in registers
; RDI, RSI, RDX, RCX, R8, and R9

;int
;a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const char *s8, int src_stride,
; char *d8_y, int dst_stride_y,
; char *d8_uv, int dst_stride_uv,
; int width, int height);
;-----------------------------------------------------------------------
; int
; a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const char *s8, int src_stride,
;                                       char *d8_y, int dst_stride_y,
;                                       char *d8_uv, int dst_stride_uv,
;                                       int width, int height);
;
; Convert a width x height box of ARGB pixels to NV12 (full-size Y
; plane, interleaved UV at half resolution in each dimension) using
; the BT.709 full-range formulas commented inline. Processes two
; source rows per outer iteration and 8 pixels per inner iteration;
; each 2x2 pixel block produces one rounded-average U and V sample.
;
; ABI:     SysV AMD64; width/height arrive on the stack (LWIDTH/LHEIGHT)
; Out:     rax = 0
; Clobber: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, mm1-mm4, flags
; Assumes: width multiple of 8 and > 0; height even and > 0
;
; Review fixes vs. the submitted version:
;  * the int stride args are sign-extended (movsxd) before being used
;    in 64-bit pointer arithmetic - the upper 32 bits of rsi/rcx/r9
;    are undefined for int arguments under the SysV ABI
;  * emms is executed before returning - the UV averaging uses MMX
;    registers, and leaving the x87 tag word in MMX state breaks any
;    later x87 floating point in the caller
PROC a8r8g8b8_to_nv12_709fr_box_amd64_sse2
    push rbx
    push rbp
    sub rsp, 80                ; local vars, 80 bytes

    mov LS8, rdi               ; s8
    movsxd rsi, esi            ; src_stride is int: sign-extend, upper
    mov LSRC_STRIDE, rsi       ; half of rsi is undefined per the ABI
    mov LD8_Y, rdx             ; d8_y
    movsxd rcx, ecx            ; dst_stride_y is int: sign-extend
    mov LDST_Y_STRIDE, rcx
    mov LD8_UV, r8             ; d8_uv
    movsxd r9, r9d             ; dst_stride_uv is int: sign-extend
    mov LDST_UV_STRIDE, r9

    pxor xmm7, xmm7            ; zero reg for packuswb high halves

    mov ebx, LHEIGHT           ; ebx = height
    shr ebx, 1                 ; doing 2 lines at a time

row_loop1:
    mov rsi, LS8               ; s8
    mov rdi, LD8_Y             ; d8_y
    mov rdx, LD8_UV            ; d8_uv

    mov ecx, LWIDTH            ; ecx = width
    shr ecx, 3                 ; doing 8 pixels at a time

loop1:
    ; first line
    ; deinterleave 8 ARGB pixels into 8 words each of B/G/R (xmm1-3)
    movdqu xmm0, [rsi]         ; 4 pixels, 16 bytes
    movdqa xmm1, xmm0          ; blue
    pand xmm1, [lsym(cd255)]   ; blue
    movdqa xmm2, xmm0          ; green
    psrld xmm2, 8              ; green
    pand xmm2, [lsym(cd255)]   ; green
    movdqa xmm3, xmm0          ; red
    psrld xmm3, 16             ; red
    pand xmm3, [lsym(cd255)]   ; red

    movdqu xmm0, [rsi + 16]    ; 4 pixels, 16 bytes
    movdqa xmm4, xmm0          ; blue
    pand xmm4, [lsym(cd255)]   ; blue
    movdqa xmm5, xmm0          ; green
    psrld xmm5, 8              ; green
    pand xmm5, [lsym(cd255)]   ; green
    movdqa xmm6, xmm0          ; red
    psrld xmm6, 16             ; red
    pand xmm6, [lsym(cd255)]   ; red

    packssdw xmm1, xmm4        ; xmm1 = 8 blues
    packssdw xmm2, xmm5        ; xmm2 = 8 greens
    packssdw xmm3, xmm6        ; xmm3 = 8 reds

    ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8);
    movdqa xmm4, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm6, xmm3          ; red
    pmullw xmm4, [lsym(cw18)]
    pmullw xmm5, [lsym(cw183)]
    pmullw xmm6, [lsym(cw54)]
    paddw xmm4, xmm5
    paddw xmm4, xmm6
    psrlw xmm4, 8
    packuswb xmm4, xmm7
    movq [rdi], xmm4           ; out 8 bytes yyyyyyyy

    ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128;
    movdqa xmm4, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm6, xmm3          ; red
    pmullw xmm4, [lsym(cw128)]
    pmullw xmm5, [lsym(cw99)]
    pmullw xmm6, [lsym(cw29)]
    psubw xmm4, xmm5
    psubw xmm4, xmm6
    psraw xmm4, 8              ; arithmetic: sum can be negative
    paddw xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq LU1, xmm4             ; save for later

    ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128;
    movdqa xmm6, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm4, xmm3          ; red
    pmullw xmm4, [lsym(cw128)]
    pmullw xmm5, [lsym(cw116)]
    pmullw xmm6, [lsym(cw12)]
    psubw xmm4, xmm5
    psubw xmm4, xmm6
    psraw xmm4, 8
    paddw xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq LV1, xmm4             ; save for later

    ; go down to second line
    add rsi, LSRC_STRIDE
    add rdi, LDST_Y_STRIDE

    ; second line - identical processing on the row below
    movdqu xmm0, [rsi]         ; 4 pixels, 16 bytes
    movdqa xmm1, xmm0          ; blue
    pand xmm1, [lsym(cd255)]   ; blue
    movdqa xmm2, xmm0          ; green
    psrld xmm2, 8              ; green
    pand xmm2, [lsym(cd255)]   ; green
    movdqa xmm3, xmm0          ; red
    psrld xmm3, 16             ; red
    pand xmm3, [lsym(cd255)]   ; red

    movdqu xmm0, [rsi + 16]    ; 4 pixels, 16 bytes
    movdqa xmm4, xmm0          ; blue
    pand xmm4, [lsym(cd255)]   ; blue
    movdqa xmm5, xmm0          ; green
    psrld xmm5, 8              ; green
    pand xmm5, [lsym(cd255)]   ; green
    movdqa xmm6, xmm0          ; red
    psrld xmm6, 16             ; red
    pand xmm6, [lsym(cd255)]   ; red

    packssdw xmm1, xmm4        ; xmm1 = 8 blues
    packssdw xmm2, xmm5        ; xmm2 = 8 greens
    packssdw xmm3, xmm6        ; xmm3 = 8 reds

    ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8);
    movdqa xmm4, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm6, xmm3          ; red
    pmullw xmm4, [lsym(cw18)]
    pmullw xmm5, [lsym(cw183)]
    pmullw xmm6, [lsym(cw54)]
    paddw xmm4, xmm5
    paddw xmm4, xmm6
    psrlw xmm4, 8
    packuswb xmm4, xmm7
    movq [rdi], xmm4           ; out 8 bytes yyyyyyyy

    ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128;
    movdqa xmm4, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm6, xmm3          ; red
    pmullw xmm4, [lsym(cw128)]
    pmullw xmm5, [lsym(cw99)]
    pmullw xmm6, [lsym(cw29)]
    psubw xmm4, xmm5
    psubw xmm4, xmm6
    psraw xmm4, 8
    paddw xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq LU2, xmm4             ; save for later

    ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128;
    movdqa xmm6, xmm1          ; blue
    movdqa xmm5, xmm2          ; green
    movdqa xmm4, xmm3          ; red
    pmullw xmm4, [lsym(cw128)]
    pmullw xmm5, [lsym(cw116)]
    pmullw xmm6, [lsym(cw12)]
    psubw xmm4, xmm5
    psubw xmm4, xmm6
    psraw xmm4, 8
    paddw xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq LV2, xmm4             ; save for later

    ; chroma downsample: average the 4 U (and 4 V) samples of each
    ; 2x2 block - horizontal pair from each saved row, then both rows
    movq mm1, LU1              ; u from first line
    movq mm3, mm1
    pand mm1, [lsym(cw255)]    ; even-column U
    psrlw mm3, 8
    pand mm3, [lsym(cw255)]    ; odd-column U
    paddw mm1, mm3             ; add
    movq mm2, LU2              ; u from second line
    movq mm3, mm2
    pand mm2, [lsym(cw255)]
    paddw mm1, mm2             ; add
    psrlw mm3, 8
    pand mm3, [lsym(cw255)]
    paddw mm1, mm3             ; add
    paddw mm1, [lsym(cw2)]     ; add 2 for rounding
    psrlw mm1, 2               ; div 4

    movq mm2, LV1              ; v from first line
    movq mm4, mm2
    pand mm2, [lsym(cw255)]
    psrlw mm4, 8
    pand mm4, [lsym(cw255)]
    paddw mm2, mm4             ; add
    movq mm3, LV2              ; v from second line
    movq mm4, mm3
    pand mm3, [lsym(cw255)]
    paddw mm2, mm3             ; add
    psrlw mm4, 8
    pand mm4, [lsym(cw255)]
    paddw mm2, mm4             ; add
    paddw mm2, [lsym(cw2)]     ; add 2 for rounding
    psrlw mm2, 2               ; div 4

    packuswb mm1, mm1
    packuswb mm2, mm2

    punpcklbw mm1, mm2         ; uv
    movq [rdx], mm1            ; out 8 bytes uvuvuvuv

    ; go up to first line
    sub rsi, LSRC_STRIDE
    sub rdi, LDST_Y_STRIDE

    ; move right
    lea rsi, [rsi + 32]
    lea rdi, [rdi + 8]
    lea rdx, [rdx + 8]

    dec ecx
    jnz loop1

    ; update s8 (two source rows consumed)
    mov rax, LS8               ; s8
    add rax, LSRC_STRIDE       ; s8 += src_stride
    add rax, LSRC_STRIDE       ; s8 += src_stride
    mov LS8, rax

    ; update d8_y (two Y rows written)
    mov rax, LD8_Y             ; d8_y
    add rax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
    add rax, LDST_Y_STRIDE     ; d8_y += dst_stride_y
    mov LD8_Y, rax

    ; update d8_uv (one UV row per two source rows)
    mov rax, LD8_UV            ; d8_uv
    add rax, LDST_UV_STRIDE    ; d8_uv += dst_stride_uv
    mov LD8_UV, rax

    dec ebx
    jnz row_loop1

    emms                       ; MMX was used: empty the x87 tag word
                               ; so callers can use x87 floating point
    mov rax, 0                 ; return value
    add rsp, 80                ; local vars, 80 bytes
    pop rbp
    pop rbx
    ret
END_OF_FILE
5 changes: 5 additions & 0 deletions module/amd64/funcs_amd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride,
uint8_t *d8_uv, int dst_stride_uv,
int width, int height);
int
a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride,
uint8_t *d8_y, int dst_stride_y,
uint8_t *d8_uv, int dst_stride_uv,
int width, int height);
int
a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
uint8_t *d8, int dst_stride,
int width, int height);
Expand Down
1 change: 1 addition & 0 deletions module/rdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ struct _rdpRec

copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
copy_box_dst2_proc a8r8g8b8_to_nv12_box;
copy_box_dst2_proc a8r8g8b8_to_nv12_709fr_box;
copy_box_proc a8r8g8b8_to_yuvalp_box;

/* multimon */
Expand Down
Loading