From 52ad69474e41b3f037c708f803b7eaa46228a461 Mon Sep 17 00:00:00 2001 From: mirabilos Date: Thu, 26 Jan 2017 21:42:07 +0100 Subject: [PATCH] Convert i386 asm code to ELF PIC --- .../rfxcodec_encode_dwt_shift_x86_sse2.asm | 173 ++++++++++-------- .../rfxcodec_encode_dwt_shift_x86_sse41.asm | 173 ++++++++++-------- 2 files changed, 198 insertions(+), 148 deletions(-) diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm index 23c0fd8..1e0be46 100644 --- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm @@ -1,5 +1,6 @@ ; ;Copyright 2016 Jay Sorg +;Copyright 2017 mirabilos ; ;Permission to use, copy, modify, distribute, and sell this software and its ;documentation for any purpose is hereby granted without fee, provided that @@ -21,6 +22,29 @@ %ifidn __OUTPUT_FORMAT__,elf section .note.GNU-stack noalloc noexec nowrite progbits +%ifdef PIC +section .text +extern _GLOBAL_OFFSET_TABLE_ +..@get_GOT: ; thunk: hands the caller's return address back in ebx + mov ebx,[esp] ; [esp] is the return address pushed by the call + ret +%define lsym(name) ebx + name wrt ..gotoff +%macro get_GOT 0 + call ..@get_GOT +%%getgot: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%getgot wrt ..gotpc ; ebx holds the address of this add (the return address fetched by the thunk), so this label is what must be subtracted +%endmacro +%else +%endif +%else +; not ELF +%ifdef PIC +%error Position-Independent Code is currently only supported for ELF +%endif +%endif +%ifndef lsym +%define lsym(name) name +%macro get_GOT 0 +%endmacro %endif section .data @@ -68,8 +92,8 @@ loop1a: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -79,8 +103,8 @@ loop1a: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -98,8 +122,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, 
[lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -260,8 +284,8 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -271,8 +295,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -290,8 +314,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -341,8 +365,8 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -352,8 +376,8 @@ loop1c: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -371,8 +395,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -436,8 +460,8 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -447,8 +471,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -466,8 +490,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand 
xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -514,8 +538,8 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -525,8 +549,8 @@ loop1c1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -544,8 +568,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -703,8 +727,8 @@ loop1e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -714,8 +738,8 @@ loop1e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -733,8 +757,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -787,8 +811,8 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -798,8 +822,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -817,8 +841,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - 
pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -870,8 +894,8 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -881,8 +905,8 @@ loop2e: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -900,8 +924,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -965,8 +989,8 @@ loop1e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -976,8 +1000,8 @@ loop1e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -995,8 +1019,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1046,8 +1070,8 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1057,8 +1081,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1076,8 +1100,8 @@ loop2e1: movd xmm5, 
eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1126,8 +1150,8 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] pslld xmm1, 16 pslld xmm2, 16 psrad xmm1, 16 @@ -1137,8 +1161,8 @@ loop2e1: movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] pslld xmm2, 16 pslld xmm3, 16 psrad xmm2, 16 @@ -1156,8 +1180,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] pslld xmm3, 16 pslld xmm4, 16 psrad xmm3, 16 @@ -1220,9 +1244,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [cw128] - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1254,8 +1278,8 @@ loop2f: movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1287,7 +1311,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [esi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1328,7 +1352,7 @@ set_quants_hi: movd xmm1, eax movdqa LHI_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LHI_ADD, xmm1 @@ -1339,7 +1363,7 @@ set_quants_lo: movd xmm1, eax movdqa LLO_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LLO_ADD, xmm1 @@ 
-1375,6 +1399,7 @@ PROC _rfxcodec_encode_dwt_shift_x86_sse2 movdqu [esp], xmm0 ; save registers push ebx + get_GOT push esi push edi push ebp diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm index d4a8778..dd7eeaf 100644 --- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm @@ -1,5 +1,6 @@ ; ;Copyright 2016 Jay Sorg +;Copyright 2017 mirabilos ; ;Permission to use, copy, modify, distribute, and sell this software and its ;documentation for any purpose is hereby granted without fee, provided that @@ -21,6 +22,29 @@ %ifidn __OUTPUT_FORMAT__,elf section .note.GNU-stack noalloc noexec nowrite progbits +%ifdef PIC +section .text +extern _GLOBAL_OFFSET_TABLE_ +..@get_GOT: ; thunk: hands the caller's return address back in ebx + mov ebx,[esp] ; [esp] is the return address pushed by the call + ret +%define lsym(name) ebx + name wrt ..gotoff +%macro get_GOT 0 + call ..@get_GOT +%%getgot: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%getgot wrt ..gotpc ; ebx holds the address of this add (the return address fetched by the thunk), so this label is what must be subtracted +%endmacro +%else +%endif +%else +; not ELF +%ifdef PIC +%error Position-Independent Code is currently only supported for ELF +%endif +%endif +%ifndef lsym +%define lsym(name) name +%macro get_GOT 0 +%endmacro %endif section .data @@ -68,15 +92,15 @@ loop1a: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -90,8 +114,8 @@ loop1a: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -248,15 +272,15 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, 
[cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -270,8 +294,8 @@ loop1c: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -317,15 +341,15 @@ loop1c: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -339,8 +363,8 @@ loop1c: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -400,15 +424,15 @@ loop1c1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -422,8 +446,8 @@ loop1c1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -466,15 +490,15 @@ loop1c1: movdqa xmm2, [esi + 
16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -488,8 +512,8 @@ loop1c1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -643,15 +667,15 @@ loop1e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -665,8 +689,8 @@ loop1e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -715,15 +739,15 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -737,8 +761,8 @@ loop2e: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, 
xmm1 movdqa xmm5, xmm2 @@ -786,15 +810,15 @@ loop2e: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -808,8 +832,8 @@ loop2e: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -869,15 +893,15 @@ loop1e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -891,8 +915,8 @@ loop1e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -938,15 +962,15 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -960,8 +984,8 @@ loop2e1: movd xmm5, eax pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, 
[lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1006,15 +1030,15 @@ loop2e1: movdqa xmm2, [esi + 16] movdqa xmm6, xmm1 movdqa xmm7, xmm2 - pand xmm1, [cdFFFF] - pand xmm2, [cdFFFF] + pand xmm1, [lsym(cdFFFF)] + pand xmm2, [lsym(cdFFFF)] packusdw xmm1, xmm2 movdqa xmm2, xmm6 ; src[2n + 1] movdqa xmm3, xmm7 psrldq xmm2, 2 psrldq xmm3, 2 - pand xmm2, [cdFFFF] - pand xmm3, [cdFFFF] + pand xmm2, [lsym(cdFFFF)] + pand xmm3, [lsym(cdFFFF)] packusdw xmm2, xmm3 movdqa xmm3, xmm6 ; src[2n + 2] movdqa xmm4, xmm7 @@ -1028,8 +1052,8 @@ loop2e1: psrldq xmm5, 12 pslldq xmm5, 12 por xmm4, xmm5 - pand xmm3, [cdFFFF] - pand xmm4, [cdFFFF] + pand xmm3, [lsym(cdFFFF)] + pand xmm4, [lsym(cdFFFF)] packusdw xmm3, xmm4 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1088,9 +1112,9 @@ loop1f: punpcklbw xmm1, xmm0 punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm1, [cw128] - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm1, [lsym(cw128)] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm1, 5 psllw xmm2, 5 psllw xmm3, 5 @@ -1122,8 +1146,8 @@ loop2f: movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] punpcklbw xmm2, xmm0 punpcklbw xmm3, xmm0 - psubw xmm2, [cw128] - psubw xmm3, [cw128] + psubw xmm2, [lsym(cw128)] + psubw xmm3, [lsym(cw128)] psllw xmm2, 5 psllw xmm3, 5 movdqa xmm4, xmm1 @@ -1155,7 +1179,7 @@ loop2f: movdqa xmm1, xmm3 ; src[2n] movq xmm2, [esi + 64 * 1] ; src[2n + 1] punpcklbw xmm2, xmm0 - psubw xmm2, [cw128] + psubw xmm2, [lsym(cw128)] psllw xmm2, 5 movdqa xmm4, xmm1 movdqa xmm5, xmm2 @@ -1196,7 +1220,7 @@ set_quants_hi: movd xmm1, eax movdqa LHI_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LHI_ADD, xmm1 @@ -1207,7 +1231,7 @@ set_quants_lo: movd xmm1, eax movdqa LLO_SFT, xmm1 imul eax, 16 - lea edx, [cwa0] + lea edx, [lsym(cwa0)] add edx, eax movdqa xmm1, [edx] movdqa LLO_ADD, xmm1 @@ -1243,6 +1267,7 @@ PROC _rfxcodec_encode_dwt_shift_x86_sse41 movdqu 
[esp], xmm0 ; save registers push ebx + get_GOT push esi push edi push ebp