impls: fetch from upstream glibc
dmoj-build committed Mar 15, 2024
commit 7c07e5c (1 parent: aca2e28)
Showing 12 changed files with 420 additions and 3,152 deletions (only the first six file diffs were rendered below).

CMakeLists.txt: 1 addition & 0 deletions

@@ -24,6 +24,7 @@ add_library(memcpy STATIC
impls/memmove-avx-unaligned-erms.s
impls/memmove-avx512-no-vzeroupper.s
impls/memmove-avx512-unaligned-erms.s
+ impls/memmove-erms.s
impls/memmove-evex-unaligned-erms.s
impls/memmove-sse2-unaligned-erms.s
impls/memmove-ssse3-back.s

impls/memmove-avx-unaligned-erms-rtm.s: 21 additions & 19 deletions

@@ -22,7 +22,7 @@ __memmove_avx_unaligned_rtm:
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi,%rdx)

- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.globl __mempcpy_avx_unaligned_erms_rtm
__mempcpy_avx_unaligned_erms_rtm:
@@ -45,9 +45,9 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu -32(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi, %rdx)
- .Lreturn:
+ .Lreturn_vzeroupper:

- xtest; jz 1f; vzeroall; ret; 1: vzeroupper; ret
+ xtest; jnz 1f; vzeroupper; ret; 1: vzeroall; ret
.p2align 4
.Lless_vec:
cmpl $16, %edx
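
Both xtest sequences in the hunk above implement the same conditional cleanup: inside an RTM transaction, vzeroupper is reported to abort the transaction on some microarchitectures, so the RTM variants use vzeroall there; the rewrite renames the label and flips the branch so the common, non-transactional path falls through to vzeroupper (the non-RTM files below keep a plain "vzeroupper; ret"). A rough C rendering with the RTM/AVX intrinsics, as a sketch only (the helper name is ours; build with -mavx -mrtm):

    #include <immintrin.h>

    /* Sketch of the .Lreturn_vzeroupper tail in the RTM variants.
       _xtest() returns nonzero while an RTM/HLE transaction is active. */
    static inline void cond_vzeroupper(void)
    {
        if (_xtest())
            _mm256_zeroall();     /* vzeroupper could abort the transaction */
        else
            _mm256_zeroupper();   /* normal AVX-to-SSE transition cleanup */
    }
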
@@ -105,7 +105,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, 32(%rdi)
vmovdqu %ymm2, -32(%rdi, %rdx)
vmovdqu %ymm3, -(32 * 2)(%rdi, %rdx)
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 4

@@ -136,7 +136,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm5, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm6, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm7, -(32 * 4)(%rdi, %rdx)
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 4,, 4
.Lmore_8x_vec:
@@ -205,7 +205,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm0, (%rcx)

.Lnop_backward:
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 4,, 8
.Lmore_8x_vec_backward_check_nop:
@@ -249,7 +249,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm7, (32 * 3)(%rdi)

vmovdqu %ymm8, -32(%rdx, %rdi)
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 5,, 16

@@ -277,7 +277,7 @@ __memmove_avx_unaligned_erms_rtm:

vmovdqu %ymm1, 32(%r8)

- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 4,, 12
.Lmovsb:
@@ -293,7 +293,8 @@ __memmove_avx_unaligned_erms_rtm:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

- testl $(1 << 0), __x86_string_control(%rip)
+ testb $(1 << 0), __x86_string_control(%rip)
+
jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
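
The testl to testb change, repeated in every variant below, is most plausibly an encoding-size tweak: only bit 0 of __x86_string_control is examined, and testb takes an 8-bit immediate where testl needs a 32-bit one; on little-endian x86 the byte test reads the same bit. A hedged C picture of the check (the macro name follows upstream glibc; the variable's width here is an assumption, since only bit 0 of the low byte matters):

    /* Assumed declaration; glibc may declare a wider type. */
    extern unsigned char __x86_string_control;

    /* Bit 0: avoid "rep movsb" when src and dst are a short distance apart. */
    #define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)

    static inline int avoid_short_distance_rep_movsb(void)
    {
        return __x86_string_control
               & X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
    }
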
@@ -319,13 +320,15 @@ __memmove_avx_unaligned_erms_rtm:

vmovdqu %ymm1, 32(%r8)

- jmp .Lreturn
+ jmp .Lreturn_vzeroupper
.p2align 4,, 10

.Llarge_memcpy_2x_check:
- cmp __x86_rep_movsb_threshold(%rip), %rdx
- jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
+ mov __x86_shared_non_temporal_threshold(%rip), %r11
+ cmp %r11, %rdx
+ jb .Lmore_8x_vec_check
+
negq %rcx
cmpq %rcx, %rdx
@@ -349,17 +352,17 @@ __memmove_avx_unaligned_erms_rtm:
addq %r8, %rdx

notl %ecx
+ movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

- movq %rdx, %r10
- shrq $4, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ shlq $4, %r11
+ cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

- shrq $((12 + 1) - 4), %r10
+ shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
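
The reworked check above does two things: entry to the large-copy path now compares the size against __x86_shared_non_temporal_threshold (loaded once into %r11) instead of __x86_rep_movsb_threshold, and the 4x cutoff is computed as threshold << 4 in %r11 rather than size >> 4 in %r10, which frees %r10 to keep the full byte count for the loop-count shift (now 12 + 1 instead of (12 + 1) - 4, i.e. one iteration per two interleaved 4 KiB pages). In rough C, under our reading of the diff (names taken from it):

    #include <stddef.h>

    /* Sketch of the large-copy path selection after this change. */
    static int large_copy_path(size_t n, size_t nt_threshold)
    {
        if (n < nt_threshold)
            return 0;                 /* .Lmore_8x_vec_check: regular vector loop */
        if (n >= (nt_threshold << 4))
            return 4;                 /* .Llarge_memcpy_4x */
        /* .Llarge_memcpy_2x runs n >> (12 + 1) outer iterations, each
           covering two interleaved 4 KiB pages. */
        return 2;
    }
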
@@ -418,11 +421,10 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm3, -32(%rdi, %rdx)
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.p2align 4
.Llarge_memcpy_4x:
- movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

@@ -490,7 +492,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm3, -32(%rdi, %rdx)
- jmp .Lreturn
+ jmp .Lreturn_vzeroupper

.globl __memcpy_avx_unaligned_erms_rtm
.set __memcpy_avx_unaligned_erms_rtm, __memmove_avx_unaligned_erms_rtm

impls/memmove-avx-unaligned-erms.s: 11 additions & 9 deletions

@@ -45,7 +45,7 @@ __memmove_avx_unaligned_erms:
vmovdqu -32(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi, %rdx)
- .Lreturn:
+ .Lreturn_vzeroupper:

vzeroupper; ret
.p2align 4
@@ -293,7 +293,8 @@ __memmove_avx_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

- testl $(1 << 0), __x86_string_control(%rip)
+ testb $(1 << 0), __x86_string_control(%rip)
+
jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
@@ -323,9 +324,11 @@ __memmove_avx_unaligned_erms:
.p2align 4,, 10

.Llarge_memcpy_2x_check:
- cmp __x86_rep_movsb_threshold(%rip), %rdx
- jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
+ mov __x86_shared_non_temporal_threshold(%rip), %r11
+ cmp %r11, %rdx
+ jb .Lmore_8x_vec_check
+
negq %rcx
cmpq %rcx, %rdx
@@ -349,17 +352,17 @@ __memmove_avx_unaligned_erms:
addq %r8, %rdx

notl %ecx
+ movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

- movq %rdx, %r10
- shrq $4, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ shlq $4, %r11
+ cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

- shrq $((12 + 1) - 4), %r10
+ shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -422,7 +425,6 @@ __memmove_avx_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
- movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

impls/memmove-avx512-unaligned-erms.s: 11 additions & 9 deletions

@@ -45,7 +45,7 @@ __memmove_avx512_unaligned_erms:
vmovdqu64 -64(%rsi, %rdx), %zmm17
vmovdqu64 %zmm16, (%rdi)
vmovdqu64 %zmm17, -64(%rdi, %rdx)
- .Lreturn:
+ .Lreturn_vzeroupper:

; ret
.p2align 4,, 8
@@ -301,7 +301,8 @@ __memmove_avx512_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

- testl $(1 << 0), __x86_string_control(%rip)
+ testb $(1 << 0), __x86_string_control(%rip)
+
jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
@@ -326,9 +327,11 @@ __memmove_avx512_unaligned_erms:
.p2align 4,, 10

.Llarge_memcpy_2x_check:
- cmp __x86_rep_movsb_threshold(%rip), %rdx
- jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
+ mov __x86_shared_non_temporal_threshold(%rip), %r11
+ cmp %r11, %rdx
+ jb .Lmore_8x_vec_check
+
negq %rcx
cmpq %rcx, %rdx
@@ -346,17 +349,17 @@ __memmove_avx512_unaligned_erms:
addq %r8, %rdx

notl %ecx
+ movq %rdx, %r10
testl $(4096 - 64 * 8), %ecx
jz .Llarge_memcpy_4x

- movq %rdx, %r10
- shrq $4, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ shlq $4, %r11
+ cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

- shrq $((12 + 1) - 4), %r10
+ shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -419,7 +422,6 @@ __memmove_avx512_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
- movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

impls/memmove-erms.s: 38 additions & 0 deletions (new file)

@@ -0,0 +1,38 @@
.text
.globl __mempcpy_erms
__mempcpy_erms:
mov %rdi, %rax

test %rdx, %rdx
jz 2f
add %rdx, %rax
jmp .Lstart_movsb

.globl __memmove_erms
__memmove_erms:
movq %rdi, %rax

test %rdx, %rdx
jz 2f
.Lstart_movsb:
mov %rdx, %rcx
cmp %rsi, %rdi
jb 1f

je 2f
lea (%rsi,%rcx), %rdx
cmp %rdx, %rdi
jb .Lmovsb_backward
1:
rep movsb
2:
ret
.Lmovsb_backward:
leaq -1(%rdi,%rcx), %rdi
leaq -1(%rsi,%rcx), %rsi
std
rep movsb
cld
ret
.globl __memcpy_erms
.set __memcpy_erms, __memmove_erms
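
The new memmove-erms.s is the plain ERMS ("enhanced rep movsb") fallback: copy forward with rep movsb unless the destination starts inside the source range, in which case it copies backward with the direction flag set (std; rep movsb; cld); __mempcpy_erms differs only in returning dst + n, and __memcpy_erms is an alias. A C sketch of the same control flow, illustrative rather than the glibc source:

    #include <stddef.h>

    /* Mirrors __memmove_erms: forward rep movsb unless dst overlaps
       the source from above, then a backward byte copy. */
    void *memmove_erms_sketch(void *dst, const void *src, size_t n)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (n == 0 || d == s)             /* test %rdx,%rdx / je 2f */
            return dst;
        if (d < s || d >= s + n) {        /* jb 1f / cmp %rdx,%rdi; jb ... */
            while (n--)                   /* 1: rep movsb */
                *d++ = *s++;
        } else {                          /* .Lmovsb_backward */
            d += n;                       /* leaq -1(%rdi,%rcx), %rdi */
            s += n;                       /* leaq -1(%rsi,%rcx), %rsi */
            while (n--)                   /* std; rep movsb; cld */
                *--d = *--s;
        }
        return dst;
    }
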
impls/memmove-evex-unaligned-erms.s: 11 additions & 9 deletions

@@ -45,7 +45,7 @@ __memmove_evex_unaligned_erms:
vmovdqu64 -32(%rsi, %rdx), %ymm17
vmovdqu64 %ymm16, (%rdi)
vmovdqu64 %ymm17, -32(%rdi, %rdx)
- .Lreturn:
+ .Lreturn_vzeroupper:

; ret
.p2align 4,, 8
@@ -292,7 +292,8 @@ __memmove_evex_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

- testl $(1 << 0), __x86_string_control(%rip)
+ testb $(1 << 0), __x86_string_control(%rip)
+
jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
@@ -322,9 +323,11 @@ __memmove_evex_unaligned_erms:
.p2align 4,, 10

.Llarge_memcpy_2x_check:
- cmp __x86_rep_movsb_threshold(%rip), %rdx
- jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
+ mov __x86_shared_non_temporal_threshold(%rip), %r11
+ cmp %r11, %rdx
+ jb .Lmore_8x_vec_check
+
negq %rcx
cmpq %rcx, %rdx
@@ -348,17 +351,17 @@ __memmove_evex_unaligned_erms:
addq %r8, %rdx

notl %ecx
+ movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

- movq %rdx, %r10
- shrq $4, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ shlq $4, %r11
+ cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

- shrq $((12 + 1) - 4), %r10
+ shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -421,7 +424,6 @@ __memmove_evex_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
- movq %rdx, %r10

andl $(4096 * 4 - 1), %edx
