-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
362 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,361 @@ | ||
/*
 * memcpy for 32-bit ARM, ARM state, GNU as unified syntax.
 * This file must be run through the C preprocessor (register aliases below).
 *
 * Register roles:
 *   DST     - running destination pointer (first argument, r0)
 *   SRC     - running source pointer (second argument, r1)
 *   LEN     - bytes left to copy; inside the block loops it is kept
 *             pre-decremented by one chunk so that a single
 *             "subs / bhs" both counts and terminates the loop
 *   DST_RET - original dst, moved back to r0 on return
 */
#define DST r0
#define SRC r1
#define LEN r8
#define DST_RET r12

.arm
.syntax unified

.text
.align 4

/*
 * void *memcpy(void *dst, const void *src, size_t l)
 *
 * In:      r0 = dst, r1 = src, r2 = l
 * Out:     r0 = dst
 * Scratch: r2, r3, r12 and the condition flags are overwritten.
 * Saved:   r8 is saved on entry and restored on every return path (an 8-byte
 *          stack slot is used for a 4-byte register); r4, r5 are saved around
 *          the aligned 8-byte tail loop; r4-r7, r10, r11 are saved around the
 *          >= 512 byte loop.
 *
 * Strategy:
 *   l < 64                         -> word loop + byte loop
 *   src/dst differ modulo 8        -> align dst to 64, then 64-byte chunks
 *                                     built from word loads + strd
 *   src/dst equal modulo 8         -> align both to 8, then
 *        64 <= remaining < 512     -> 64-byte ldrd/strd chunks
 *        remaining >= 512          -> align src to 64, 128-byte ldrd/strd
 *                                     chunks with prefetch
 *
 * NOTE(review): the "unaligned" paths issue ldr/str on addresses that are not
 * word aligned; this assumes the platform permits unaligned word accesses -
 * confirm for the target configuration.
 */
.globl memcpy
.type memcpy, %function
memcpy:
	str r8, [sp, #-8]!
	mov LEN, r2
	cmp LEN, #64
	mov DST_RET, DST /* preserve return value (mov leaves the flags intact) */

	bhs .LblkCopy

	/* fewer than 64 bytes - always copy as if the block was unaligned */

.Ltail63Unaligned:
	/* copy 0-63 bytes, no alignment assumed */

	/* r3 = number of whole words */
	movs r3, LEN, lsr #2
	beq .Ltail63Un0

.Ltail63Un4:
	ldr r2, [SRC], #4
	str r2, [DST], #4
	subs r3, r3, #1
	bne .Ltail63Un4

.Ltail63Un0:
	/* LEN = LEN % 4 */
	ands LEN, LEN, #3
	beq .Ltail63UnRet

	/* 1 <= LEN <= 3; ldrb/strb do not touch the flags set by subs */
1:
	subs LEN, LEN, #1
	ldrb r2, [SRC], #1
	strb r2, [DST], #1
	bne 1b

.Ltail63UnRet:
	ldr r8, [sp], #8
	mov r0, DST_RET
	bx lr


.LblkCopyUnaligned64:
	/* src/dst not mutually aligned, LEN >= 64 */

	/* r2 = dst rounded up to a 64-byte boundary */
	add r2, DST, #63
	bic r2, r2, #63

	/* r2 = distance to that boundary (0-63) */
	subs r2, r2, DST
	beq .LblkUnalignedBias /* dst already aligned, LEN is still >= 64 */

	sub LEN, LEN, r2

1:
	ldrb r3, [SRC], #1
	strb r3, [DST], #1
	subs r2, r2, #1
	bne 1b

	cmp LEN, #64
	blo .Ltail63Unaligned

.LblkUnalignedBias:
	/*
	 * Pre-decrement LEN by one chunk. Every entry into the loop below
	 * must pass through here, otherwise the loop runs one chunk too many.
	 */
	sub LEN, LEN, #64

.p2align 5
.LblkCopyUnaligned:
	/* copy one 64-byte chunk: dst is 8-byte aligned, src may not be */
.rept 8
	ldr r2, [SRC], #4
	ldr r3, [SRC], #4
	strd r2, r3, [DST], #8
.endr

	subs LEN, LEN, #64
	bhs .LblkCopyUnaligned

	/* LEN is now in [-64, -1]; the low 6 bits are the bytes still to copy */
	and LEN, LEN, #63
	b .Ltail63Unaligned

.LblkCopy:
	/* LEN >= 64 */

	/* check whether src and dst share the same offset within 8 bytes */
	and r3, SRC, #7
	and r2, DST, #7
	cmp r3, r2
	bne .LblkCopyUnaligned64

	/* src/dst mutually 8-byte aligned */

	/* r3 = offset within the current word; 0 means already word aligned */
	ands r3, r3, #3
	beq .LblkAligned4

	/* 1-3 leading bytes bring both pointers to a word boundary */
	rsb r3, r3, #4
	sub LEN, LEN, r3

3:
	ldrb r2, [SRC], #1
	strb r2, [DST], #1
	tst SRC, #3
	bne 3b

.LblkAligned4:
	/* word aligned; one more word if not yet 8-byte aligned */
	ands r3, SRC, #7
	ldrne r2, [SRC], #4
	strne r2, [DST], #4
	subne LEN, LEN, #4

	/* src/dst aligned to 8 bytes */
	cmp LEN, #64
	blo .Ltail63Aligned

	cmp LEN, #512
	bhs .LblkCopy512Aligned

	pld [SRC]

	/*
	 * Bias both pointers by -8 so the chunk loop can use offsets 8..64
	 * and fold the pointer update into the last ldrd/strd (writeback).
	 */
	sub SRC, SRC, #8
	sub DST, DST, #8

.Ltail127Aligned:
	/* also entered after the 128-byte loop with 64 <= LEN <= 127 */
	sub LEN, LEN, #64

.p2align 5
.LblkCopy64:
	/* copy one 64-byte chunk; SRC/DST are biased by -8 */
	pld [SRC, #40]

	ldrd r2, r3, [SRC, #8]
	strd r2, r3, [DST, #8]
	ldrd r2, r3, [SRC, #16]
	strd r2, r3, [DST, #16]
	ldrd r2, r3, [SRC, #24]
	strd r2, r3, [DST, #24]
	ldrd r2, r3, [SRC, #32]
	strd r2, r3, [DST, #32]

	pld [SRC, #72]

	ldrd r2, r3, [SRC, #40]
	strd r2, r3, [DST, #40]
	ldrd r2, r3, [SRC, #48]
	strd r2, r3, [DST, #48]
	ldrd r2, r3, [SRC, #56]
	strd r2, r3, [DST, #56]
	ldrd r2, r3, [SRC, #64]!
	strd r2, r3, [DST, #64]!

	subs LEN, LEN, #64
	bhs .LblkCopy64

	/* LEN is now in [-64, -1]; the low 6 bits are the bytes still to copy */
	and LEN, LEN, #63

	/* undo the -8 bias */
	add SRC, SRC, #8
	add DST, DST, #8

.Ltail63Aligned:
	/* copy the tail, 0-63 bytes; src/dst are 8-byte aligned */

	/* r3 = number of 8-byte units; "and" below keeps the flags from movs */
	movs r3, LEN, lsr #3
	/* LEN = LEN % 8 */
	and LEN, LEN, #7
	beq .Ltail63Al0

	/* r3 is the loop counter, so a second register pair is needed */
	push {r4, r5}

.Ltail63Al8:
	ldrd r4, r5, [SRC], #8
	strd r4, r5, [DST], #8
	subs r3, r3, #1
	bne .Ltail63Al8

	pop {r4, r5}

.Ltail63Al0:
	/* all 8-byte units copied, 0-7 bytes left */
	cmp LEN, #0
	beq .Ltail63AlRet

	/* 1 <= LEN <= 7: copy 4, 2 and 1 bytes according to the bits of LEN */
	ands r3, LEN, #4
	ldrne r2, [SRC], #4
	strne r2, [DST], #4

	ands r3, LEN, #2
	ldrhne r2, [SRC], #2
	strhne r2, [DST], #2

	ands r3, LEN, #1
	ldrbne r2, [SRC], #1
	strbne r2, [DST], #1

.Ltail63AlRet:
	ldr r8, [sp], #8
	mov r0, DST_RET
	bx lr


.LblkCopy512Aligned:
	/* LEN >= 512, src/dst are 8-byte aligned */
	stmfd sp!, {r4-r7, r10, r11}

	/* r11 = offset of src within its 64-byte line (multiple of 8) */
	ands r11, SRC, #63

	/* -8 pointer bias and one-chunk LEN pre-decrement; flags are kept */
	sub SRC, SRC, #8
	sub DST, DST, #8
	sub LEN, LEN, #128

	pld [SRC, #8]

	beq 4f

	/* r11 = bytes (8-56) needed to reach the next 64-byte boundary of src */
	rsb r11, r11, #64

3:
	ldrd r2, r3, [SRC, #8]!
	strd r2, r3, [DST, #8]!
	subs r11, r11, #8
	sub LEN, LEN, #8 /* no "s": bne below tests r11 */
	bne 3b

	pld [SRC, #8]

.p2align 5
4:
	/*
	 * Copy one 128-byte chunk. SRC + 8 is 64-byte aligned here; DST + 8 is
	 * only known to be 8-byte aligned.
	 */
	pld [SRC, #40]

	ldrd r2, r3, [SRC, #8]
	strd r2, r3, [DST, #8]
	ldrd r4, r5, [SRC, #16]
	strd r4, r5, [DST, #16]
	ldrd r6, r7, [SRC, #24]
	strd r6, r7, [DST, #24]
	ldrd r10, r11, [SRC, #32]
	strd r10, r11, [DST, #32]

	pld [SRC, #72]

	ldrd r2, r3, [SRC, #40]
	strd r2, r3, [DST, #40]
	ldrd r4, r5, [SRC, #48]
	strd r4, r5, [DST, #48]
	ldrd r6, r7, [SRC, #56]
	strd r6, r7, [DST, #56]
	ldrd r10, r11, [SRC, #64]
	strd r10, r11, [DST, #64]

	pld [SRC, #104]

	ldrd r2, r3, [SRC, #72]
	strd r2, r3, [DST, #72]
	ldrd r4, r5, [SRC, #80]
	strd r4, r5, [DST, #80]
	ldrd r6, r7, [SRC, #88]
	strd r6, r7, [DST, #88]
	ldrd r10, r11, [SRC, #96]
	strd r10, r11, [DST, #96]

	pld [SRC, #136]

	ldrd r2, r3, [SRC, #104]
	strd r2, r3, [DST, #104]
	ldrd r4, r5, [SRC, #112]
	strd r4, r5, [DST, #112]
	ldrd r6, r7, [SRC, #120]
	strd r6, r7, [DST, #120]
	ldrd r10, r11, [SRC, #128]!
	strd r10, r11, [DST, #128]!

	subs LEN, LEN, #128
	bhs 4b

	ldmfd sp!, {r4-r7, r10, r11}

	/* LEN is now in [-128, -1]; the low 7 bits are the bytes still to copy */
	and LEN, LEN, #127

	/* 64-127 bytes left: one more 64-byte chunk (pointers stay biased) */
	cmp LEN, #63
	bhi .Ltail127Aligned

	/* undo the -8 bias */
	add SRC, SRC, #8
	add DST, DST, #8
	b .Ltail63Aligned

.size memcpy, .-memcpy
.ltorg
Oops, something went wrong.