memcpy thumb
lukileczo committed Mar 4, 2024
1 parent 8db0322 commit 83b4346
Showing 1 changed file with 46 additions and 157 deletions.
203 changes: 46 additions & 157 deletions arch/armv7a/memcpy.S
@@ -15,10 +15,9 @@
 
 #define DST r0
 #define SRC r1
-#define LEN r8
-#define DST_RET r12
+#define LEN r12
 
-.arm
+.thumb
 .syntax unified
 
 .text
@@ -30,25 +29,22 @@
 * Implementation is divided into the following cases:
 * 1. len < 64 bytes - copying is done without alignment checks,
 * using possibly unaligned `ldr`, `str` instructions.
-* 2. src/dst mutually aligned to 8 bytes and 64 <= len <= 512 bytes:
+* 2. src/dst mutually aligned to 8 bytes and len >= 64 bytes:
 * unrolled `ldrd`, `strd` instructions are used to copy 64 bytes at a time.
-* 3. src/dst mutually aligned to 8 bytes and len > 512 bytes:
-* unrolled `ldrd`, `strd` instructions are used to copy 128 bytes at a time.
-* 4. src/dst not mutually aligned, len >= 64 bytes:
+* 3. src/dst not mutually aligned, len >= 64 bytes:
 * dst is brought to 64-byte alignment and then 64-byte chunks are copied
 * using 2 unaligned `ldr` and aligned `strd` instructions.
 *
-* Long loops are aligned to icache line size (32 bytes) and before long copies
-* src and dst are brought to dcache line alignment (64 bytes) if possible.
+* Long loops are aligned to 64 bytes.
 */
 
+.thumb_func
 .globl memcpy
 .type memcpy, %function
 memcpy:
-str r8, [sp, #-8]!
-mov r8, r2
+str DST, [sp, #-8]!
+mov LEN, r2
 cmp LEN, #64
-mov DST_RET, DST /* preserve return value */
 
 bhs .LblkCopy
 
@@ -80,8 +76,7 @@ memcpy:
 bne 1b
 
 .Lreturn:
-ldr r8, [sp], #8
-mov r0, DST_RET
+ldr r0, [sp], #8
 bx lr
 
 
@@ -94,49 +89,41 @@
 
 /* r2 = distance to 64-byte alignment */
 subs r2, DST
-beq 6f /* dst already aligned */
-
-str r4, [sp, #-8]!
+beq 4f /* dst already aligned */
 
 sub LEN, r2
 
+/* the same logic as in .Ltail63Unaligned */
+tst DST, #1
+itt ne
+ldrbne r3, [SRC], #1
+strbne r3, [DST], #1
-
-/* r3 = LEN / 4 */
-movs r3, r2, lsr #2
+tst DST, #2
+itt ne
+ldrhne r3, [SRC], #2
+strhne r3, [DST], #2
 
+tst DST, #(7 << 3)
 beq 3f
 
 2:
-/* align dst to 4 */
-ldr r4, [SRC], #4
-str r4, [DST], #4
-subs r3, #1
+/* align dst to 64 */
+ldr r3, [SRC], #4
+str r3, [DST], #4
+tst DST, #(7 << 3)
 bne 2b
 
 3:
-/* r2 = r2 % 4 */
-ands r2, #3
-beq 5f
-
-4:
-ldrb r3, [SRC], #1
-strb r3, [DST], #1
-subs r2, #1
-bne 4b
-
-5:
-ldr r4, [sp], #8
-
 cmp LEN, #64
 blo .Ltail63Unaligned
 
-6:
+4:
 sub LEN, #64
 
 sub DST, #8
 sub SRC, #8
 
-.p2align 5
+.p2align 6
 .LblkCopyUnaligned:
 /* copy block of 64 bytes */
 ldr r2, [SRC, #8]
@@ -201,40 +188,34 @@ memcpy:
 rsb r3, #4
 sub LEN, r3
 
-7:
+5:
 ldrb r2, [SRC], #1
 strb r2, [DST], #1
 tst SRC, #3
-bne 7b
+bne 5b
 
 .LblkAligned4:
-ands r3, SRC, #7
-
+tst SRC, #4
 /* leading misalignment aligned to 4 bytes */
+ittt ne
 ldrne r2, [SRC], #4
 strne r2, [DST], #4
-subne LEN, LEN, #4
+subne LEN, #4
 
 /* src/dst aligned to 8 bytes */
 cmp LEN, #64
 blo .Ltail63Aligned
 
-/* check if copy bigger than 512 bytes */
-cmp LEN, #512
-bhs .LblkCopy512Aligned
-
 pld [SRC]
 
 sub SRC, #8
 sub DST, #8
 
-.Ltail127Aligned:
-/* we may land here after long copy (64 <= LEN <= 127) */
 sub LEN, #64
 
-.p2align 5
+.p2align 6
 .LblkCopy64:
-/* copy 64-512 bytes in 64-byte chunks */
+/* copy block of 64 bytes */
 pld [SRC, #40]
 
 ldrd r2, r3, [SRC, #8]
@@ -270,131 +251,39 @@ memcpy:
 * src/dst are 8-byte aligned
 */
 
-/* r3 = LEN / 8 */
-movs r3, LEN, lsr #3
-/* LEN = LEN % 8 */
-and LEN, #7
+tst LEN, #(7 << 3)
 beq .Ltail63Al0
 
-push {r4, r5}
+sub LEN, #8
 
 .Ltail63Al8:
-ldrd r4, r5, [SRC], #8
-strd r4, r5, [DST], #8
-subs r3, #1
-bne .Ltail63Al8
-
-pop {r4, r5}
+subs LEN, #8
+ldrd r2, r3, [SRC], #8
+strd r2, r3, [DST], #8
+bhs .Ltail63Al8
 
 .Ltail63Al0:
 /* copied all 8 byte aligned memory, now copy what's left */
-cmp LEN, #0
+tst LEN, #7
 beq .Lreturn
 
 /* 1 <= LEN <= 7 */
-ands r3, LEN, #4
+tst LEN, #4
+itt ne
 ldrne r2, [SRC], #4
 strne r2, [DST], #4
 
-ands r3, LEN, #2
+tst LEN, #2
+itt ne
 ldrhne r2, [SRC], #2
 strhne r2, [DST], #2
 
-ands r3, LEN, #1
+tst LEN, #1
+itt ne
 ldrbne r2, [SRC], #1
 strbne r2, [DST], #1
 
 b .Lreturn
 
-
-.LblkCopy512Aligned:
-/* Copy more than 512 bytes, src/dst are 8-byte aligned */
-stmfd sp!, {r4-r7, r10, r11}
-
-/* Align ld/st to 64 bytes */
-tst SRC, #63
-
-pld [SRC]
-
-sub SRC, #8
-sub DST, #8
-
-sub LEN, LEN, #128
-
-beq 9f
-
-/* copy leading misalignment */
-8:
-ldrd r2, r3, [SRC, #8]!
-strd r2, r3, [DST, #8]!
-sub LEN, #8
-tst SRC, #63
-bne 8b
-
-pld [SRC, #8]
-
-.p2align 5
-9:
-/* ld/st are cache line aligned */
-pld [SRC, #40]
-
-ldrd r2, r3, [SRC, #8]
-strd r2, r3, [DST, #8]
-ldrd r4, r5, [SRC, #16]
-strd r4, r5, [DST, #16]
-ldrd r6, r7, [SRC, #24]
-strd r6, r7, [DST, #24]
-ldrd r10, r11, [SRC, #32]
-strd r10, r11, [DST, #32]
-
-pld [SRC, #72]
-
-ldrd r2, r3, [SRC, #40]
-strd r2, r3, [DST, #40]
-ldrd r4, r5, [SRC, #48]
-strd r4, r5, [DST, #48]
-ldrd r6, r7, [SRC, #56]
-strd r6, r7, [DST, #56]
-ldrd r10, r11, [SRC, #64]
-strd r10, r11, [DST, #64]
-
-pld [SRC, #108]
-
-ldrd r2, r3, [SRC, #72]
-strd r2, r3, [DST, #72]
-ldrd r4, r5, [SRC, #80]
-strd r4, r5, [DST, #80]
-ldrd r6, r7, [SRC, #88]
-strd r6, r7, [DST, #88]
-ldrd r10, r11, [SRC, #96]
-strd r10, r11, [DST, #96]
-
-pld [SRC, #136]
-
-ldrd r2, r3, [SRC, #104]
-strd r2, r3, [DST, #104]
-ldrd r4, r5, [SRC, #112]
-strd r4, r5, [DST, #112]
-ldrd r6, r7, [SRC, #120]
-strd r6, r7, [DST, #120]
-ldrd r10, r11, [SRC, #128]!
-strd r10, r11, [DST, #128]!
-
-subs LEN, LEN, #128
-bhs 9b
-
-ldmfd sp!, {r4-r7, r10, r11}
-
-and LEN, #127 /* make LEN positive again */
-beq .Lreturn /* LEN = 0 */
-
-cmp LEN, #63
-pld [SRC, #8]
-bhi .Ltail127Aligned
-
-add SRC, #8
-add DST, #8
-b .Ltail63Aligned
-
 .size memcpy, .-memcpy
+.ltorg
-
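Reader's note (not part of the commit): below is a rough C sketch of the three cases described in the new header comment, for anyone who wants the copy strategy without reading the assembly. The function name memcpy_sketch, the copy_bytes helper, and the byte-at-a-time chunk copies are illustrative stand-ins only; the real routine uses ldr/str, ldrd/strd, pld and IT blocks exactly as shown in the diff above.

/* Illustrative sketch of the strategy in the new memcpy.S, not the real code. */
#include <stddef.h>
#include <stdint.h>

/* Stand-in for the assembly's load/store sequences: plain byte copy. */
static void copy_bytes(unsigned char **d, const unsigned char **s, size_t n)
{
	while (n-- > 0) {
		*(*d)++ = *(*s)++;
	}
}

void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Case 1: len < 64 -- copy directly, no alignment checks
	 * (the assembly uses possibly unaligned ldr/str here). */
	if (len < 64) {
		copy_bytes(&d, &s, len);
		return dst;
	}

	if ((((uintptr_t)d ^ (uintptr_t)s) & 7) == 0) {
		/* Case 2: src/dst mutually 8-byte aligned -- align to 8,
		 * then the main loop moves 64 bytes per iteration
		 * (unrolled ldrd/strd pairs in the assembly). */
		size_t head = (8 - ((uintptr_t)d & 7)) & 7;
		copy_bytes(&d, &s, head);
		len -= head;
	}
	else {
		/* Case 3: not mutually aligned -- bring dst to 64-byte
		 * alignment, then copy 64-byte chunks (unaligned loads,
		 * aligned stores in the assembly). */
		size_t head = (64 - ((uintptr_t)d & 63)) & 63;
		copy_bytes(&d, &s, head);
		len -= head;
	}

	/* Main loop: 64 bytes per iteration while at least 64 bytes remain. */
	while (len >= 64) {
		copy_bytes(&d, &s, 64);
		len -= 64;
	}

	/* Tail: remaining 0..63 bytes. */
	copy_bytes(&d, &s, len);
	return dst;
}

The mutual-alignment test ((d ^ s) & 7) reflects the point behind case 2: when src and dst share the same offset modulo 8, aligning one aligns the other, which is what makes the ldrd/strd main loop possible.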