memcpy thumb
lukileczo committed Mar 4, 2024
1 parent 8db0322 commit 83b4346
Showing 1 changed file with 46 additions and 157 deletions.
203 changes: 46 additions & 157 deletions arch/armv7a/memcpy.S
@@ -15,10 +15,9 @@
 
 #define DST r0
 #define SRC r1
-#define LEN r8
-#define DST_RET r12
+#define LEN r12
 
-.arm
+.thumb
 .syntax unified
 
 .text
@@ -30,25 +29,22 @@
 * Implementation is divided into the following cases:
 * 1. len < 64 bytes - copying is done without alignment checks,
 * using possibly unaligned `ldr`, `str` instructions.
-* 2. src/dst mutually aligned to 8 bytes and 64 <= len <= 512 bytes:
+* 2. src/dst mutually aligned to 8 bytes and len >= 64 bytes:
 * unrolled `ldrd`, `strd` instructions are used to copy 64 bytes at a time.
-* 3. src/dst mutually aligned to 8 bytes and len > 512 bytes:
-* unrolled `ldrd`, `strd` instructions are used to copy 128 bytes at a time.
-* 4. src/dst not mutually aligned, len >= 64 bytes:
+* 3. src/dst not mutually aligned, len >= 64 bytes:
 * dst is brought to 64-byte alignment and then 64-byte chunks are copied
 * using 2 unaligned `ldr` and aligned `strd` instructions.
 *
-* Long loops are aligned to icache line size (32 bytes) and before long copies
-* src and dst are brought to dcache line alignment (64 bytes) if possible.
+* Long loops are aligned to 64 bytes.
 */
 
+.thumb_func
 .globl memcpy
 .type memcpy, %function
 memcpy:
-str r8, [sp, #-8]!
-mov r8, r2
+str DST, [sp, #-8]!
+mov LEN, r2
 cmp LEN, #64
-mov DST_RET, DST /* preserve return value */
 
 bhs .LblkCopy
 
@@ -80,8 +76,7 @@ memcpy:
 bne 1b
 
 .Lreturn:
-ldr r8, [sp], #8
-mov r0, DST_RET
+ldr r0, [sp], #8
 bx lr
 
 
@@ -94,49 +89,41 @@
 
 /* r2 = distance to 64-byte alignment */
 subs r2, DST
-beq 6f /* dst already aligned */
-
-str r4, [sp, #-8]!
+beq 4f /* dst already aligned */
 
 sub LEN, r2
 
+/* the same logic as in .Ltail63Unaligned */
+tst DST, #1
+itt ne
+ldrbne r3, [SRC], #1
+strbne r3, [DST], #1
-
-/* r3 = LEN / 4 */
-movs r3, r2, lsr #2
+tst DST, #2
+itt ne
+ldrhne r3, [SRC], #2
+strhne r3, [DST], #2
 
+tst DST, #(7 << 3)
 beq 3f
 
 2:
-/* align dst to 4 */
-ldr r4, [SRC], #4
-str r4, [DST], #4
-subs r3, #1
+/* align dst to 64 */
+ldr r3, [SRC], #4
+str r3, [DST], #4
+tst DST, #(7 << 3)
 bne 2b
 
 3:
-/* r2 = r2 % 4 */
-ands r2, #3
-beq 5f
-
-4:
-ldrb r3, [SRC], #1
-strb r3, [DST], #1
-subs r2, #1
-bne 4b
-
-5:
-ldr r4, [sp], #8
-
 cmp LEN, #64
 blo .Ltail63Unaligned
 
-6:
+4:
 sub LEN, #64
 
 sub DST, #8
 sub SRC, #8
 
-.p2align 5
+.p2align 6
 .LblkCopyUnaligned:
 /* copy block of 64 bytes */
 ldr r2, [SRC, #8]
@@ -201,40 +188,34 @@ memcpy:
 rsb r3, #4
 sub LEN, r3
 
-7:
+5:
 ldrb r2, [SRC], #1
 strb r2, [DST], #1
 tst SRC, #3
-bne 7b
+bne 5b
 
 .LblkAligned4:
-ands r3, SRC, #7
-
+tst SRC, #4
 /* leading misalignment aligned to 4 bytes */
+ittt ne
 ldrne r2, [SRC], #4
 strne r2, [DST], #4
-subne LEN, LEN, #4
+subne LEN, #4
 
 /* src/dst aligned to 8 bytes */
 cmp LEN, #64
 blo .Ltail63Aligned
 
-/* check if copy bigger than 512 bytes */
-cmp LEN, #512
-bhs .LblkCopy512Aligned
-
 pld [SRC]
 
 sub SRC, #8
 sub DST, #8
 
-.Ltail127Aligned:
-/* we may land here after long copy (64 <= LEN <= 127) */
 sub LEN, #64
 
-.p2align 5
+.p2align 6
 .LblkCopy64:
-/* copy 64-512 bytes in 64-byte chunks */
+/* copy block of 64 bytes */
 pld [SRC, #40]
 
 ldrd r2, r3, [SRC, #8]
@@ -270,131 +251,39 @@ memcpy:
 * src/dst are 8-byte aligned
 */
 
-/* r3 = LEN / 8 */
-movs r3, LEN, lsr #3
-/* LEN = LEN % 8 */
-and LEN, #7
+tst LEN, #(7 << 3)
 beq .Ltail63Al0
 
-push {r4, r5}
+sub LEN, #8
 
 .Ltail63Al8:
-ldrd r4, r5, [SRC], #8
-strd r4, r5, [DST], #8
-subs r3, #1
-bne .Ltail63Al8
-
-pop {r4, r5}
+subs LEN, #8
+ldrd r2, r3, [SRC], #8
+strd r2, r3, [DST], #8
+bhs .Ltail63Al8
 
 .Ltail63Al0:
 /* copied all 8 byte aligned memory, now copy what's left */
-cmp LEN, #0
+tst LEN, #7
 beq .Lreturn
 
 /* 1 <= LEN <= 7 */
-ands r3, LEN, #4
+tst LEN, #4
+itt ne
 ldrne r2, [SRC], #4
 strne r2, [DST], #4
 
-ands r3, LEN, #2
+tst LEN, #2
+itt ne
 ldrhne r2, [SRC], #2
 strhne r2, [DST], #2
 
-ands r3, LEN, #1
+tst LEN, #1
+itt ne
 ldrbne r2, [SRC], #1
 strbne r2, [DST], #1
 
 b .Lreturn
 
-
-.LblkCopy512Aligned:
-/* Copy more than 512 bytes, src/dst are 8-byte aligned */
-stmfd sp!, {r4-r7, r10, r11}
-
-/* Align ld/st to 64 bytes */
-tst SRC, #63
-
-pld [SRC]
-
-sub SRC, #8
-sub DST, #8
-
-sub LEN, LEN, #128
-
-beq 9f
-
-/* copy leading misalignment */
-8:
-ldrd r2, r3, [SRC, #8]!
-strd r2, r3, [DST, #8]!
-sub LEN, #8
-tst SRC, #63
-bne 8b
-
-pld [SRC, #8]
-
-.p2align 5
-9:
-/* ld/st are cache line aligned */
-pld [SRC, #40]
-
-ldrd r2, r3, [SRC, #8]
-strd r2, r3, [DST, #8]
-ldrd r4, r5, [SRC, #16]
-strd r4, r5, [DST, #16]
-ldrd r6, r7, [SRC, #24]
-strd r6, r7, [DST, #24]
-ldrd r10, r11, [SRC, #32]
-strd r10, r11, [DST, #32]
-
-pld [SRC, #72]
-
-ldrd r2, r3, [SRC, #40]
-strd r2, r3, [DST, #40]
-ldrd r4, r5, [SRC, #48]
-strd r4, r5, [DST, #48]
-ldrd r6, r7, [SRC, #56]
-strd r6, r7, [DST, #56]
-ldrd r10, r11, [SRC, #64]
-strd r10, r11, [DST, #64]
-
-pld [SRC, #108]
-
-ldrd r2, r3, [SRC, #72]
-strd r2, r3, [DST, #72]
-ldrd r4, r5, [SRC, #80]
-strd r4, r5, [DST, #80]
-ldrd r6, r7, [SRC, #88]
-strd r6, r7, [DST, #88]
-ldrd r10, r11, [SRC, #96]
-strd r10, r11, [DST, #96]
-
-pld [SRC, #136]
-
-ldrd r2, r3, [SRC, #104]
-strd r2, r3, [DST, #104]
-ldrd r4, r5, [SRC, #112]
-strd r4, r5, [DST, #112]
-ldrd r6, r7, [SRC, #120]
-strd r6, r7, [DST, #120]
-ldrd r10, r11, [SRC, #128]!
-strd r10, r11, [DST, #128]!
-
-subs LEN, LEN, #128
-bhs 9b
-
-ldmfd sp!, {r4-r7, r10, r11}
-
-and LEN, #127 /* make LEN positive again */
-beq .Lreturn /* LEN = 0 */
-
-cmp LEN, #63
-pld [SRC, #8]
-bhi .Ltail127Aligned
-
-add SRC, #8
-add DST, #8
-b .Ltail63Aligned
-
 .size memcpy, .-memcpy
+.ltorg
-
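Reader's note (not part of the commit): below is a rough C sketch of the three cases described in the new header comment, for anyone who wants the copy strategy without reading the assembly. The function name memcpy_sketch, the copy_bytes helper, and the byte-at-a-time chunk copies are illustrative stand-ins only; the real routine uses ldr/str, ldrd/strd, pld and IT blocks exactly as shown in the diff above.

/* Illustrative sketch of the strategy in the new memcpy.S, not the real code. */
#include <stddef.h>
#include <stdint.h>

/* Stand-in for the assembly's load/store sequences: plain byte copy. */
static void copy_bytes(unsigned char **d, const unsigned char **s, size_t n)
{
	while (n-- > 0) {
		*(*d)++ = *(*s)++;
	}
}

void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Case 1: len < 64 -- copy directly, no alignment checks
	 * (the assembly uses possibly unaligned ldr/str here). */
	if (len < 64) {
		copy_bytes(&d, &s, len);
		return dst;
	}

	if ((((uintptr_t)d ^ (uintptr_t)s) & 7) == 0) {
		/* Case 2: src/dst mutually 8-byte aligned -- align to 8,
		 * then the main loop moves 64 bytes per iteration
		 * (unrolled ldrd/strd pairs in the assembly). */
		size_t head = (8 - ((uintptr_t)d & 7)) & 7;
		copy_bytes(&d, &s, head);
		len -= head;
	}
	else {
		/* Case 3: not mutually aligned -- bring dst to 64-byte
		 * alignment, then copy 64-byte chunks (unaligned loads,
		 * aligned stores in the assembly). */
		size_t head = (64 - ((uintptr_t)d & 63)) & 63;
		copy_bytes(&d, &s, head);
		len -= head;
	}

	/* Main loop: 64 bytes per iteration while at least 64 bytes remain. */
	while (len >= 64) {
		copy_bytes(&d, &s, 64);
		len -= 64;
	}

	/* Tail: remaining 0..63 bytes. */
	copy_bytes(&d, &s, len);
	return dst;
}

The mutual-alignment test ((d ^ s) & 7) reflects the point behind case 2: when src and dst share the same offset modulo 8, aligning one aligns the other, which is what makes the ldrd/strd main loop possible.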