memset: use neon-less impl, reformat other asm

This commit is contained in:
Michael Scire 2020-03-11 08:45:00 -07:00
parent e42d3a3abf
commit 6c52cc3e26
4 changed files with 401 additions and 358 deletions

View file

@ -13,18 +13,7 @@
#include "asmdefs.h" #include "asmdefs.h"
#define SAVE_FPU_REGISTERS #define DC_ZVA_THRESHOLD 512
#define SKIP_ZVA_CHECK
#ifdef SAVE_FPU_REGISTERS
#define SAVE_Q0 str q0, [sp, #-16]!
#define RESTORE_Q0 ldr q0, [sp], #16
#else
#define SAVE_Q0
#define RESTORE_Q0
#endif
#define RETURN RESTORE_Q0; ret; .p2align 4
#define dstin x0 #define dstin x0
#define val x1 #define val x1
@ -35,93 +24,147 @@
#define zva_val x5 #define zva_val x5
ENTRY (memset) ENTRY (memset)
SAVE_Q0
dup v0.16B, valw bfi valw, valw, 8, 8
bfi valw, valw, 16, 16
bfi val, val, 32, 32
add dstend, dstin, count add dstend, dstin, count
cmp count, 96 cmp count, 96
b.hi L(set_long) b.hi L(set_long)
cmp count, 16 cmp count, 16
b.hs L(set_medium) b.hs L(set_medium)
mov val, v0.D[0]
/* Set 0..15 bytes. */ /* Set 0..15 bytes. */
tbz count, 3, 1f tbz count, 3, 1f
str val, [dstin] str val, [dstin]
str val, [dstend, -8] str val, [dstend, -8]
RETURN ret
1: tbz count, 2, 2f 1: tbz count, 2, 2f
str valw, [dstin] str valw, [dstin]
str valw, [dstend, -4] str valw, [dstend, -4]
RETURN ret
2: cbz count, 3f 2: cbz count, 3f
strb valw, [dstin] strb valw, [dstin]
tbz count, 1, 3f tbz count, 1, 3f
strh valw, [dstend, -2] strh valw, [dstend, -2]
3: RETURN 3: ret
/* Set 17..96 bytes. */ /* Set 16..96 bytes. */
.p2align 4
L(set_medium): L(set_medium):
str q0, [dstin] stp val, val, [dstin]
tbnz count, 6, L(set96) tbnz count, 6, L(set96)
str q0, [dstend, -16] stp val, val, [dstend, -16]
tbz count, 5, 1f tbz count, 5, 1f
str q0, [dstin, 16] stp val, val, [dstin, 16]
str q0, [dstend, -32] stp val, val, [dstend, -32]
1: RETURN 1: ret
.p2align 4 .p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and /* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */ 32 bytes from the end. */
L(set96): L(set96):
str q0, [dstin, 16] stp val, val, [dstin, 16]
stp q0, q0, [dstin, 32] stp val, val, [dstin, 32]
stp q0, q0, [dstend, -32] stp val, val, [dstin, 48]
RETURN stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
.p2align 4 .p2align 4
L(set_long): L(set_long):
and valw, valw, 255 stp val, val, [dstin]
bic dst, dstin, 15 bic dst, dstin, 15
str q0, [dstin] #if DC_ZVA_THRESHOLD
cmp count, 160 cmp count, DC_ZVA_THRESHOLD
ccmp valw, 0, 0, hs ccmp val, 0, 0, cs
b.ne L(no_zva) b.eq L(zva_64)
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif #endif
str q0, [dst, 16] /* Small-size or non-zero memset does not use DC ZVA. */
stp q0, q0, [dst, 32] sub count, dstend, dst
bic dst, dst, 63
sub count, dstend, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */
/*
* Adjust count and bias for loop. By subtracting an extra 1 from count,
* it is easy to use the tbz instruction to check whether the loop tail
* count is less than 33 bytes, so as to bypass 2 unnecessary stps.
*/
sub count, count, 64+16+1
#if DC_ZVA_THRESHOLD
/* Align loop on 16-byte boundary, this might be friendly to i-cache. */
nop
#endif
1: stp val, val, [dst, 16]
stp val, val, [dst, 32]
stp val, val, [dst, 48]
stp val, val, [dst, 64]!
subs count, count, 64
b.hs 1b
tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
stp val, val, [dst, 16]
stp val, val, [dst, 32]
1: stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
#if DC_ZVA_THRESHOLD
.p2align 4 .p2align 4
L(zva_loop): L(zva_64):
add dst, dst, 64 stp val, val, [dst, 16]
dc zva, dst stp val, val, [dst, 32]
subs count, count, 64 stp val, val, [dst, 48]
b.hi L(zva_loop) bic dst, dst, 63
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
RETURN
L(no_zva): /*
sub count, dstend, dst /* Count is 16 too large. */ * Previous memory writes might cross cache line boundary, and cause
sub dst, dst, 16 /* Dst is biased by -32. */ * cache line partially dirty. Zeroing this kind of cache line using
sub count, count, 64 + 16 /* Adjust count and bias for loop. */ * DC ZVA will incur extra cost, for it requires loading untouched
L(no_zva_loop): * part of the line from memory before zeroing.
stp q0, q0, [dst, 32] *
stp q0, q0, [dst, 64]! * So, write the first 64 byte aligned block using stp to force
* fully dirty cache line.
*/
stp val, val, [dst, 64]
stp val, val, [dst, 80]
stp val, val, [dst, 96]
stp val, val, [dst, 112]
sub count, dstend, dst
/*
* Adjust count and bias for loop. By subtracting an extra 1 from count,
* it is easy to use the tbz instruction to check whether the loop tail
* count is less than 33 bytes, so as to bypass 2 unnecessary stps.
*/
sub count, count, 128+64+64+1
add dst, dst, 128
nop
/* DC ZVA sets 64 bytes each time. */
1: dc zva, dst
add dst, dst, 64
subs count, count, 64 subs count, count, 64
b.hi L(no_zva_loop) b.hs 1b
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32] /*
RETURN * Write the last 64 byte aligned block using stp to force fully
* dirty cache line.
*/
stp val, val, [dst, 0]
stp val, val, [dst, 16]
stp val, val, [dst, 32]
stp val, val, [dst, 48]
tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
stp val, val, [dst, 64]
stp val, val, [dst, 80]
1: stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
#endif
END (memset) END (memset)