libmesosphere: use ARM-software/optimized-routines for memcpy/memset/memcmp

2025-01-03 11:11:14 +00:00 · 2020-03-11 02:46:07 -07:00 · 2020-03-11 02:46:07 -07:00 · e42d3a3abf
commit e42d3a3abf
parent 884844bc23
5 changed files with 534 additions and 0 deletions
--- a/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h
+++ b/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h
@ -0,0 +1,31 @@
 /*
 * Macros for asm code.
 *
 * Copyright (c) 2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */
 #ifndef _ASMDEFS_H
 #define _ASMDEFS_H

 /* Local label helper: L(foo) pastes to .Lfoo.  */
 #define L(l) .L ## l

 /* Internal helper shared by the entry macros below: export NAME and mark
    it as a function symbol.  Emits no label and no alignment.  */
 #define ASM_FUNC_SYMBOL(name)	\
   .global name;		\
   .type name,%function;

 /* Open a function: export NAME, apply `.align alignment`, place the label
    and start a CFI region.  Must be paired with END(name).  */
 #define ENTRY_ALIGN(name, alignment)	\
   ASM_FUNC_SYMBOL(name)		\
   .align alignment;		\
   name:			\
   .cfi_startproc;

 /* Open a function using the default alignment argument of 6.  */
 #define ENTRY(name)	ENTRY_ALIGN(name, 6)

 /* Define an additional exported function symbol at the current location.
    No alignment and no CFI directives are emitted.  */
 #define ENTRY_ALIAS(name)	\
   ASM_FUNC_SYMBOL(name)		\
   name:

 /* Close a function opened with ENTRY/ENTRY_ALIGN: end the CFI region and
    record the symbol size as the distance from NAME to here.  */
 #define END(name)	\
   .cfi_endproc;		\
   .size name, .-name;

 #endif
--- a/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s
+++ b/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s
@ -0,0 +1,133 @@
 /* memcmp - compare memory
 *
 * Copyright (c) 2013, Arm Limited.
 * SPDX-License-Identifier: MIT
 */
 /* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */
 #include "asmdefs.h"
 /* memcmp (src1, src2, limit): compare LIMIT bytes at SRC1 and SRC2.
    The result is 0 when all bytes are equal, otherwise negative or positive
    according to the first differing byte (compared as unsigned).  The wide
    paths return exactly -1/0/1; the byte loop returns the byte difference.  */
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
 #define result		w0
 /* Internal variables.  dataN/dataNh hold the low/high 8 bytes of a 16-byte
    chunk; dataNw is the 32-bit view of dataN.  tmp2 is not referenced in
    this file.  */
 #define data1		x3
 #define data1w		w3
 #define data1h		x4
 #define data2		x5
 #define data2w		w5
 #define data2h		x6
 #define tmp1		x7
 #define tmp2		x8
 ENTRY (memcmp)
 	/* limit = n - 8; a borrow means fewer than 8 bytes to compare.  */
 	subs	limit, limit, 8
 	b.lo	L(less8)
 	/* Compare the first 8 bytes; both pointers advance by 8.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
 	cmp	data1, data2
 	b.ne	L(return)
 	/* limit = n - 16.  Positive (signed) means more than 16 bytes.  */
 	subs	limit, limit, 8
 	b.gt	L(more16)
 	/* 8..16 bytes: limit is in [-8..0], so these loads fetch the final
 	   8 bytes, possibly overlapping the 8 bytes already compared.  */
 	ldr	data1, [src1, limit]
 	ldr	data2, [src2, limit]
 	b	L(return)
 L(more16):
 	/* Compare bytes 8..15.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
 	cmp	data1, data2
 	bne	L(return)
 	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
 	   strings.  limit = n - 32 from here on.  */
 	subs	limit, limit, 16
 	b.ls	L(last_bytes)
 	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
 	   try to align, so limit it only to strings larger than 128 bytes.  */
 	cmp	limit, 96
 	b.ls	L(loop16)
 	/* Align src1 and adjust src2 with bytes not yet done.  Both pointers
 	   move back by the same amount and limit grows to compensate, so
 	   some already-compared bytes are compared again.  */
 	and	tmp1, src1, 15
 	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
 	/* Loop performing 16 bytes per iteration using aligned src1.
 	   Limit is pre-decremented by 16 and must be larger than zero.
 	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
 L(loop16):
 	ldp	data1, data1h, [src1], 16
 	ldp	data2, data2h, [src2], 16
 	subs	limit, limit, 16
 	/* Chain of conditional compares: the result is EQ only if bytes
 	   remain (hi) and both halves match.  Otherwise NZCV is forced to
 	   0, which reads as NE and leaves the loop.  */
 	ccmp	data1, data2, 0, hi
 	ccmp	data1h, data2h, 0, eq
 	b.eq	L(loop16)
 	/* Either the data differs or the count ran out: recheck both halves
 	   of the chunk just loaded, low half first.  */
 	cmp	data1, data2
 	bne	L(return)
 	mov	data1, data1h
 	mov	data2, data2h
 	cmp	data1, data2
 	bne	L(return)
 	/* Compare last 1-16 bytes using unaligned access.  limit is <= 0
 	   here, so adding it backs the pointers up to 16 bytes before the
 	   end of the buffers.  */
 L(last_bytes):
 	add	src1, src1, limit
 	add	src2, src2, limit
 	ldp	data1, data1h, [src1]
 	ldp	data2, data2h, [src2]
 	cmp     data1, data2
 	bne	L(return)
 	mov	data1, data1h
 	mov	data2, data2h
 	cmp	data1, data2
 	/* Compare data bytes and set return value to 0, -1 or 1.
 	   On little-endian the words are byte-reversed first so that the
 	   unsigned compare orders them by the lowest-addressed byte.  */
 L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
 	cmp     data1, data2
 	/* Entered with flags already set: result = 0 if EQ, otherwise 1,
 	   negated to -1 when the first operand was lower (unsigned).  */
 L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
 	ret
 	.p2align 4
 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
 L(less8):
 	/* No carry out of the add means fewer than 4 bytes.  */
 	adds	limit, limit, 4
 	b.lo	L(less4)
 	/* 4..7 bytes: compare the first 4 with 32-bit loads.  */
 	ldr	data1w, [src1], 4
 	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
 	b.ne	L(return)
 	/* Undo the bias so the add at less4 yields the bytes remaining.  */
 	sub	limit, limit, 4
 L(less4):
 	/* limit = number of bytes still to compare (0..3).  */
 	adds	limit, limit, 4
 	beq	L(ret_eq)
 	/* Byte-at-a-time tail.  Loops while bytes remain and the last pair
 	   matched; returns the difference of the last pair loaded.  */
 L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
 	b.eq	L(byte_loop)
 	sub	result, data1w, data2w
 	ret
 END (memcmp)
--- a/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s
+++ b/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s
@ -0,0 +1,239 @@
 /*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */
 /* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
 #include "asmdefs.h"
 /* Arguments: dstin = destination, src = source, count = byte count.
    dstin (x0) is never written, so it is still the destination pointer
    when the function returns.  */
 #define dstin	x0
 #define src	x1
 #define count	x2
 /* Working pointers: dst is the aligned store cursor of the forward loop,
    srcend/dstend point one past the last source/destination byte.  */
 #define dst	x3
 #define srcend	x4
 #define dstend	x5
 /* Data temporaries, used in 16-byte pairs (X_l, X_h).  */
 #define A_l	x6
 #define A_lw	w6
 #define A_h	x7
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_lw	w10
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
 #define E_l	x14
 #define E_h	x15
 #define F_l	x16
 #define F_h	x17
 /* G and H alias the count/dst/src/srcend registers.  They are only loaded
    on paths that no longer read the aliased values afterwards.  */
 #define G_l	count
 #define G_h	dst
 #define H_l	src
 #define H_h	srcend
 /* tmp1 shares x14 with E_l.  */
 #define tmp1	x14
 /* This implementation handles overlaps and supports both memcpy and memmove
    from a single entry point.  It uses unaligned accesses and branchless
    sequences to keep the code small, simple and improve performance.
    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
    copies of up to 128 bytes, and large copies.  The overhead of the overlap
    check is negligible since it is only required for large copies.
    Large copies use a software pipelined loop processing 64 bytes per iteration.
    The destination pointer is 16-byte aligned to minimize unaligned accesses.
    The loop tail is handled by always copying 64 bytes from the end.
    For copies of up to 128 bytes every load is issued before the first
    store, which is what makes those paths safe for overlapping buffers.
 */
 ENTRY (memcpy)
 ENTRY_ALIAS (memmove)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
 	b.hi	L(copy_long)
 	cmp	count, 32
 	b.hi	L(copy32_128)
 	/* Small copies: 0..32 bytes.  */
 	cmp	count, 16
 	b.lo	L(copy16)
 	/* 16..32 bytes: first 16 and last 16 bytes, possibly overlapping.  */
 	ldp	A_l, A_h, [src]
 	ldp	D_l, D_h, [srcend, -16]
 	stp	A_l, A_h, [dstin]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 	/* Copy 8-15 bytes.  count < 16 here, so bit 3 set means 8..15.  */
 L(copy16):
 	tbz	count, 3, L(copy8)
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
 	.p2align 3
 	/* Copy 4-7 bytes.  */
 L(copy8):
 	tbz	count, 2, L(copy4)
 	ldr	A_lw, [src]
 	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
 	str	B_lw, [dstend, -4]
 	ret
 	/* Copy 0..3 bytes using a branchless sequence: the first byte, the
 	   byte at index count/2 and the last byte together cover every
 	   length from 1 to 3.  */
 L(copy4):
 	cbz	count, L(copy0)
 	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
 	ldrb	C_lw, [srcend, -1]
 	ldrb	B_lw, [src, tmp1]
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
 	strb	C_lw, [dstend, -1]
 L(copy0):
 	ret
 	.p2align 4
 	/* Medium copies: 33..128 bytes.  Load the first and last 32 bytes
 	   up front; they are needed for every length in this range.  */
 L(copy32_128):
 	ldp	A_l, A_h, [src]
 	ldp	B_l, B_h, [src, 16]
 	ldp	C_l, C_h, [srcend, -32]
 	ldp	D_l, D_h, [srcend, -16]
 	cmp	count, 64
 	b.hi	L(copy128)
 	stp	A_l, A_h, [dstin]
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 	.p2align 4
 	/* Copy 65..128 bytes.  */
 L(copy128):
 	ldp	E_l, E_h, [src, 32]
 	ldp	F_l, F_h, [src, 48]
 	cmp	count, 96
 	b.ls	L(copy96)
 	/* 97..128 bytes: also move the 32 bytes at srcend-64.  These loads
 	   overwrite count, dst, src and srcend (the G/H aliases); only
 	   dstin and dstend are used from here on.  */
 	ldp	G_l, G_h, [srcend, -64]
 	ldp	H_l, H_h, [srcend, -48]
 	stp	G_l, G_h, [dstend, -64]
 	stp	H_l, H_h, [dstend, -48]
 L(copy96):
 	stp	A_l, A_h, [dstin]
 	stp	B_l, B_h, [dstin, 16]
 	stp	E_l, E_h, [dstin, 32]
 	stp	F_l, F_h, [dstin, 48]
 	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 	.p2align 4
 	/* Copy more than 128 bytes.  */
 L(copy_long):
 	/* Use backwards copy if there is an overlap.  The unsigned test
 	   (dstin - src) < count is true when dstin lies inside the source
 	   range.  Identical pointers need no copy at all.  */
 	sub	tmp1, dstin, src
 	cbz	tmp1, L(copy0)
 	cmp	tmp1, count
 	b.lo	L(copy_long_backwards)
 	/* Copy 16 bytes and then align dst to 16-byte alignment.  src is
 	   moved back by the same misalignment so that src and dst stay in
 	   step.  */
 	ldp	D_l, D_h, [src]
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	sub	src, src, tmp1
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
 	ldp	B_l, B_h, [src, 32]
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 	b.ls	L(copy64_from_end)
 	/* Each iteration stores the 64 bytes loaded by the previous one and
 	   loads the next 64; the `!' forms advance src and dst by 64.  */
 L(loop64):
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
 	stp	B_l, B_h, [dst, 32]
 	ldp	B_l, B_h, [src, 32]
 	stp	C_l, C_h, [dst, 48]
 	ldp	C_l, C_h, [src, 48]
 	stp	D_l, D_h, [dst, 64]!
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 64
 	b.hi	L(loop64)
 	/* Write the last iteration and copy 64 bytes from the end.  */
 L(copy64_from_end):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
 	stp	B_l, B_h, [dst, 32]
 	ldp	B_l, B_h, [srcend, -32]
 	stp	C_l, C_h, [dst, 48]
 	ldp	C_l, C_h, [srcend, -16]
 	stp	D_l, D_h, [dst, 64]
 	stp	E_l, E_h, [dstend, -64]
 	stp	A_l, A_h, [dstend, -48]
 	stp	B_l, B_h, [dstend, -32]
 	stp	C_l, C_h, [dstend, -16]
 	ret
 	.p2align 4
 	/* Large backwards copy for overlapping copies.
 	   Copy 16 bytes and then align dst to 16-byte alignment.
 	   Mirror image of the forward path: dstend is the store cursor and
 	   both end pointers walk down towards the start.  */
 L(copy_long_backwards):
 	ldp	D_l, D_h, [srcend, -16]
 	and	tmp1, dstend, 15
 	sub	srcend, srcend, tmp1
 	sub	count, count, tmp1
 	ldp	A_l, A_h, [srcend, -16]
 	stp	D_l, D_h, [dstend, -16]
 	ldp	B_l, B_h, [srcend, -32]
 	ldp	C_l, C_h, [srcend, -48]
 	ldp	D_l, D_h, [srcend, -64]!
 	sub	dstend, dstend, tmp1
 	subs	count, count, 128
 	b.ls	L(copy64_from_start)
 L(loop64_backwards):
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [srcend, -16]
 	stp	B_l, B_h, [dstend, -32]
 	ldp	B_l, B_h, [srcend, -32]
 	stp	C_l, C_h, [dstend, -48]
 	ldp	C_l, C_h, [srcend, -48]
 	stp	D_l, D_h, [dstend, -64]!
 	ldp	D_l, D_h, [srcend, -64]!
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 	/* Write the last iteration and copy 64 bytes from the start.
 	   The G load overwrites count and dst, which are dead here.  */
 L(copy64_from_start):
 	ldp	G_l, G_h, [src, 48]
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [src, 32]
 	stp	B_l, B_h, [dstend, -32]
 	ldp	B_l, B_h, [src, 16]
 	stp	C_l, C_h, [dstend, -48]
 	ldp	C_l, C_h, [src]
 	stp	D_l, D_h, [dstend, -64]
 	stp	G_l, G_h, [dstin, 48]
 	stp	A_l, A_h, [dstin, 32]
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstin]
 	ret
 END (memcpy)
--- a/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s
+++ b/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s
@ -0,0 +1,127 @@
 /*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */
 /* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */
 #include "asmdefs.h"
 /* Build options local to this file.
    SAVE_FPU_REGISTERS: push q0 on entry and pop it before every return, so
    the caller's q0 is left as it was found (q0 is the fill register).
    SKIP_ZVA_CHECK: compile out the dczid_el0 query below, so the DC ZVA
    path is taken for every zero fill of at least 160 bytes.
    NOTE(review): this presumes DC ZVA is permitted and zeroes 64 bytes on
    every core this runs on -- confirm for the target hardware.  */
 #define SAVE_FPU_REGISTERS
 #define SKIP_ZVA_CHECK
 #ifdef SAVE_FPU_REGISTERS
 /* 16-byte push/pop of q0; sp moves by 16 in each direction.  */
 #define SAVE_Q0    str q0, [sp, #-16]!
 #define RESTORE_Q0 ldr q0, [sp], #16
 #else
 #define SAVE_Q0
 #define RESTORE_Q0
 #endif
 /* Every exit must use RETURN so the push done by SAVE_Q0 is undone.  The
    trailing .p2align 4 aligns whatever follows to 16 bytes.  */
 #define RETURN RESTORE_Q0; ret; .p2align 4
 /* Arguments: dstin = destination, valw = fill value (low byte is used),
    count = byte count.  dstin (x0) is never written, so it is still the
    destination pointer when the function returns.  */
 #define dstin	x0
 #define val	x1
 #define valw	w1
 #define count	x2
 /* dst is the aligned store cursor, dstend is one past the last byte.
    zva_val is only used when SKIP_ZVA_CHECK is not defined.  */
 #define dst	x3
 #define dstend	x4
 #define zva_val	x5
 ENTRY (memset)
    SAVE_Q0
 	/* Replicate the fill byte into all 16 bytes of q0.  */
 	dup	v0.16B, valw
 	add	dstend, dstin, count
 	cmp	count, 96
 	b.hi	L(set_long)
 	cmp	count, 16
 	b.hs	L(set_medium)
 	/* val = the fill byte repeated 8 times, for the scalar stores.  */
 	mov	val, v0.D[0]
 	/* Set 0..15 bytes.  Each size class writes from the start and from
 	   the end; the two stores may overlap.  */
 	tbz	count, 3, 1f
 	/* 8..15 bytes.  */
 	str	val, [dstin]
 	str	val, [dstend, -8]
    RETURN
 	/* 4..7 bytes.  */
 1:	tbz	count, 2, 2f
 	str	valw, [dstin]
 	str	valw, [dstend, -4]
    RETURN
 	/* 0..3 bytes: nothing for 0, one byte for 1, and for 2 or 3 a
 	   halfword at the end as well.  */
 2:	cbz	count, 3f
 	strb	valw, [dstin]
 	tbz	count, 1, 3f
 	strh	valw, [dstend, -2]
 3:	RETURN
 	/* Set 16..96 bytes.  */
 L(set_medium):
 	str	q0, [dstin]
 	tbnz	count, 6, L(set96)
 	/* 16..63 bytes: first and last 16, plus a second pair when bit 5
 	   of count is set (32..63).  */
 	str	q0, [dstend, -16]
 	tbz	count, 5, 1f
 	str	q0, [dstin, 16]
 	str	q0, [dstend, -32]
 1:	RETURN
 	.p2align 4
 	/* Set 64..96 bytes.  Write 64 bytes from the start and
 	   32 bytes from the end.  */
 L(set96):
 	str	q0, [dstin, 16]
 	stp	q0, q0, [dstin, 32]
 	stp	q0, q0, [dstend, -32]
    RETURN
 	.p2align 4
 	/* Set more than 96 bytes.  */
 L(set_long):
 	/* Isolate the fill byte so it can be tested against zero.  */
 	and	valw, valw, 255
 	bic	dst, dstin, 15
 	str	q0, [dstin]
 	/* Take the DC ZVA path only when count >= 160 and the byte is 0;
 	   for smaller counts the ccmp forces NE.  */
 	cmp	count, 160
 	ccmp	valw, 0, 0, hs
 	b.ne	L(no_zva)
 #ifndef SKIP_ZVA_CHECK
 	mrs	zva_val, dczid_el0
 	and	zva_val, zva_val, 31
 	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
 	b.ne	L(no_zva)
 #endif
 	/* Fill up to dst + 64 with ordinary stores, then round dst down to
 	   a 64-byte boundary for the DC ZVA loop.  */
 	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	bic	dst, dst, 63
 	sub	count, dstend, dst	/* Count is now 64 too large.  */
 	sub	count, count, 128	/* Adjust count and bias for loop.  */
 	.p2align 4
 L(zva_loop):
 	add	dst, dst, 64
 	dc	zva, dst
 	subs	count, count, 64
 	b.hi	L(zva_loop)
 	/* Finish with 64 bytes written back from the end.  */
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
    RETURN
 	/* Store loop for non-zero fills and for fills shorter than 160
 	   bytes: 64 bytes per iteration from the 16-byte aligned dst.  */
 L(no_zva):
 	sub	count, dstend, dst	/* Count is 16 too large.  */
 	sub	dst, dst, 16		/* Dst is biased by -32.  */
 	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
 L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]!
 	subs	count, count, 64
 	b.hi	L(no_zva_loop)
 	/* Finish with 64 bytes written back from the end.  */
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	RETURN
 END (memset)
--- a/libraries/libmesosphere/source/libc/kern_libc.arch.generic.c
+++ b/libraries/libmesosphere/source/libc/kern_libc.arch.generic.c
@ -61,6 +61,7 @@ QUICKREF
 /*SUPPRESS 20*/
 void *
 //__inhibit_loop_to_libcall
 __attribute__((weak))
 memmove (void *dst_void,
 	const void *src_void,
 	size_t length)
@ -169,6 +170,7 @@ QUICKREF
 	*/
 void *
 __attribute__((weak))
 memcpy (void * dst0,
 	const void * __restrict src0,
 	size_t len0)
@ -259,6 +261,7 @@ QUICKREF
 #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
 void *
 __attribute__((weak))
 memset (void *m,
 	int c,
 	size_t n)
@ -357,6 +360,7 @@ QUICKREF
 #define TOO_SMALL(LEN)  ((LEN) < LBLOCKSIZE)
 int
 __attribute__((weak))
 memcmp (const void *m1,
 	const void *m2,
 	size_t n)