From 6c52cc3e26d9dde4292dc35e5c306fd6db21032e Mon Sep 17 00:00:00 2001 From: Michael Scire Date: Wed, 11 Mar 2020 08:45:00 -0700 Subject: [PATCH] memset: use neon-less impl, reformat other asm --- .../source/libc/arch/arm64/asmdefs.h | 22 +- .../libc/arch/arm64/memcmp.arch.arm64.s | 182 ++++----- .../libc/arch/arm64/memcpy.arch.arm64.s | 356 +++++++++--------- .../libc/arch/arm64/memset.arch.arm64.s | 199 ++++++---- 4 files changed, 401 insertions(+), 358 deletions(-) diff --git a/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h b/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h index 7d143a969..edc4e66c4 100644 --- a/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h +++ b/libraries/libmesosphere/source/libc/arch/arm64/asmdefs.h @@ -8,22 +8,22 @@ #ifndef _ASMDEFS_H #define _ASMDEFS_H -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ .cfi_startproc; -#define ENTRY(name) ENTRY_ALIGN(name, 6) +#define ENTRY(name) ENTRY_ALIGN(name, 6) -#define ENTRY_ALIAS(name) \ - .global name; \ - .type name,%function; \ +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ name: -#define END(name) \ - .cfi_endproc; \ +#define END(name) \ + .cfi_endproc; \ .size name, .-name; #define L(l) .L ## l diff --git a/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s b/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s index 95fb1d2d2..609f7e208 100644 --- a/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s +++ b/libraries/libmesosphere/source/libc/arch/arm64/memcmp.arch.arm64.s @@ -12,122 +12,122 @@ #include "asmdefs.h" /* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 /* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY (memcmp) - subs limit, limit, 8 - b.lo L(less8) + subs limit, limit, 8 + b.lo L(less8) - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) - subs limit, limit, 8 - b.gt L(more16) + subs limit, limit, 8 + b.gt L(more16) - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 - b.ls L(last_bytes) + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. 
*/ + cmp limit, 96 + b.ls L(loop16) - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ - .p2align 4 + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h - cmp data1, data2 - bne L(return) + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) - /* Compare last 1-16 bytes using unaligned access. */ + /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h - cmp data1, data2 + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 - /* Compare data bytes and set return value to 0, -1 or 1. */ + /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): #ifndef __AARCH64EB__ - rev data1, data1 - rev data2, data2 + rev data1, data1 + rev data2, data2 #endif - cmp data1, data2 + cmp data1, data2 L(ret_eq): - cset result, ne - cneg result, result, lo - ret + cset result, ne + cneg result, result, lo + ret - .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 - cmp data1w, data2w - b.ne L(return) - sub limit, limit, 4 + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 L(less4): - adds limit, limit, 4 - beq L(ret_eq) + adds limit, limit, 4 + beq L(ret_eq) L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) - sub result, data1w, data2w - ret + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
*/ + b.eq L(byte_loop) + sub result, data1w, data2w + ret END (memcmp) diff --git a/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s b/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s index 5e76e13a7..02ed1dd89 100644 --- a/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s +++ b/libraries/libmesosphere/source/libc/arch/arm64/memcpy.arch.arm64.s @@ -13,32 +13,32 @@ #include "asmdefs.h" -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_lw w10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l x14 -#define E_h x15 -#define F_l x16 -#define F_h x17 -#define G_l count -#define G_h dst -#define H_l src -#define H_h srcend -#define tmp1 x14 +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 /* This implementation handles overlaps and supports both memcpy and memmove from a single entry point. It uses unaligned accesses and branchless @@ -55,185 +55,185 @@ ENTRY (memcpy) ENTRY_ALIAS (memmove) - add srcend, src, count - add dstend, dstin, count - cmp count, 128 - b.hi L(copy_long) - cmp count, 32 - b.hi L(copy32_128) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) - /* Small copies: 0..32 bytes. */ - cmp count, 16 - b.lo L(copy16) - ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret - /* Copy 8-15 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - tbz count, 3, L(copy8) - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] - str A_h, [dstend, -8] - ret + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret - .p2align 3 - /* Copy 4-7 bytes. */ + .p2align 3 + /* Copy 4-7 bytes. */ L(copy8): - tbz count, 2, L(copy4) - ldr A_lw, [src] - ldr B_lw, [srcend, -4] - str A_lw, [dstin] - str B_lw, [dstend, -4] - ret + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret - /* Copy 0..3 bytes using a branchless sequence. */ + /* Copy 0..3 bytes using a branchless sequence. */ L(copy4): - cbz count, L(copy0) - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb C_lw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] L(copy0): - ret + ret - .p2align 4 - /* Medium copies: 33..128 bytes. */ + .p2align 4 + /* Medium copies: 33..128 bytes. 
*/ L(copy32_128): - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - ldp D_l, D_h, [srcend, -16] - cmp count, 64 - b.hi L(copy128) - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] - stp D_l, D_h, [dstend, -16] - ret + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret - .p2align 4 - /* Copy 65..128 bytes. */ + .p2align 4 + /* Copy 65..128 bytes. */ L(copy128): - ldp E_l, E_h, [src, 32] - ldp F_l, F_h, [src, 48] - cmp count, 96 - b.ls L(copy96) - ldp G_l, G_h, [srcend, -64] - ldp H_l, H_h, [srcend, -48] - stp G_l, G_h, [dstend, -64] - stp H_l, H_h, [dstend, -48] + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] L(copy96): - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp E_l, E_h, [dstin, 32] - stp F_l, F_h, [dstin, 48] - stp C_l, C_h, [dstend, -32] - stp D_l, D_h, [dstend, -16] - ret + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret - .p2align 4 - /* Copy more than 128 bytes. */ + .p2align 4 + /* Copy more than 128 bytes. */ L(copy_long): - /* Use backwards copy if there is an overlap. */ - sub tmp1, dstin, src - cbz tmp1, L(copy0) - cmp tmp1, count - b.lo L(copy_long_backwards) + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) - /* Copy 16 bytes and then align dst to 16-byte alignment. */ + /* Copy 16 bytes and then align dst to 16-byte alignment. */ - ldp D_l, D_h, [src] - and tmp1, dstin, 15 - bic dst, dstin, 15 - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(copy64_from_end) + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) L(loop64): - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! - subs count, count, 64 - b.hi L(loop64) + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) - /* Write the last iteration and copy 64 bytes from the end. */ + /* Write the last iteration and copy 64 bytes from the end. 
*/ L(copy64_from_end): - ldp E_l, E_h, [srcend, -64] - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [srcend, -48] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [srcend, -16] - stp D_l, D_h, [dst, 64] - stp E_l, E_h, [dstend, -64] - stp A_l, A_h, [dstend, -48] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] - ret + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret - .p2align 4 + .p2align 4 - /* Large backwards copy for overlapping copies. - Copy 16 bytes and then align dst to 16-byte alignment. */ + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ L(copy_long_backwards): - ldp D_l, D_h, [srcend, -16] - and tmp1, dstend, 15 - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls L(copy64_from_start) + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) L(loop64_backwards): - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! - subs count, count, 64 - b.hi L(loop64_backwards) + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) - /* Write the last iteration and copy 64 bytes from the start. */ + /* Write the last iteration and copy 64 bytes from the start. 
*/ L(copy64_from_start): - ldp G_l, G_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp G_l, G_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] - ret + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret END (memcpy) diff --git a/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s b/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s index 91d230841..d8d272726 100644 --- a/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s +++ b/libraries/libmesosphere/source/libc/arch/arm64/memset.arch.arm64.s @@ -13,115 +13,158 @@ #include "asmdefs.h" -#define SAVE_FPU_REGISTERS -#define SKIP_ZVA_CHECK - -#ifdef SAVE_FPU_REGISTERS -#define SAVE_Q0 str q0, [sp, #-16]! -#define RESTORE_Q0 ldr q0, [sp], #16 -#else -#define SAVE_Q0 -#define RESTORE_Q0 -#endif - -#define RETURN RESTORE_Q0; ret; .p2align 4 +#define DC_ZVA_THRESHOLD 512 #define dstin x0 -#define val x1 +#define val x1 #define valw w1 #define count x2 -#define dst x3 +#define dst x3 #define dstend x4 #define zva_val x5 ENTRY (memset) - SAVE_Q0 - dup v0.16B, valw - add dstend, dstin, count + bfi valw, valw, 8, 8 + bfi valw, valw, 16, 16 + bfi val, val, 32, 32 - cmp count, 96 + add dstend, dstin, count + + cmp count, 96 b.hi L(set_long) - cmp count, 16 + cmp count, 16 b.hs L(set_medium) - mov val, v0.D[0] /* Set 0..15 bytes. */ - tbz count, 3, 1f - str val, [dstin] - str val, [dstend, -8] - RETURN -1: tbz count, 2, 2f - str valw, [dstin] - str valw, [dstend, -4] - RETURN -2: cbz count, 3f + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f strb valw, [dstin] - tbz count, 1, 3f + tbz count, 1, 3f strh valw, [dstend, -2] -3: RETURN +3: ret - /* Set 17..96 bytes. */ + /* Set 16..96 bytes. */ + .p2align 4 L(set_medium): - str q0, [dstin] + stp val, val, [dstin] tbnz count, 6, L(set96) - str q0, [dstend, -16] - tbz count, 5, 1f - str q0, [dstin, 16] - str q0, [dstend, -32] -1: RETURN + stp val, val, [dstend, -16] + tbz count, 5, 1f + stp val, val, [dstin, 16] + stp val, val, [dstend, -32] +1: ret .p2align 4 /* Set 64..96 bytes. Write 64 bytes from the start and 32 bytes from the end. */ L(set96): - str q0, [dstin, 16] - stp q0, q0, [dstin, 32] - stp q0, q0, [dstend, -32] - RETURN + stp val, val, [dstin, 16] + stp val, val, [dstin, 32] + stp val, val, [dstin, 48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret .p2align 4 L(set_long): - and valw, valw, 255 - bic dst, dstin, 15 - str q0, [dstin] - cmp count, 160 - ccmp valw, 0, 0, hs - b.ne L(no_zva) - -#ifndef SKIP_ZVA_CHECK - mrs zva_val, dczid_el0 - and zva_val, zva_val, 31 - cmp zva_val, 4 /* ZVA size is 64 bytes. */ - b.ne L(no_zva) + stp val, val, [dstin] + bic dst, dstin, 15 +#if DC_ZVA_THRESHOLD + cmp count, DC_ZVA_THRESHOLD + ccmp val, 0, 0, cs + b.eq L(zva_64) #endif - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - sub count, dstend, dst /* Count is now 64 too large. */ - sub count, count, 128 /* Adjust count and bias for loop. 
*/ + /* Small-size or non-zero memset does not use DC ZVA. */ + sub count, dstend, dst + /* + * Adjust count and bias for loop. By substracting extra 1 from count, + * it is easy to use tbz instruction to check whether loop tailing + * count is less than 33 bytes, so as to bypass 2 unneccesary stps. + */ + sub count, count, 64+16+1 + +#if DC_ZVA_THRESHOLD + /* Align loop on 16-byte boundary, this might be friendly to i-cache. */ + nop +#endif + +1: stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + stp val, val, [dst, 64]! + subs count, count, 64 + b.hs 1b + + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ + stp val, val, [dst, 16] + stp val, val, [dst, 32] +1: stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret + +#if DC_ZVA_THRESHOLD .p2align 4 -L(zva_loop): - add dst, dst, 64 - dc zva, dst - subs count, count, 64 - b.hi L(zva_loop) - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - RETURN +L(zva_64): + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + bic dst, dst, 63 -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -L(no_zva_loop): - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! + /* + * Previous memory writes might cross cache line boundary, and cause + * cache line partially dirty. Zeroing this kind of cache line using + * DC ZVA will incur extra cost, for it requires loading untouched + * part of the line from memory before zeoring. + * + * So, write the first 64 byte aligned block using stp to force + * fully dirty cache line. + */ + stp val, val, [dst, 64] + stp val, val, [dst, 80] + stp val, val, [dst, 96] + stp val, val, [dst, 112] + + sub count, dstend, dst + /* + * Adjust count and bias for loop. By substracting extra 1 from count, + * it is easy to use tbz instruction to check whether loop tailing + * count is less than 33 bytes, so as to bypass 2 unneccesary stps. + */ + sub count, count, 128+64+64+1 + add dst, dst, 128 + nop + + /* DC ZVA sets 64 bytes each time. */ +1: dc zva, dst + add dst, dst, 64 subs count, count, 64 - b.hi L(no_zva_loop) - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - RETURN + b.hs 1b + + /* + * Write the last 64 byte aligned block using stp to force fully + * dirty cache line. + */ + stp val, val, [dst, 0] + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ + stp val, val, [dst, 64] + stp val, val, [dst, 80] +1: stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret +#endif + END (memset)
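
A detail worth spelling out in the memcmp routine is the return path at L(return): on little-endian targets it byte-reverses both 8-byte chunks so the byte at the lowest address becomes the most significant one, then derives -1/0/1 from a single unsigned comparison via the cset/cneg pair. A minimal C sketch of that chunk comparison, assuming a little-endian target and a GCC/Clang toolchain for __builtin_bswap64 (the helper name is illustrative, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Compare one 8-byte chunk the way L(return) does: byte-swap both
       words so the byte at the lowest address becomes the most
       significant byte, then order them as unsigned 64-bit integers. */
    static int cmp_chunk8(const void *a, const void *b)
    {
        uint64_t x, y;
        memcpy(&x, a, 8);             /* unaligned-safe load, like ldr   */
        memcpy(&y, b, 8);
        x = __builtin_bswap64(x);     /* rev data1, data1                */
        y = __builtin_bswap64(y);     /* rev data2, data2                */
        if (x == y)
            return 0;                 /* cset result, ne yields 0        */
        return x < y ? -1 : 1;        /* cset ..., ne; cneg ..., lo      */
    }

On big-endian builds the rev pair is compiled out (the #ifndef __AARCH64EB__ guard), since the loads already place the first byte in the most significant position.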
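
The copy routine serves memcpy and memmove from a single entry point. Copies of at most 128 bytes perform all loads before any stores, so they are overlap-safe by construction; only L(copy_long) needs an explicit check, and it uses the unsigned difference dst - src: if that difference is smaller than count, the destination starts inside the source range, and the copy proceeds backwards from the end. A rough C sketch of just that dispatch decision (the helper name and the plain byte loops are illustrative stand-ins for the 64-byte-per-iteration assembly loops):

    #include <stddef.h>
    #include <stdint.h>

    /* Overlap dispatch used for copies larger than 128 bytes: the
       unsigned difference dst - src is < count exactly when dst lies
       inside [src, src + count), the only case where a forward copy
       would overwrite source bytes before they have been read. */
    static void *copy_dispatch(void *dst, const void *src, size_t count)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;
        uintptr_t diff = (uintptr_t)d - (uintptr_t)s;

        if (diff == 0)                   /* cbz tmp1, L(copy0) */
            return dst;

        if (diff >= count) {             /* forward: L(copy_long) */
            for (size_t i = 0; i < count; i++)
                d[i] = s[i];
        } else {                         /* backward: L(copy_long_backwards) */
            for (size_t i = count; i-- > 0; )
                d[i] = s[i];
        }
        return dst;
    }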
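
The memset rewrite is the main change in this commit: the previous version built the fill pattern in q0 with dup v0.16B, valw and had to spill and restore q0 around every return (SAVE_Q0/RESTORE_Q0), whereas the new code stays entirely in general-purpose registers, so no SIMD state is touched (hence "neon-less" in the subject). The fill byte is widened into a 64-bit pattern with three bfi instructions, stores go out as stp of that pattern, and DC ZVA (assumed to clear 64 bytes per operation) is reserved for zero fills of at least DC_ZVA_THRESHOLD (512) bytes. A hedged C sketch of the pattern widening and the threshold test (helper names are illustrative; the constant mirrors the patch):

    #include <stddef.h>
    #include <stdint.h>

    #define DC_ZVA_THRESHOLD 512    /* same value the patch defines */

    /* Widen the fill byte as the three bfi instructions do:
       0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB. */
    static uint64_t replicate_byte(unsigned char c)
    {
        uint64_t v = c;
        v |= v << 8;        /* bfi valw, valw, 8, 8   */
        v |= v << 16;       /* bfi valw, valw, 16, 16 */
        v |= v << 32;       /* bfi val,  val,  32, 32 */
        return v;
    }

    /* The DC ZVA path (L(zva_64)) is taken only for large all-zero
       fills: cmp count, DC_ZVA_THRESHOLD; ccmp val, 0, 0, cs; b.eq. */
    static int use_dc_zva(uint64_t pattern, size_t count)
    {
        return count >= DC_ZVA_THRESHOLD && pattern == 0;
    }

As the patch's own comments explain, the first and last 64-byte aligned blocks around the dc zva loop are written with stp so those cache lines are fully dirtied by ordinary stores, keeping the zeroing loop itself on whole, untouched lines.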