memset: use neon-less impl, reformat other asm

Michael Scire 2020-03-11 08:45:00 -07:00
parent e42d3a3abf
commit 6c52cc3e26
4 changed files with 401 additions and 358 deletions


@@ -8,22 +8,22 @@
#ifndef _ASMDEFS_H
#define _ASMDEFS_H

#define ENTRY_ALIGN(name, alignment) \
    .global name; \
    .type name,%function; \
    .align alignment; \
    name: \
    .cfi_startproc;

#define ENTRY(name) ENTRY_ALIGN(name, 6)

#define ENTRY_ALIAS(name) \
    .global name; \
    .type name,%function; \
    name:

#define END(name) \
    .cfi_endproc; \
    .size name, .-name;

#define L(l) .L ## l


@@ -12,122 +12,122 @@
#include "asmdefs.h"

/* Parameters and result. */
#define src1    x0
#define src2    x1
#define limit   x2
#define result  w0

/* Internal variables. */
#define data1   x3
#define data1w  w3
#define data1h  x4
#define data2   x5
#define data2w  w5
#define data2h  x6
#define tmp1    x7
#define tmp2    x8

ENTRY (memcmp)
    subs    limit, limit, 8
    b.lo    L(less8)
    ldr     data1, [src1], 8
    ldr     data2, [src2], 8
    cmp     data1, data2
    b.ne    L(return)

    subs    limit, limit, 8
    b.gt    L(more16)

    ldr     data1, [src1, limit]
    ldr     data2, [src2, limit]
    b       L(return)

L(more16):
    ldr     data1, [src1], 8
    ldr     data2, [src2], 8
    cmp     data1, data2
    bne     L(return)

    /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
       strings. */
    subs    limit, limit, 16
    b.ls    L(last_bytes)

    /* We overlap loads between 0-32 bytes at either side of SRC1 when we
       try to align, so limit it only to strings larger than 128 bytes. */
    cmp     limit, 96
    b.ls    L(loop16)

    /* Align src1 and adjust src2 with bytes not yet done. */
    and     tmp1, src1, 15
    add     limit, limit, tmp1
    sub     src1, src1, tmp1
    sub     src2, src2, tmp1

    /* Loop performing 16 bytes per iteration using aligned src1.
       Limit is pre-decremented by 16 and must be larger than zero.
       Exit if <= 16 bytes left to do or if the data is not equal. */
    .p2align 4
L(loop16):
    ldp     data1, data1h, [src1], 16
    ldp     data2, data2h, [src2], 16
    subs    limit, limit, 16
    ccmp    data1, data2, 0, hi
    ccmp    data1h, data2h, 0, eq
    b.eq    L(loop16)

    cmp     data1, data2
    bne     L(return)
    mov     data1, data1h
    mov     data2, data2h
    cmp     data1, data2
    bne     L(return)

    /* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
    add     src1, src1, limit
    add     src2, src2, limit
    ldp     data1, data1h, [src1]
    ldp     data2, data2h, [src2]
    cmp     data1, data2
    bne     L(return)
    mov     data1, data1h
    mov     data2, data2h
    cmp     data1, data2

    /* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
    rev     data1, data1
    rev     data2, data2
#endif
    cmp     data1, data2
L(ret_eq):
    cset    result, ne
    cneg    result, result, lo
    ret

    .p2align 4
    /* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less8):
    adds    limit, limit, 4
    b.lo    L(less4)
    ldr     data1w, [src1], 4
    ldr     data2w, [src2], 4
    cmp     data1w, data2w
    b.ne    L(return)
    sub     limit, limit, 4
L(less4):
    adds    limit, limit, 4
    beq     L(ret_eq)
L(byte_loop):
    ldrb    data1w, [src1], 1
    ldrb    data2w, [src2], 1
    subs    limit, limit, 1
    ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000. */
    b.eq    L(byte_loop)
    sub     result, data1w, data2w
    ret

END (memcmp)
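Note: the return-value computation above relies on a byte-order trick: on little-endian AArch64, rev byte-reverses each 64-bit word so that an unsigned compare of the reversed words orders them by their first differing byte, and cset/cneg turn the flags into 0, 1, or -1. A minimal C sketch of the same idea (hypothetical helper name, little-endian assumed, GCC/Clang __builtin_bswap64):

#include <stdint.h>
#include <string.h>

/* Sketch of memcmp's final step: rank two 8-byte chunks.  Byte-swapping
   converts little-endian memory order into integer order, so a plain
   unsigned compare orders the chunks by their first differing byte,
   exactly as memcmp requires. */
static int compare_chunks(const unsigned char *p1, const unsigned char *p2)
{
    uint64_t a, b;
    memcpy(&a, p1, 8);            /* unaligned-safe 8-byte load */
    memcpy(&b, p2, 8);
    a = __builtin_bswap64(a);     /* mirrors the 'rev' instructions */
    b = __builtin_bswap64(b);
    if (a == b)
        return 0;
    return a < b ? -1 : 1;        /* mirrors 'cset' + 'cneg' */
}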


@@ -13,32 +13,32 @@
#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless

@@ -55,185 +55,185 @@
ENTRY (memcpy)
ENTRY_ALIAS (memmove)
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 128
    b.hi    L(copy_long)
    cmp     count, 32
    b.hi    L(copy32_128)

    /* Small copies: 0..32 bytes. */
    cmp     count, 16
    b.lo    L(copy16)
    ldp     A_l, A_h, [src]
    ldp     D_l, D_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    /* Copy 8-15 bytes. */
L(copy16):
    tbz     count, 3, L(copy8)
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret

    .p2align 3
    /* Copy 4-7 bytes. */
L(copy8):
    tbz     count, 2, L(copy4)
    ldr     A_lw, [src]
    ldr     B_lw, [srcend, -4]
    str     A_lw, [dstin]
    str     B_lw, [dstend, -4]
    ret

    /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
    cbz     count, L(copy0)
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    C_lw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    C_lw, [dstend, -1]
L(copy0):
    ret

    .p2align 4
    /* Medium copies: 33..128 bytes. */
L(copy32_128):
    ldp     A_l, A_h, [src]
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    ldp     D_l, D_h, [srcend, -16]
    cmp     count, 64
    b.hi    L(copy128)
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy 65..128 bytes. */
L(copy128):
    ldp     E_l, E_h, [src, 32]
    ldp     F_l, F_h, [src, 48]
    cmp     count, 96
    b.ls    L(copy96)
    ldp     G_l, G_h, [srcend, -64]
    ldp     H_l, H_h, [srcend, -48]
    stp     G_l, G_h, [dstend, -64]
    stp     H_l, H_h, [dstend, -48]
L(copy96):
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     E_l, E_h, [dstin, 32]
    stp     F_l, F_h, [dstin, 48]
    stp     C_l, C_h, [dstend, -32]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy more than 128 bytes. */
L(copy_long):
    /* Use backwards copy if there is an overlap. */
    sub     tmp1, dstin, src
    cbz     tmp1, L(copy0)
    cmp     tmp1, count
    b.lo    L(copy_long_backwards)

    /* Copy 16 bytes and then align dst to 16-byte alignment. */
    ldp     D_l, D_h, [src]
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    sub     src, src, tmp1
    add     count, count, tmp1      /* Count is now 16 too large. */
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  /* Test and readjust count. */
    b.ls    L(copy64_from_end)

L(loop64):
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    L(loop64)

    /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

    .p2align 4
    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
    ldp     D_l, D_h, [srcend, -16]
    and     tmp1, dstend, 15
    sub     srcend, srcend, tmp1
    sub     count, count, tmp1
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start)

L(loop64_backwards):
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    L(loop64_backwards)

    /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
    ldp     G_l, G_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     G_l, G_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
    ret

END (memcpy)
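Note: the single-entry-point overlap handling at L(copy_long) works because dstin - src is treated as an unsigned value: the subtraction wraps modulo 2^64, so one comparison against count covers both "dst before src" and "no overlap at all", and only a destination that starts inside the source region forces the backward path. A rough C sketch of that decision (illustrative helper name, not part of the project):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the forward/backward choice used by the combined
   memcpy/memmove entry point: a forward copy clobbers its own source
   only when 0 < dst - src < count (computed modulo 2^64). */
static int must_copy_backwards(const void *dst, const void *src, size_t count)
{
    uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;
    if (diff == 0)
        return 0;              /* dst == src: nothing to do (L(copy0)) */
    return diff < count;       /* dst starts inside the source region */
}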


@@ -13,115 +13,158 @@
 #include "asmdefs.h"
 
-#define SAVE_FPU_REGISTERS
-#define SKIP_ZVA_CHECK
-
-#ifdef SAVE_FPU_REGISTERS
-#define SAVE_Q0    str q0, [sp, #-16]!
-#define RESTORE_Q0 ldr q0, [sp], #16
-#else
-#define SAVE_Q0
-#define RESTORE_Q0
-#endif
-
-#define RETURN RESTORE_Q0; ret; .p2align 4
+#define DC_ZVA_THRESHOLD 512
 
 #define dstin   x0
 #define val     x1
 #define valw    w1
 #define count   x2
 #define dst     x3
 #define dstend  x4
 #define zva_val x5
 
 ENTRY (memset)
-    SAVE_Q0
-
-    dup     v0.16B, valw
-    add     dstend, dstin, count
+    bfi     valw, valw, 8, 8
+    bfi     valw, valw, 16, 16
+    bfi     val, val, 32, 32
+
+    add     dstend, dstin, count
 
     cmp     count, 96
     b.hi    L(set_long)
     cmp     count, 16
     b.hs    L(set_medium)
-    mov     val, v0.D[0]
 
     /* Set 0..15 bytes. */
     tbz     count, 3, 1f
     str     val, [dstin]
     str     val, [dstend, -8]
-    RETURN
+    ret
 1:  tbz     count, 2, 2f
     str     valw, [dstin]
     str     valw, [dstend, -4]
-    RETURN
+    ret
 2:  cbz     count, 3f
     strb    valw, [dstin]
     tbz     count, 1, 3f
     strh    valw, [dstend, -2]
-3:  RETURN
+3:  ret
 
-    /* Set 17..96 bytes. */
+    /* Set 16..96 bytes. */
+    .p2align 4
 L(set_medium):
-    str     q0, [dstin]
+    stp     val, val, [dstin]
     tbnz    count, 6, L(set96)
-    str     q0, [dstend, -16]
+    stp     val, val, [dstend, -16]
     tbz     count, 5, 1f
-    str     q0, [dstin, 16]
-    str     q0, [dstend, -32]
-1:  RETURN
+    stp     val, val, [dstin, 16]
+    stp     val, val, [dstend, -32]
+1:  ret
 
     .p2align 4
    /* Set 64..96 bytes. Write 64 bytes from the start and
       32 bytes from the end. */
 L(set96):
-    str     q0, [dstin, 16]
-    stp     q0, q0, [dstin, 32]
-    stp     q0, q0, [dstend, -32]
-    RETURN
+    stp     val, val, [dstin, 16]
+    stp     val, val, [dstin, 32]
+    stp     val, val, [dstin, 48]
+    stp     val, val, [dstend, -32]
+    stp     val, val, [dstend, -16]
+    ret
 
     .p2align 4
 L(set_long):
-    and     valw, valw, 255
+    stp     val, val, [dstin]
     bic     dst, dstin, 15
-    str     q0, [dstin]
-    cmp     count, 160
-    ccmp    valw, 0, 0, hs
-    b.ne    L(no_zva)
-#ifndef SKIP_ZVA_CHECK
-    mrs     zva_val, dczid_el0
-    and     zva_val, zva_val, 31
-    cmp     zva_val, 4              /* ZVA size is 64 bytes. */
-    b.ne    L(no_zva)
+#if DC_ZVA_THRESHOLD
+    cmp     count, DC_ZVA_THRESHOLD
+    ccmp    val, 0, 0, cs
+    b.eq    L(zva_64)
 #endif
-    str     q0, [dst, 16]
-    stp     q0, q0, [dst, 32]
-    bic     dst, dst, 63
-    sub     count, dstend, dst      /* Count is now 64 too large. */
-    sub     count, count, 128       /* Adjust count and bias for loop. */
-
-    .p2align 4
-L(zva_loop):
-    add     dst, dst, 64
-    dc      zva, dst
-    subs    count, count, 64
-    b.hi    L(zva_loop)
-    stp     q0, q0, [dstend, -64]
-    stp     q0, q0, [dstend, -32]
-    RETURN
-
-L(no_zva):
-    sub     count, dstend, dst      /* Count is 16 too large. */
-    sub     dst, dst, 16            /* Dst is biased by -32. */
-    sub     count, count, 64 + 16   /* Adjust count and bias for loop. */
-L(no_zva_loop):
-    stp     q0, q0, [dst, 32]
-    stp     q0, q0, [dst, 64]!
-    subs    count, count, 64
-    b.hi    L(no_zva_loop)
-    stp     q0, q0, [dstend, -64]
-    stp     q0, q0, [dstend, -32]
-    RETURN
+    /* Small-size or non-zero memset does not use DC ZVA. */
+    sub     count, dstend, dst
+
+    /*
+     * Adjust count and bias for loop. By substracting extra 1 from count,
+     * it is easy to use tbz instruction to check whether loop tailing
+     * count is less than 33 bytes, so as to bypass 2 unneccesary stps.
+     */
+    sub     count, count, 64+16+1
+
+#if DC_ZVA_THRESHOLD
+    /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
+    nop
+#endif
+
+1:  stp     val, val, [dst, 16]
+    stp     val, val, [dst, 32]
+    stp     val, val, [dst, 48]
+    stp     val, val, [dst, 64]!
+    subs    count, count, 64
+    b.hs    1b
+
+    tbz     count, 5, 1f    /* Remaining count is less than 33 bytes? */
+    stp     val, val, [dst, 16]
+    stp     val, val, [dst, 32]
+1:  stp     val, val, [dstend, -32]
+    stp     val, val, [dstend, -16]
+    ret
+
+#if DC_ZVA_THRESHOLD
+    .p2align 4
+L(zva_64):
+    stp     val, val, [dst, 16]
+    stp     val, val, [dst, 32]
+    stp     val, val, [dst, 48]
+    bic     dst, dst, 63
+
+    /*
+     * Previous memory writes might cross cache line boundary, and cause
+     * cache line partially dirty. Zeroing this kind of cache line using
+     * DC ZVA will incur extra cost, for it requires loading untouched
+     * part of the line from memory before zeoring.
+     *
+     * So, write the first 64 byte aligned block using stp to force
+     * fully dirty cache line.
+     */
+    stp     val, val, [dst, 64]
+    stp     val, val, [dst, 80]
+    stp     val, val, [dst, 96]
+    stp     val, val, [dst, 112]
+
+    sub     count, dstend, dst
+    /*
+     * Adjust count and bias for loop. By substracting extra 1 from count,
+     * it is easy to use tbz instruction to check whether loop tailing
+     * count is less than 33 bytes, so as to bypass 2 unneccesary stps.
+     */
+    sub     count, count, 128+64+64+1
+    add     dst, dst, 128
+    nop
+
+    /* DC ZVA sets 64 bytes each time. */
+1:  dc      zva, dst
+    add     dst, dst, 64
+    subs    count, count, 64
+    b.hs    1b
+
+    /*
+     * Write the last 64 byte aligned block using stp to force fully
+     * dirty cache line.
+     */
+    stp     val, val, [dst, 0]
+    stp     val, val, [dst, 16]
+    stp     val, val, [dst, 32]
+    stp     val, val, [dst, 48]
+
+    tbz     count, 5, 1f    /* Remaining count is less than 33 bytes? */
+    stp     val, val, [dst, 64]
+    stp     val, val, [dst, 80]
+1:  stp     val, val, [dstend, -32]
+    stp     val, val, [dstend, -16]
+    ret
+#endif
 
 END (memset)
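Note: the new entry sequence replaces the NEON dup v0.16B, valw with three bfi instructions that replicate the fill byte across a 64-bit general-purpose register, so the routine no longer touches SIMD state (which is presumably why the SAVE_Q0/RESTORE_Q0 machinery could be dropped). The cmp/ccmp pair before b.eq L(zva_64) then routes only large, all-zero fills to DC ZVA. A rough C sketch of that front end (hypothetical helper names; the 512-byte threshold mirrors DC_ZVA_THRESHOLD above):

#include <stdint.h>
#include <stddef.h>

#define DC_ZVA_THRESHOLD 512

/* Replicate the low byte of 'value' into all eight bytes, the same effect
   as the bfi chain: byte -> halfword -> word -> doubleword. */
static uint64_t replicate_byte(uint64_t value)
{
    value &= 0xff;
    value |= value << 8;     /* bfi valw, valw, 8, 8   */
    value |= value << 16;    /* bfi valw, valw, 16, 16 */
    value |= value << 32;    /* bfi val, val, 32, 32   */
    return value;
}

/* Mirror of the set_long gate: only a zero fill of at least
   DC_ZVA_THRESHOLD bytes takes the DC ZVA path. */
static int should_use_dc_zva(uint64_t replicated_val, size_t count)
{
    return count >= DC_ZVA_THRESHOLD && replicated_val == 0;
}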