Mirror of https://github.com/Atmosphere-NX/Atmosphere, synced 2025-01-08 21:47:57 +00:00
memset: use neon-less impl, reformat other asm
This commit is contained in: parent e42d3a3abf, commit 6c52cc3e26
4 changed files with 401 additions and 358 deletions
@@ -8,22 +8,22 @@ (asmdefs.h)
#ifndef _ASMDEFS_H
#define _ASMDEFS_H

#define ENTRY_ALIGN(name, alignment) \
  .global name; \
  .type name,%function; \
  .align alignment; \
  name: \
  .cfi_startproc;

#define ENTRY(name) ENTRY_ALIGN(name, 6)

#define ENTRY_ALIAS(name) \
  .global name; \
  .type name,%function; \
  name:

#define END(name) \
  .cfi_endproc; \
  .size name, .-name;

#define L(l) .L ## l

@@ -12,122 +12,122 @@ (memcmp)
#include "asmdefs.h"

/* Parameters and result. */
#define src1   x0
#define src2   x1
#define limit  x2
#define result w0

/* Internal variables. */
#define data1  x3
#define data1w w3
#define data1h x4
#define data2  x5
#define data2w w5
#define data2h x6
#define tmp1   x7
#define tmp2   x8

ENTRY (memcmp)
        subs limit, limit, 8
        b.lo L(less8)

        ldr data1, [src1], 8
        ldr data2, [src2], 8
        cmp data1, data2
        b.ne L(return)

        subs limit, limit, 8
        b.gt L(more16)

        ldr data1, [src1, limit]
        ldr data2, [src2, limit]
        b L(return)

L(more16):
        ldr data1, [src1], 8
        ldr data2, [src2], 8
        cmp data1, data2
        bne L(return)

        /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
           strings. */
        subs limit, limit, 16
        b.ls L(last_bytes)

        /* We overlap loads between 0-32 bytes at either side of SRC1 when we
           try to align, so limit it only to strings larger than 128 bytes. */
        cmp limit, 96
        b.ls L(loop16)

        /* Align src1 and adjust src2 with bytes not yet done. */
        and tmp1, src1, 15
        add limit, limit, tmp1
        sub src1, src1, tmp1
        sub src2, src2, tmp1

        /* Loop performing 16 bytes per iteration using aligned src1.
           Limit is pre-decremented by 16 and must be larger than zero.
           Exit if <= 16 bytes left to do or if the data is not equal. */
        .p2align 4
L(loop16):
        ldp data1, data1h, [src1], 16
        ldp data2, data2h, [src2], 16
        subs limit, limit, 16
        ccmp data1, data2, 0, hi
        ccmp data1h, data2h, 0, eq
        b.eq L(loop16)

        cmp data1, data2
        bne L(return)
        mov data1, data1h
        mov data2, data2h
        cmp data1, data2
        bne L(return)

        /* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
        add src1, src1, limit
        add src2, src2, limit
        ldp data1, data1h, [src1]
        ldp data2, data2h, [src2]
        cmp data1, data2
        bne L(return)
        mov data1, data1h
        mov data2, data2h
        cmp data1, data2

        /* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
        rev data1, data1
        rev data2, data2
#endif
        cmp data1, data2
L(ret_eq):
        cset result, ne
        cneg result, result, lo
        ret

        .p2align 4
        /* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less8):
        adds limit, limit, 4
        b.lo L(less4)
        ldr data1w, [src1], 4
        ldr data2w, [src2], 4
        cmp data1w, data2w
        b.ne L(return)
        sub limit, limit, 4
L(less4):
        adds limit, limit, 4
        beq L(ret_eq)
L(byte_loop):
        ldrb data1w, [src1], 1
        ldrb data2w, [src2], 1
        subs limit, limit, 1
        ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
        b.eq L(byte_loop)
        sub result, data1w, data2w
        ret

END (memcmp)

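A rough C rendering of the final compare step (the L(return) sequence above): on a little-endian target both 8-byte chunks are byte-reversed so the first differing byte ends up most significant, and a single unsigned compare then yields -1, 0 or 1. The helper name chunk_compare and the GCC/Clang __builtin_bswap64 builtin are illustrative assumptions, not part of the commit.

    #include <stdint.h>

    /* Sketch of memcmp's L(return) path for one mismatching 8-byte chunk. */
    static int chunk_compare(uint64_t data1, uint64_t data2)
    {
        data1 = __builtin_bswap64(data1);   /* the asm's "rev data1, data1" */
        data2 = __builtin_bswap64(data2);   /* the asm's "rev data2, data2" */
        if (data1 == data2)
            return 0;                       /* cset result, ne  -> 0 when equal */
        return data1 < data2 ? -1 : 1;      /* cneg result, result, lo */
    }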
@@ -13,32 +13,32 @@ (memcpy/memmove)

#include "asmdefs.h"

#define dstin  x0
#define src    x1
#define count  x2
#define dst    x3
#define srcend x4
#define dstend x5
#define A_l    x6
#define A_lw   w6
#define A_h    x7
#define B_l    x8
#define B_lw   w8
#define B_h    x9
#define C_l    x10
#define C_lw   w10
#define C_h    x11
#define D_l    x12
#define D_h    x13
#define E_l    x14
#define E_h    x15
#define F_l    x16
#define F_h    x17
#define G_l    count
#define G_h    dst
#define H_l    src
#define H_h    srcend
#define tmp1   x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
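The comment above mentions branchless sequences; a short C sketch of the branchless 0..3-byte tail copy used by L(copy4) in the next hunk. Writing the first byte, the middle byte (at count / 2) and the last byte covers every length from 1 to 3 without a per-length branch. copy_small is a hypothetical name used only for illustration.

    #include <stddef.h>
    #include <stdint.h>

    static void copy_small(uint8_t *dst, const uint8_t *src, size_t count)
    {
        if (count == 0)                 /* cbz count, L(copy0) */
            return;
        size_t mid = count >> 1;        /* lsr tmp1, count, 1 */
        uint8_t a = src[0];             /* ldrb A_lw, [src] */
        uint8_t c = src[count - 1];     /* ldrb C_lw, [srcend, -1] */
        uint8_t b = src[mid];           /* ldrb B_lw, [src, tmp1] */
        dst[0] = a;                     /* strb A_lw, [dstin] */
        dst[mid] = b;                   /* strb B_lw, [dstin, tmp1] */
        dst[count - 1] = c;             /* strb C_lw, [dstend, -1] */
    }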
@@ -55,185 +55,185 @@ (memcpy/memmove)

ENTRY (memcpy)
ENTRY_ALIAS (memmove)
        add srcend, src, count
        add dstend, dstin, count
        cmp count, 128
        b.hi L(copy_long)
        cmp count, 32
        b.hi L(copy32_128)

        /* Small copies: 0..32 bytes. */
        cmp count, 16
        b.lo L(copy16)
        ldp A_l, A_h, [src]
        ldp D_l, D_h, [srcend, -16]
        stp A_l, A_h, [dstin]
        stp D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes. */
L(copy16):
        tbz count, 3, L(copy8)
        ldr A_l, [src]
        ldr A_h, [srcend, -8]
        str A_l, [dstin]
        str A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
L(copy8):
        tbz count, 2, L(copy4)
        ldr A_lw, [src]
        ldr B_lw, [srcend, -4]
        str A_lw, [dstin]
        str B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz count, L(copy0)
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb C_lw, [srcend, -1]
        ldrb B_lw, [src, tmp1]
        strb A_lw, [dstin]
        strb B_lw, [dstin, tmp1]
        strb C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp A_l, A_h, [src]
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [srcend, -32]
        ldp D_l, D_h, [srcend, -16]
        cmp count, 64
        b.hi L(copy128)
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
L(copy128):
        ldp E_l, E_h, [src, 32]
        ldp F_l, F_h, [src, 48]
        cmp count, 96
        b.ls L(copy96)
        ldp G_l, G_h, [srcend, -64]
        ldp H_l, H_h, [srcend, -48]
        stp G_l, G_h, [dstend, -64]
        stp H_l, H_h, [dstend, -48]
L(copy96):
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp E_l, E_h, [dstin, 32]
        stp F_l, F_h, [dstin, 48]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
        sub tmp1, dstin, src
        cbz tmp1, L(copy0)
        cmp tmp1, count
        b.lo L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp D_l, D_h, [src]
        and tmp1, dstin, 15
        bic dst, dstin, 15
        sub src, src, tmp1
        add count, count, tmp1 /* Count is now 16 too large. */
        ldp A_l, A_h, [src, 16]
        stp D_l, D_h, [dstin]
        ldp B_l, B_h, [src, 32]
        ldp C_l, C_h, [src, 48]
        ldp D_l, D_h, [src, 64]!
        subs count, count, 128 + 16 /* Test and readjust count. */
        b.ls L(copy64_from_end)

L(loop64):
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [src, 16]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [src, 32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [src, 48]
        stp D_l, D_h, [dst, 64]!
        ldp D_l, D_h, [src, 64]!
        subs count, count, 64
        b.hi L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp E_l, E_h, [srcend, -64]
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [srcend, -48]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [srcend, -32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [srcend, -16]
        stp D_l, D_h, [dst, 64]
        stp E_l, E_h, [dstend, -64]
        stp A_l, A_h, [dstend, -48]
        stp B_l, B_h, [dstend, -32]
        stp C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
        ldp D_l, D_h, [srcend, -16]
        and tmp1, dstend, 15
        sub srcend, srcend, tmp1
        sub count, count, tmp1
        ldp A_l, A_h, [srcend, -16]
        stp D_l, D_h, [dstend, -16]
        ldp B_l, B_h, [srcend, -32]
        ldp C_l, C_h, [srcend, -48]
        ldp D_l, D_h, [srcend, -64]!
        sub dstend, dstend, tmp1
        subs count, count, 128
        b.ls L(copy64_from_start)

L(loop64_backwards):
        stp A_l, A_h, [dstend, -16]
        ldp A_l, A_h, [srcend, -16]
        stp B_l, B_h, [dstend, -32]
        ldp B_l, B_h, [srcend, -32]
        stp C_l, C_h, [dstend, -48]
        ldp C_l, C_h, [srcend, -48]
        stp D_l, D_h, [dstend, -64]!
        ldp D_l, D_h, [srcend, -64]!
        subs count, count, 64
        b.hi L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp G_l, G_h, [src, 48]
        stp A_l, A_h, [dstend, -16]
        ldp A_l, A_h, [src, 32]
        stp B_l, B_h, [dstend, -32]
        ldp B_l, B_h, [src, 16]
        stp C_l, C_h, [dstend, -48]
        ldp C_l, C_h, [src]
        stp D_l, D_h, [dstend, -64]
        stp G_l, G_h, [dstin, 48]
        stp A_l, A_h, [dstin, 32]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstin]
        ret

END (memcpy)

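A C sketch of the overlap test at L(copy_long): computing dst - src as an unsigned value lets a single compare decide whether a forward copy would overwrite source bytes that have not been read yet. If dst lies before src the subtraction wraps to a huge unsigned number and the forward path is taken. must_copy_backwards is a hypothetical helper name used only to illustrate the three instructions quoted in the comments.

    #include <stddef.h>
    #include <stdint.h>

    static int must_copy_backwards(const void *dst, const void *src, size_t count)
    {
        uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;  /* sub tmp1, dstin, src */
        return diff < count;                               /* cmp tmp1, count; b.lo L(copy_long_backwards) */
    }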
@@ -13,115 +13,158 @@ (memset)
 
 #include "asmdefs.h"
 
-#define SAVE_FPU_REGISTERS
-#define SKIP_ZVA_CHECK
-
-#ifdef SAVE_FPU_REGISTERS
-#define SAVE_Q0 str q0, [sp, #-16]!
-#define RESTORE_Q0 ldr q0, [sp], #16
-#else
-#define SAVE_Q0
-#define RESTORE_Q0
-#endif
-
-#define RETURN RESTORE_Q0; ret; .p2align 4
+#define DC_ZVA_THRESHOLD 512
 
 #define dstin   x0
 #define val     x1
 #define valw    w1
 #define count   x2
 #define dst     x3
 #define dstend  x4
 #define zva_val x5
 
 ENTRY (memset)
-        SAVE_Q0
-
-        dup v0.16B, valw
+        bfi valw, valw, 8, 8
+        bfi valw, valw, 16, 16
+        bfi val, val, 32, 32
+
         add dstend, dstin, count
 
         cmp count, 96
         b.hi L(set_long)
         cmp count, 16
         b.hs L(set_medium)
-        mov val, v0.D[0]
 
         /* Set 0..15 bytes. */
         tbz count, 3, 1f
         str val, [dstin]
         str val, [dstend, -8]
-        RETURN
+        ret
 1:      tbz count, 2, 2f
         str valw, [dstin]
         str valw, [dstend, -4]
-        RETURN
+        ret
 2:      cbz count, 3f
         strb valw, [dstin]
         tbz count, 1, 3f
         strh valw, [dstend, -2]
-3:      RETURN
+3:      ret
 
-        /* Set 17..96 bytes. */
+        /* Set 16..96 bytes. */
+        .p2align 4
 L(set_medium):
-        str q0, [dstin]
+        stp val, val, [dstin]
         tbnz count, 6, L(set96)
-        str q0, [dstend, -16]
+        stp val, val, [dstend, -16]
         tbz count, 5, 1f
-        str q0, [dstin, 16]
-        str q0, [dstend, -32]
-1:      RETURN
+        stp val, val, [dstin, 16]
+        stp val, val, [dstend, -32]
+1:      ret
 
         .p2align 4
         /* Set 64..96 bytes. Write 64 bytes from the start and
            32 bytes from the end. */
 L(set96):
-        str q0, [dstin, 16]
-        stp q0, q0, [dstin, 32]
-        stp q0, q0, [dstend, -32]
-        RETURN
+        stp val, val, [dstin, 16]
+        stp val, val, [dstin, 32]
+        stp val, val, [dstin, 48]
+        stp val, val, [dstend, -32]
+        stp val, val, [dstend, -16]
+        ret
 
         .p2align 4
 L(set_long):
-        and valw, valw, 255
+        stp val, val, [dstin]
         bic dst, dstin, 15
-        str q0, [dstin]
-        cmp count, 160
-        ccmp valw, 0, 0, hs
-        b.ne L(no_zva)
-
-#ifndef SKIP_ZVA_CHECK
-        mrs zva_val, dczid_el0
-        and zva_val, zva_val, 31
-        cmp zva_val, 4 /* ZVA size is 64 bytes. */
-        b.ne L(no_zva)
+#if DC_ZVA_THRESHOLD
+        cmp count, DC_ZVA_THRESHOLD
+        ccmp val, 0, 0, cs
+        b.eq L(zva_64)
 #endif
-        str q0, [dst, 16]
-        stp q0, q0, [dst, 32]
-        bic dst, dst, 63
-        sub count, dstend, dst /* Count is now 64 too large. */
-        sub count, count, 128 /* Adjust count and bias for loop. */
+        /* Small-size or non-zero memset does not use DC ZVA. */
+        sub count, dstend, dst
+
+        /*
+         * Adjust count and bias for loop. By subtracting extra 1 from count,
+         * it is easy to use tbz instruction to check whether loop tailing
+         * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
+         */
+        sub count, count, 64+16+1
+
+#if DC_ZVA_THRESHOLD
+        /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
+        nop
+#endif
+
+1:      stp val, val, [dst, 16]
+        stp val, val, [dst, 32]
+        stp val, val, [dst, 48]
+        stp val, val, [dst, 64]!
+        subs count, count, 64
+        b.hs 1b
+
+        tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
+        stp val, val, [dst, 16]
+        stp val, val, [dst, 32]
+1:      stp val, val, [dstend, -32]
+        stp val, val, [dstend, -16]
+        ret
 
+#if DC_ZVA_THRESHOLD
         .p2align 4
-L(zva_loop):
-        add dst, dst, 64
-        dc zva, dst
-        subs count, count, 64
-        b.hi L(zva_loop)
-        stp q0, q0, [dstend, -64]
-        stp q0, q0, [dstend, -32]
-        RETURN
-
-L(no_zva):
-        sub count, dstend, dst /* Count is 16 too large. */
-        sub dst, dst, 16 /* Dst is biased by -32. */
-        sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
-        stp q0, q0, [dst, 32]
-        stp q0, q0, [dst, 64]!
-        subs count, count, 64
-        b.hi L(no_zva_loop)
-        stp q0, q0, [dstend, -64]
-        stp q0, q0, [dstend, -32]
-        RETURN
+L(zva_64):
+        stp val, val, [dst, 16]
+        stp val, val, [dst, 32]
+        stp val, val, [dst, 48]
+        bic dst, dst, 63
+
+        /*
+         * Previous memory writes might cross cache line boundary, and cause
+         * cache line partially dirty. Zeroing this kind of cache line using
+         * DC ZVA will incur extra cost, for it requires loading untouched
+         * part of the line from memory before zeroing.
+         *
+         * So, write the first 64 byte aligned block using stp to force
+         * fully dirty cache line.
+         */
+        stp val, val, [dst, 64]
+        stp val, val, [dst, 80]
+        stp val, val, [dst, 96]
+        stp val, val, [dst, 112]
+
+        sub count, dstend, dst
+        /*
+         * Adjust count and bias for loop. By subtracting extra 1 from count,
+         * it is easy to use tbz instruction to check whether loop tailing
+         * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
+         */
+        sub count, count, 128+64+64+1
+        add dst, dst, 128
+        nop
+
+        /* DC ZVA sets 64 bytes each time. */
+1:      dc zva, dst
+        add dst, dst, 64
+        subs count, count, 64
+        b.hs 1b
+
+        /*
+         * Write the last 64 byte aligned block using stp to force fully
+         * dirty cache line.
+         */
+        stp val, val, [dst, 0]
+        stp val, val, [dst, 16]
+        stp val, val, [dst, 32]
+        stp val, val, [dst, 48]
+
+        tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
+        stp val, val, [dst, 64]
+        stp val, val, [dst, 80]
+1:      stp val, val, [dstend, -32]
+        stp val, val, [dstend, -16]
+        ret
+#endif
 
 END (memset)
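A C sketch of what the headline change of this commit amounts to: instead of broadcasting the fill byte into a NEON register with "dup v0.16B, valw" (which forced the old SAVE_Q0/RESTORE_Q0 spill of q0 around the routine), the new code replicates the byte across a 64-bit general-purpose register with three bfi instructions, so plain stp stores can be used and SIMD state is never touched. replicate_byte is a hypothetical name used only for illustration.

    #include <stdint.h>

    static uint64_t replicate_byte(uint8_t c)
    {
        uint64_t v = c;
        v |= v << 8;    /* bfi valw, valw, 8, 8   */
        v |= v << 16;   /* bfi valw, valw, 16, 16 */
        v |= v << 32;   /* bfi val, val, 32, 32   */
        return v;
    }

Zero fills of at least DC_ZVA_THRESHOLD (512) bytes still take the dc zva path, which clears a whole 64-byte cache line per instruction; the cmp/ccmp pair in L(set_long) only branches to L(zva_64) when the value is zero and the count reaches that threshold.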