/* SPDX-License-Identifier: MIT */
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2021, Arm Limited.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include <asm/macro.h>
#include "asmdefs.h"

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5

ENTRY (memset)
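	/*
	 * PTR_ARG/SIZE_ARG are defined in asmdefs.h; on ILP32 builds they
	 * zero-extend the pointer and size arguments, otherwise they
	 * expand to nothing.
	 */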
	PTR_ARG (0)
	SIZE_ARG (2)

	/*
	 * The optimized memset uses the dc opcode, which causes problems
	 * when the cache is disabled. Let's check if the cache is disabled
	 * and use a very simple memset implementation in this case. Otherwise
	 * jump to the optimized version.
	 */
	switch_el x6, 3f, 2f, 1f
3:	mrs	x6, sctlr_el3
	b	0f
2:	mrs	x6, sctlr_el2
	b	0f
1:	mrs	x6, sctlr_el1
0:
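	/* CR_C is the SCTLR_ELx C bit (data cache enable); only take the
	   optimized path when the D-cache is on. */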
	tst	x6, #CR_C
	bne	9f

	/*
	 * A very "simple" memset implementation without the use of the
	 * dc opcode. Can be run with caches disabled.
	 */
	mov	x3, #0x0
	cmp	count, x3	/* check for zero length */
	beq	8f
4:	strb	valw, [dstin, x3]
	add	x3, x3, #0x1
	cmp	count, x3
	bne	4b
8:	ret
9:

	/* Here the optimized memset version starts */
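	/* Broadcast the fill byte into all 16 lanes of v0 and compute the
	   one-past-the-end address, then dispatch on length. */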
	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
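	/* Test individual bits of count and use overlapping stores from
	   both ends so no loop is needed. */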
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	.p2align 4
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
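	/* 16-byte stores from dstin and dstend overlap in the middle, so
	   every length in this range is covered without a loop. */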
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
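	/* More than 96 bytes.  Use DC ZVA only for zero fills of at least
	   160 bytes; the ccmp forces the branch to L(no_zva) when
	   count < 160 or the fill value is non-zero. */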
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 160
	ccmp	valw, 0, 0, hs
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
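	/* DCZID_EL0[3:0] holds log2 of the ZVA block size in words and
	   bit 4 (DZP) is set when DC ZVA is prohibited, so only a
	   permitted 64-byte block size passes this check. */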
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

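	/* Zero 64 bytes per iteration with DC ZVA.  dst is 64-byte
	   aligned; the unaligned head and the final 64 bytes are written
	   with explicit stores. */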
	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

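	/* Fallback without DC ZVA: write 64 bytes per iteration with two
	   pairs of 16-byte stores, then finish with overlapping stores at
	   the end. */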
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (memset)