/* Optimized version of the standard memset() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear
   remaining words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
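
/* A rough C-level sketch of those phases (an illustration only: the
   function and variable names below are made up, the real code
   software-pipelines the phases, and the 128B loop additionally plants
   an early store per cache line as a prefetch):

	#include <stddef.h>
	#include <stdint.h>

	void *
	memset_sketch (void *dest, int value, size_t count)
	{
	  unsigned char *p = dest;
	  uint64_t v8 = 0x0101010101010101ULL * (unsigned char) value;

	  while (count > 0 && ((uintptr_t) p & 15) != 0)
	    *p++ = value, count--;		   // head: reach 16B alignment
	  for (; count >= 128; count -= 128)	   // full 128B cache lines
	    for (int i = 0; i < 16; i++, p += 8)
	      *(uint64_t *) p = v8;
	  for (; count >= 32; count -= 32)	   // 32B chunks
	    for (int i = 0; i < 4; i++, p += 8)
	      *(uint64_t *) p = v8;
	  for (; count >= 8; count -= 8, p += 8)   // remaining words
	    *(uint64_t *) p = v8;
	  while (count-- > 0)			   // remaining bytes
	    *p++ = value;
	  return dest;
	}
*/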

#include <sysdep.h>
#undef ret

#define dest		in0
#define value		in1
#define	cnt		in2

#define tmp		r31
#define save_lc		r30
#define ptr0		r29
#define ptr1		r28
#define ptr2		r27
#define ptr3		r26
#define ptr9		r24
#define	loopcnt		r23
#define linecnt		r22
#define bytecnt		r21

#define fvalue		f6

/* This routine uses only scratch predicate registers (p6 - p15) */
#define p_scr		p6	/* default register for same-cycle branches */
#define p_nz		p7
#define p_zr		p8
#define p_unalgn	p9
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

#define movi0		mov

#define MIN1		15
#define MIN1P1HALF	8
#define LINE_SIZE	128
#define LSIZE_SH        7			/* shift amount */
#define PREF_AHEAD	8

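/* Select the store used by the main loops: with USE_FLP (the default
   here) the 8-byte stores go through the FP side as stf8 of fvalue;
   defining USE_INT instead would issue integer st8 of value.  */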
#define USE_FLP
#if defined(USE_INT)
#define store		st8
#define myval           value
#elif defined(USE_FLP)
#define store		stf8
#define myval		fvalue
#endif

.align	64
ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]
	.save   ar.lc, save_lc
	movi0	save_lc = ar.lc
} { .mmi
	.body
	mov	ret0 = dest		/* return value */
	cmp.ne	p_nz, p_zr = value, r0	/* use stf.spill if value is zero */
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest	/* aligned address */
	and	tmp = MIN1, dest	/* prepare to check for alignment */
	tbit.nz p_y, p_n = dest, 0	/* Do we have an odd address? (M_B_U) */
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst	/* create 8 identical bytes in word */
(p_scr)	br.ret.dpnt.many rp		/* return immediately if count = 0 */
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0
} { .mib				/* NB: # of bytes to move is 1 higher */
	sub	bytecnt = (MIN1+1), tmp	/*     than loopcnt */
	cmp.gt	p_scr, p0 = 16, cnt		/* is it a minimalistic task? */
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	/* go move just a few (M_B_U) */
;; }
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		/* after alignment */
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		/* after alignment */
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	/* should we do a st8 ? */
;; }
{ .mib
(p_y)	add	cnt = -8, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	/* should we do a st4 ? */
} { .mib
(p_y)	st8	[ptr2] = value, -4
(p_n)	add	ptr2 = 4, ptr2
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	/* should we do a st2 ? */
} { .mib
(p_yy)	st4	[ptr2] = value, -2
(p_nn)	add	ptr2 = 2, ptr2
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		/* for compare */
(p_y)	add	cnt = -2, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	/* should we do a st1 ? */
} { .mmi
	setf.sig fvalue=value			/* transfer value to FLP side */
(p_y)	st2	[ptr2] = value, -1
(p_n)	add	ptr2 = 1, ptr2
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value
	cmp.gt	p_scr, p0 = tmp, cnt		/* is it a minimalistic task? */
} { .mbb
(p_yy)	add	cnt = -1, cnt
(p_scr)	br.cond.dpnt.many .fraction_of_line	/* go move just a few */
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			/* Jump to use stf.spill */
;; }

#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
	.align 32 /* --------  L1A: store ahead into cache lines; fill later */
#endif
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		/* compute end of range */
	mov	ptr9 = ptr1			/* used for prefetching */
	and	cnt = (LINE_SIZE-1), cnt	/* remainder */
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		/* default prefetch loop */
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	/* check against actual value */
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		/* start of stores */
	add	ptr2 = 8, ptr1			/* (beyond prefetch stores) */
	add	ptr1 = tmp, ptr1		/* first address beyond total */
;; }						/* range */
{ .mmi
	add	tmp = -1, linecnt		/* next loop count */
	movi0	ar.lc = loopcnt
;; }
.pref_l1a:
{ .mib
	store [ptr9] = myval, 128	/* Do stores one cache line apart */
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2		/* Two stores in parallel */
	movi0	ar.lc = tmp
;; }
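/* Each pass below fills one 128B line with fifteen 8-byte stores
   through ptr2/ptr0 (the first word of each line was already written
   by the look-ahead store), while the predicated store through ptr9
   keeps writing the first word of the line PREF_AHEAD lines ahead,
   pulling it into cache before it is filled.  */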
.l1ax:
 { .mmi
	store [ptr2] = myval, 8
	store [ptr0] = myval, 8
 ;; }
 { .mmi
	store [ptr2] = myval, 24
	store [ptr0] = myval, 24
 ;; }
 { .mmi
	store [ptr2] = myval, 8
	store [ptr0] = myval, 8
 ;; }
 { .mmi
	store [ptr2] = myval, 24
	store [ptr0] = myval, 24
 ;; }
 { .mmi
	store [ptr2] = myval, 8
	store [ptr0] = myval, 8
 ;; }
 { .mmi
	store [ptr2] = myval, 24
	store [ptr0] = myval, 24
 ;; }
 { .mmi
	store [ptr2] = myval, 8
	store [ptr0] = myval, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1	/* do we need more prefetching? */
 ;; }
{ .mmb
	store [ptr2] = myval, 24
(p_scr)	store [ptr9] = myval, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		/* just a few bytes left ? */
(p_scr) br.cond.dpnt.many  .fraction_of_line	/* Branch no. 2 */
	br.cond.dpnt.many  .move_bytes_from_alignment	/* Branch no. 3 */
;; }

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
	{ nop 0 }
#else
	.align 32
#endif
.l1b:	/* ------------------  L1B: store ahead into cache lines; fill later */
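/* Value is known to be zero on this path, so each stf.spill of f0
   clears 16B in one go (see the header note).  Seven spills through
   ptr2/ptr0 plus the look-ahead spill at the start of each line cover
   the full 128B.  */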
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		/* compute end of range */
	mov	ptr9 = ptr1			/* used for prefetching */
	and	cnt = (LINE_SIZE-1), cnt	/* remainder */
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		/* default prefetch loop */
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	/* check against actual value */
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1	/* start of stores (beyond prefetch stores) */
	add	ptr1 = tmp, ptr1	/* first address beyond total range */
;; }
{ .mmi
	add	tmp = -1, linecnt	/* next loop count */
	movi0	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128	/* Do stores one cache line apart */
	nop.i   0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2		/* Two stores in parallel */
	movi0	ar.lc = tmp
;; }
.l1bx:
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1	/* do we need more prefetching? */
 ;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt	/* just a few bytes left ? */
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment
;; }

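/* Fewer than LINE_SIZE+1 bytes remain: store 32B chunks in .l2, then
   up to three trailing 8-byte words, then the 4/2/1-byte tail.  */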
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5	/* loopcnt = cnt / 32 */
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt		/* compute the remaining cnt */
	movi0   ar.lc = loopcnt
;; }
#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
	.align 32
#endif
.l2:	/* ----------------------------  L2A:  store 32B in 2 cycles */
{ .mmb
	store	[ptr1] = myval, 8
	store	[ptr2] = myval, 8
;; } { .mmb
	store	[ptr1] = myval, 24
	store	[ptr2] = myval, 24
	br.cloop.dptk.many .l2
;; }
store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		/* just a few bytes left ? */
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	/* Branch */
;; }

{ .mmi
	store	[ptr1] = myval, 8		/* store one 8B word */
	cmp.le	p_y, p_n = 16, cnt		/* at least one more word? */
	add	cnt = -8, cnt			/* subtract 8 */
;; }
{ .mmi
(p_y)	store	[ptr1] = myval, 8		/* store a second word */
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt		/* a third word left? */
(p_y)	add	cnt = -8, cnt			/* subtract 8 */
;; }
{ .mmi
(p_yy)	store	[ptr1] = myval, 8		/* store a third word */
(p_yy)	add	cnt = -8, cnt			/* subtract 8 */
;; }

.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2	/* should we terminate with a st4 ? */
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value, 4
	tbit.nz.unc p_yy, p0 = cnt, 1	/* should we terminate with a st2 ? */
;; }
{ .mib
(p_yy)	st2	[ptr1] = value, 2
	tbit.nz.unc p_y, p0 = cnt, 0
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	movi0	ar.lc = save_lc
	br.ret.sptk.many rp
;; }

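/* Entered when the original count is below 16: store an odd leading
   byte if needed, then pairs of 2-byte stores through ptr1/ptr2, with
   ptr3 pointing at the last byte.  The .pred.rel "mutex" annotations
   tell the assembler that each predicate pair is complementary.  */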
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt
(p_y)	cmp.le  p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1	/* fill 1 (odd-aligned) byte */
(p_y)	add	cnt = -1, cnt		/* [15, 14 (or less) left] */
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt	/* prepare last store */
	movi0	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4	/* fill 2 (aligned) bytes */
(p_yy)	st2	[ptr2] = value, 4	/* fill 2 (aligned) bytes */
(p_yy)	add	cnt = -4, cnt		/* [11, 10 (or less) left] */
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3		/* last store */
	tbit.nz p_scr, p0 = cnt, 1	/* will there be a st2 at the end ? */
} { .mmi
(p_y)	st2	[ptr1] = value, 4	/* fill 2 (aligned) bytes */
(p_y)	st2	[ptr2] = value, 4	/* fill 2 (aligned) bytes */
(p_y)	add	cnt = -4, cnt		/* [7, 6 (or less) left] */
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4	/* fill 2 (aligned) bytes */
(p_yy)	st2	[ptr2] = value, 4	/* fill 2 (aligned) bytes */
					/* [3, 2 (or less) left] */
	tbit.nz p_y, p0 = cnt, 0	/* will there be a st1 at the end ? */
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value		/* fill 2 (aligned) bytes */
(p_y)	st1	[ptr3] = value		/* fill last byte (using ptr3) */
	br.ret.sptk.many rp
;; }
END(memset)
libc_hidden_def (memset)