/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
 *
 * "memset" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 *
 * Copyright (c) 2009  STMicroelectronics Ltd
 *   Optimised using 64bit data transfer (via FPU) and the movca.l inst.
 *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
 */

/*
 *            void *memset(void *s, int c, size_t n);
 */

#include <sysdep.h>

#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__)
#define MEMSET_USES_FPU
/* Use paired single-precision load/store mode for 64-bit transfers.
 * FPSCR.PR=0 together with FPSCR.SZ=1 is well defined on both SH4-200
 * and SH4-300.
 * Currently this has only been implemented and tested for little-endian
 * mode.  */

/* FPU_SET_PAIRED_PREC
 *   Saves the current FPSCR in r3, then loads FPSCR with 0x00100000
 *   (#0x10 shifted left by 16), i.e. PR=0 SZ=1 with every other FPSCR
 *   bit zero.
 *   Clobbers r1 and r3.  r3 must not be modified until RESTORE_FPSCR
 *   has been executed.  */
.macro FPU_SET_PAIRED_PREC
	sts	fpscr, r3
	mov	#0x10, r1	! PR=0 SZ=1
	shll16  r1
	lds	r1, fpscr
.endm

/* RESTORE_FPSCR
 *   Reloads FPSCR from r3, where FPU_SET_PAIRED_PREC saved it.  */
.macro RESTORE_FPSCR
	lds	r3, fpscr
.endm
#endif

/*
 * void *memset(void *s, int c, size_t n)
 *
 * In:    r4 = s, r5 = c, r6 = n
 * Out:   r0 = s
 * Uses:  r0-r3 as scratch; r4, r5 and r6 are modified; T bit.
 *        With MEMSET_USES_FPU also fpul, fr0 and fr1; FPSCR is changed
 *        inside the block loop and restored from r3 afterwards.
 *
 * The buffer is filled from the end towards the start: r4 is first moved
 * to s + n and every store walks it downwards, so when the last byte has
 * been written r4 is back at s and is returned as the result.
 *
 * Phases:
 *   n < 12            byte loop only (label 40).
 *   otherwise         byte stores until r4 is 4-byte aligned (label 1),
 *                     then the fill byte is replicated into all four
 *                     bytes of r5 (label 2).
 *   remaining < 64    8-byte loop (labels 22/3), then byte tail.
 *   remaining >= 64   longword stores until r4 is 32-byte aligned
 *                     (label 10), 32-byte blocks using movca.l for the
 *                     first longword of each block (label 12), then the
 *                     8-byte loop and/or byte tail for what is left.
 */
ENTRY(memset)
	mov	#12,r0
	add	r6,r4		! r4 = s + n, one past the last byte to fill
	cmp/gt	r6,r0		! T = (12 > n), signed compare
	bt/s	40f		! if it's too small, set a byte at once
	 mov	r4,r0		! (delay slot) r0 = end pointer
	and	#3,r0		! r0 = end pointer modulo 4
	cmp/eq	#0,r0
	bt/s	2f		! It's aligned
	 sub	r0,r6		! (delay slot) n -= bytes stored by loop 1 (0 if aligned)
1:				! store r0 bytes so that r4 becomes 4-byte aligned
	dt	r0
	bf/s	1b
	 mov.b	r5,@-r4		! (delay slot) executed on every pass, the last included
2:				! make VVVV
	extu.b	r5,r5
	swap.b	r5,r0		!   V0
	or	r0,r5		!   VV
	swap.w	r5,r0		! VV00
	or	r0,r5		! VVVV

	! Check if enough bytes need to be set to be worth the big loop
	mov	#0x40, r0	! (MT)
	cmp/gt	r6,r0		! (MT)  64 > len => slow loop

	bt/s	22f
	 mov	r6,r0		! (delay slot) r0 = len, input of the 8-byte loop

	! align the dst to the cache block size if necessary
	mov	r4, r3
	mov	#~(0x1f), r1

	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
	cmp/eq	r3, r1

	bt/s	11f		! dst is already aligned
	 sub	r1, r3		! r3-r1 -> r3 (delay slot; byte distance to the boundary)
	shlr2	r3		! number of loops

10:	mov.l	r5,@-r4
	dt	r3
	bf/s	10b
	 add	#-4, r6		! (delay slot) 4 fewer bytes left per longword stored

11:	! dst is 32byte aligned
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r2		! number of loops (r6 >> 5)

	add	#-32, r4	! r4 = base address of the first 32-byte block
	mov	r5, r0		! movca.l below stores from r0

#ifdef MEMSET_USES_FPU
	lds	r5, fpul	! (CO)
	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
	fsts	fpul, fr1

	FPU_SET_PAIRED_PREC	! clobbers r1; old FPSCR is kept in r3
12:				! each pass fills the 32 bytes at r4 .. r4+31
	movca.l	r0, @r4		! offset 0
	mov.l	r5, @(4, r4)	! offset 4
	add	#32, r4
	fmov	dr0, @-r4	! offset 24, 8 bytes
	fmov	dr0, @-r4	! offset 16, 8 bytes
	add	#-0x20, r6	! 32 fewer bytes left
	fmov	dr0, @-r4	! offset 8, 8 bytes
	dt	r2
	bf/s	12b
	 add	#-40, r4	! (delay slot) from base+8 down to the next lower block

	RESTORE_FPSCR
#else
12:				! each pass fills the 32 bytes at r4 .. r4+31
	movca.l	r0,@r4
	mov.l	r5,@(4, r4)
	mov.l	r5,@(8, r4)
	mov.l	r5,@(12,r4)
	mov.l	r5,@(16,r4)
	mov.l	r5,@(20,r4)
	add	#-0x20, r6	! 32 fewer bytes left
	mov.l	r5,@(24,r4)
	dt	r2
	mov.l	r5,@(28,r4)
	bf/s	12b
	 add	#-32, r4	! (delay slot) step down to the next lower block

#endif
	add	#32, r4		! undo the final step down: r4 = lowest address filled so far
	mov	#8, r0
	cmp/ge	r0, r6		! T = (r6 >= 8)
	bf	40f		! fewer than 8 bytes left, finish bytewise

	mov	r6,r0
22:
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3
3:
	dt	r0		! T set here is still valid at the bf/s below
	mov.l	r5,@-r4		! set 8-byte at once
	bf/s	3b
	 mov.l	r5,@-r4		! (delay slot) second half of the 8 bytes
	!
	mov	#7,r0
	and	r0,r6		! r6 = bytes left over after the 8-byte loop

	! fill bytes (length may be zero)
40:	tst	r6,r6
	bt	5f
4:
	dt	r6
	bf/s	4b
	 mov.b	r5,@-r4		! (delay slot) executed on every pass, the last included
5:
	rts
	 mov	r4,r0		! (delay slot) return value: r4 is back at s
END(memset)
libc_hidden_def (memset)