/*
 * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
 */

#if !defined _STRING_H
#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
#endif

#ifndef _LIBC_STRING_i386_H
#define _LIBC_STRING_i386_H 1

static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	int ecx, edi;

	if (count == 0)
		return s;

	/* Very small sizes (2 stores or fewer) are best done with direct
	 * mov <const>,<mem> instructions (they do not clobber registers) */
	if (count == 1) {
		*(char *)((char *)s + 0) = eax;
		return s;
	}

	/* You wonder why & 0xff is needed? Try memset(p, '\xff', size).
	 * If char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */
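	/* e.g. eax == 0x5a becomes 0x5a5a5a5a: the byte is replicated into
	 * all four lanes so the stosl/stosw paths below can store it wordwise */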

	if (count == 2) {
		*(short *)((char *)s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)((char *)s + 0) = eax;
		*(char *) ((char *)s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)((char *)s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) ((char *)s + 0) = eax;
		*(char *)((char *)s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *)  ((char *)s + 0) = eax;
		*(short *)((char *)s + 4) = eax;
		return s;
	}

	/* Small string stores: don't clobber ecx
	 * (clobbers only eax and edi) */
#define small_store(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&D" (edi) \
		: "a" (eax), "0" (s) \
		: "memory" \
	); \
	return s; \
}
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		((int *)s)[0] = eax;
		((int *)s)[1] = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_store

	/* Not small, but multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; stosl\n"
		: "=&c" (ecx), "=&D" (edi)
		: "a" (eax), "0" (count / 4), "1" (s)
		: "memory"
	);
	return s;
}
#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
	)
#endif
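/* Illustrative only -- a hypothetical caller, not part of this header.
 * With both c and count compile-time constants (and count either small
 * or a multiple of 4), the macro above resolves to the inlined helper;
 * anything else falls through to the real library memset():
 *
 *	char buf[32];
 *	memset(buf, 0, 12);	// constant args -> inlined_memset_const_c_count4()
 *	memset(buf, 0, len);	// variable len  -> library memset()
 */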


static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	int ecx;
	char *esi, *edi;

	if (count == 0)
		return d;

	if (count == 1) {
		*(char *)d = *(char *)s;
		return (char *)d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return (char *)d + 2;
	}
	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
#define small_move(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&S" (esi), "=&D" (edi) \
		: "0" (s), "1" (d) \
		: "memory" \
	); \
	return edi; \
}
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return (char *)d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_move

	/* Not small, but multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; movsl\n"
		: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
		: "0" (count / 4), "1" (s), "2" (d)
		: "memory"
	);
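	/* the movs insns above advanced %edi by exactly count bytes,
	 * so edi == (char *)d + count -- precisely mempcpy's return value */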
	return edi;
}
static __always_inline
void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
{
	inlined_mempcpy_const_count4(d, s, count);
	return d;
}
#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
	)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
	)
#endif
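/* Illustrative only -- a hypothetical caller, not part of this header.
 * Here only the count has to be a compile-time constant; a constant count
 * that is both large and not a multiple of 4 still uses the library call:
 *
 *	int v[3];
 *	memcpy(v, src, sizeof(v));	// 12 bytes, constant -> inlined copy
 *	memcpy(v, src, n);		// variable n        -> library memcpy()
 */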


static __always_inline
size_t inlined_strlen(const char *s)
{
	int edi;
	int ecx;
	__asm__ __volatile__(
		"	repne; scasb\n"
	/*	"	notl	%0\n" */
	/*	"	decl	%0\n" */
		: "=c" (ecx), "=&D" (edi)
		: "1" (s), "a" (0), "0" (0xffffffffu)
		/* : no clobbers */
	);
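	/* repne scasb decrements %ecx once for every byte it examines,
	 * including the terminating NUL; starting from 0xffffffff this
	 * leaves ecx == -(length + 2), which the return below (or the
	 * commented-out notl/decl pair) converts back to the length */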
	return -ecx - 2;
}
#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
#define strlen(s) inlined_strlen(s)
#endif


static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
		"1:	lodsb\n"
		"	stosb\n"
		"	testb	%%al, %%al\n"
		"	jnz	1b\n"
		: "=&S" (esi), "=&D" (edi), "=&a" (eax)
		: "0" (src), "1" (dest)
		: "memory"
	);
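	/* the final stosb stored the NUL and left %edi one past it;
	 * stpcpy returns a pointer to that NUL, hence the -1 */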
	return edi - 1;
}
static __always_inline
char *inlined_strcpy(char *dest, const char *src)
{
	inlined_stpcpy(dest, src);
	return dest;
}
#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif


static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	void *edi;
	int ecx;
	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
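	/* jecxz skips the scan entirely when count == 0; after a match,
	 * repne scasb leaves %edi one past the matching byte, hence the
	 * leal -1; on no match (ZF clear) %edi is zeroed to return NULL */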
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx)
		: "a" (c), "0" (s), "1" (count)
		/* : no clobbers */
	);
	return edi;
}
static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
#if defined __OPTIMIZE__
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	movb	%4, %%al\n" /* const c to %%al */
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "i" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#else
	/* With -O0, gcc can't figure out how to encode CONST c
	 * as an immediate operand. Generating slightly bigger code
	 * (usually "movl CONST,%eax", 3 bytes bigger than needed):
	 */
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "2" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#endif
}
#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
	)
#endif
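/* Note on the (c) & 0xff above: with a negative char constant such as
 * memchr(p, '\xfe', n) on a signed-char target, c would otherwise widen
 * to a negative int; masking keeps the 8-bit value that scasb actually
 * compares. The non-constant case goes to inlined_memchr(), where c is
 * simply passed in %eax. */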

#endif /* _LIBC_STRING_i386_H  */