1 
2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
3  *
4  *  This library is free software; you can redistribute it and/or
5  *  modify it under the terms of the GNU Library General Public
6  *  License as published by the Free Software Foundation; either
7  *  version 2 of the License, or (at your option) any later version.
8  *
9  *  This library is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  *  Library General Public License for more details.
13  *
14  *  You should have received a copy of the GNU Library General Public
15  *  License along with this library; if not, see
16  *  <http://www.gnu.org/licenses/>.
17  */
18 
19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
20  *
21  *  Besides uClibc, I'm using this code in my libc for elks, which is
22  *  a 16-bit environment with a fairly limited compiler.  It would make
23  *  things much easier for me if this file isn't modified unnecessarily.
24  *  In particular, please put any new or replacement functions somewhere
25  *  else, and modify the makefile to use your version instead.
26  *  Thanks.  Manuel
27  *
28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
29 
30 
31 /* May 23, 2002     Initial Notes:
32  *
33  * I'm still tweaking this stuff, but it passes the tests I've thrown
34  * at it, and Erik needs it for the gcc port.  The glibc extension
35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36  * in the glibc source.  I also need to fix the behavior of
37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
38  *
39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40  * file on my platform (x86) show about 5-10% faster conversion speed than
41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42  * individual mbrtowc()/wcrtomb() calls.
43  *
44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
46  * needs to deal gracefully with whatever is sent to it.  In that mode,
47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
48  * an arg to force that behavior, so the interface will be changing.
49  *
50  * I need to fix the error checking for 16-bit wide chars.  This isn't
51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
53  *
54  * July 1, 2002
55  *
56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
58  *    locales.
59  * Enabled building of a C/POSIX-locale-only version, so full locale support
60  *    no longer needs to be enabled.
61  *
62  * Nov 4, 2002
63  *
64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66  *   order to support %ls in printf.  See comments below for details.
67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
69  *   and consistency with the standard's requirement that a printf format string be
70  *   a valid multibyte string beginning and ending in its initial shift state.
71  *
72  * Nov 5, 2002
73  *
74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
75  *
76  * Nov 7, 2002
77  *
78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79  *   Added some size/speed optimizations and integrated it into my locale
80  *   framework.  Minimally tested at the moment, but the stub C-locale
81  *   version (which most people would probably be using) should be fine.
82  *
83  * Nov 21, 2002
84  *
85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
86  * Add a couple of ugly hacks to support *wprintf.
87  * Add a mini iconv() and iconv implementation (requires locale support).
88  *
89  * Aug 1, 2003
90  * Bug fix for mbrtowc.
91  *
92  * Aug 18, 2003
93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
94  *
95  * Feb 11, 2004
96  * Bug fix: Fix size check for remaining output space in iconv().
97  *
98  * Manuel
99  */
100 #ifdef _LIBC
101 #include <errno.h>
102 #include <stddef.h>
103 #include <limits.h>
104 #include <stdint.h>
105 #include <inttypes.h>
106 #include <stdlib.h>
107 #include <stdio.h>
108 #include <assert.h>
109 #include <locale.h>
110 #include <wchar.h>
111 #include <bits/uClibc_uwchar.h>
112 
113 /**********************************************************************/
/* Locale-dependent configuration.
 *
 * With locale support, ENCODING reads the encoding field of the current
 * locale and the Cc2wc_ / Cwc2c_ names are short aliases for the
 * corresponding __LOCALE_DATA_ constants.
 * Without locale support, ENCODING is the constant __ctype_encoding_7_bit,
 * the 8-bit and UTF-8 feature macros must not be defined, and the L__wchar_
 * build selectors for the two UTF-8 helpers are undefined so that those
 * helpers are not compiled. */
114 #ifdef __UCLIBC_HAS_LOCALE__
115 #ifdef __UCLIBC_MJN3_ONLY__
116 #ifdef L_iswspace
117 /* generates one warning */
118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
119 #endif
120 #endif /* __UCLIBC_MJN3_ONLY__ */
121 
/* Encoding identifier of the locale currently in effect. */
122 #define ENCODING		(__UCLIBC_CURLOCALE->encoding)
123 
/* Short aliases for the locale-data table geometry constants. */
124 #define Cc2wc_IDX_SHIFT		__LOCALE_DATA_Cc2wc_IDX_SHIFT
125 #define Cc2wc_ROW_LEN		__LOCALE_DATA_Cc2wc_ROW_LEN
126 #define Cwc2c_DOMAIN_MAX	__LOCALE_DATA_Cwc2c_DOMAIN_MAX
127 #define Cwc2c_TI_SHIFT		__LOCALE_DATA_Cwc2c_TI_SHIFT
128 #define Cwc2c_TT_SHIFT		__LOCALE_DATA_Cwc2c_TT_SHIFT
129 #define Cwc2c_TI_LEN		__LOCALE_DATA_Cwc2c_TI_LEN
130 
131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
133 #endif
134 
135 #else  /* __UCLIBC_HAS_LOCALE__ */
136 
137 #ifdef __UCLIBC_MJN3_ONLY__
138 #ifdef L_btowc
139 /* emit only once */
140 #warning fix preprocessor logic testing locale settings
141 #endif
142 #endif
143 
/* No locale support: the encoding is fixed to 7-bit. */
144 #define ENCODING (__ctype_encoding_7_bit)
/* The 8-bit and UTF-8 feature macros are rejected in this configuration. */
145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
147 #endif
148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
150 #endif
/* Drop the build selectors so the UTF-8 helper bodies are skipped. */
151 #undef L__wchar_utf8sntowcs
152 #undef L__wchar_wcsntoutf8s
153 
154 #endif /* __UCLIBC_HAS_LOCALE__ */
155 /**********************************************************************/
156 
/* Longest UTF-8 sequence handled: 6 bytes when wchar_t is wider than
 * 16 bits, otherwise 3 bytes. */
157 #if WCHAR_MAX > 0xffffUL
158 #define UTF_8_MAX_LEN 6
159 #else
160 #define UTF_8_MAX_LEN 3
161 #endif
162 
/* When defined, the UTF-8 converters additionally reject the surrogate
 * range 0xd800-0xdfff and the values 0xfffe / 0xffff. */
163 #define KUHN 1
164 
165 /* Implementation-specific work functions. */
166 
/* UTF-8 -> wide string: reads at most n bytes from *src, produces at most
 * wn wide characters into pwc, keeps partial-sequence state in ps. */
167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
168 					const char **__restrict src, size_t n,
169 					mbstate_t *ps, int allow_continuation) attribute_hidden;
170 
/* Wide string -> UTF-8: reads at most wn wide characters from *src and
 * produces at most n bytes into s. */
171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
172 					const wchar_t **__restrict src, size_t wn) attribute_hidden;
173 #endif
174 /**********************************************************************/
175 #ifdef L_btowc
176 
177 
/* Convert a single byte to the corresponding wide character.
 *
 * c: byte value, or EOF.
 * Returns the wide character, or WEOF when c is EOF or does not form a
 * valid one-byte character in the current encoding. */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	mbstate_t state;
	unsigned char byte;
	wchar_t wide;

	if (c == EOF) {
		return WEOF;
	}

	byte = (unsigned char) c;
	state.__mask = 0;		/* Begin in the initial conversion state. */

	/* Only the results 0 (nul) and 1 indicate success for a single byte;
	 * the error values (size_t)-1 and (size_t)-2 compare greater than 1. */
	if (mbrtowc(&wide, (char *) &byte, 1, &state) > 1) {
		return WEOF;
	}
	return wide;

#else  /* !__CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif

	/* Without 8-bit locales only C/POSIX and UTF-8 remain, and in both of
	 * them a lone byte is a character only when it lies in 0-0x7f. */
	if (((unsigned int) c) >= 0x80) {
		return WEOF;
	}
	return c;

#endif /* !__CTYPE_HAS_8_BIT_LOCALES */
}
libc_hidden_def(btowc)208 libc_hidden_def(btowc)
209 
210 #endif
211 /**********************************************************************/
212 #ifdef L_wctob
213 
214 /* Note: We completely ignore ps in all currently supported conversions. */
215 
216 
/* Convert a wide character to its one-byte representation.
 *
 * c: wide character to convert.
 * Returns the byte value, or EOF when c has no single-byte form in the
 * current encoding. */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char out[MB_LEN_MAX];

	/* Accept only conversions that yield exactly one byte. */
	if (wcrtomb((char *) out, c, NULL) != 1) {
		return EOF;
	}
	return out[0];

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* Without 8-bit locales only C/POSIX and UTF-8 remain, so just the
	 * range 0-0x7f maps to a single byte.  The lower-bound test matters
	 * only where wint_t is a signed type. */
	if ((c >= 0) && (c < 0x80)) {
		return c;
	}
	return EOF;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
241 
242 #endif
243 /**********************************************************************/
244 #ifdef L_mbsinit
245 
mbsinit(const mbstate_t * ps)246 int mbsinit(const mbstate_t *ps)
247 {
248 	return !ps || !ps->__mask;
249 }
libc_hidden_def(mbsinit)250 libc_hidden_def(mbsinit)
251 
252 #endif
253 /**********************************************************************/
254 #ifdef L_mbrlen
255 
256 
/* Length in bytes of the next multibyte character in s.
 *
 * Forwards to mbrtowc() with a NULL destination.  When ps is NULL a
 * function-private state object is used instead. */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t internal_state;	/* Zero-initialized static storage. */
	mbstate_t *state;

	state = ps;
	if (state == NULL) {
		state = &internal_state;
	}
	return mbrtowc(NULL, s, n, state);
}
libc_hidden_def(mbrlen)263 libc_hidden_def(mbrlen)
264 
265 #endif
266 /**********************************************************************/
267 #ifdef L_mbrtowc
268 
269 
/* Convert the next multibyte character of s to a wide character.
 *
 * pwc: where to store the result; may be NULL to discard it.
 * s:   input bytes; NULL is handled like a call on the empty string with
 *      pwc == NULL and n == 1.
 * n:   maximum number of bytes of s that may be examined.
 * ps:  conversion state; NULL selects a function-private static state.
 *
 * Returns 0 when n is 0 or the next character is the nul character.
 * In the UTF-8 encoding it returns the number of bytes consumed for any
 * other character, or the helper's (size_t)-1 / (size_t)-2 result.
 * In the remaining (single-byte) encodings it returns whatever
 * mbsnrtowcs() reports for a one-character conversion. */
size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
			   size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */
	wchar_t wcbuf[1];
	const char *p;
	size_t r;
	char empty_string[1];		/* Avoid static to be fPIC friendly. */

	if (!ps) {
		ps = &mbstate;
	}

	if (!s) {
		pwc = NULL;
		empty_string[0] = 0;	/* Init the empty string when necessary. */
		s = empty_string;
		n = 1;
	} else if (!n) {
		/* Test n before touching *s: with n == 0 the caller has offered no
		 * readable bytes.  The long-standing return value of 0 is kept
		 * (ISO C would have (size_t)-2 here). */
		return 0;
	} else if (*s == '\0') {
		if (pwc) {
			*pwc = L'\0';
		}
		/* According to the ISO C 89 standard this is the expected behaviour. */
		return 0;
	}

	p = s;

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	/* Need to do this here since mbsrtowcs doesn't allow incompletes. */
	if (ENCODING == __ctype_encoding_utf8) {
		if (!pwc) {
			pwc = wcbuf;
		}
		r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
		/* One converted character: report the bytes consumed.  Otherwise
		 * pass through 0 (nul), (size_t)-1 or (size_t)-2. */
		return (r == 1) ? (size_t) (p - s) : r;
	}
#endif

#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: This adds a trailing nul!
#endif /* __UCLIBC_MJN3_ONLY__ */

	/* Single-byte encodings: convert one character into the local buffer
	 * and copy it out only on success. */
	r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);

	if (((ssize_t) r) >= 0) {
		if (pwc) {
			*pwc = *wcbuf;
		}
	}
	return r;
}
libc_hidden_def(mbrtowc)329 libc_hidden_def(mbrtowc)
330 
331 #endif
332 /**********************************************************************/
333 #ifdef L_wcrtomb
334 
335 
336 /* Note: We completely ignore ps in all currently supported conversions. */
337 /* TODO: Check for valid state anyway? */
338 
/* Convert one wide character to its multibyte form.
 *
 * s:  output buffer; NULL means "convert the nul wide character into a
 *     private scratch buffer".
 * wc: wide character to convert (ignored when s is NULL).
 * ps: handed through to wcsnrtombs().
 *
 * Returns the wcsnrtombs() result, except that a result of 0 (the nul
 * character, which wcsnrtombs() does not count) is reported as 1. */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	char scratch[MB_LEN_MAX];
	wchar_t single[1];
	const wchar_t *next;
	size_t produced;

	if (s == NULL) {
		s = scratch;
		wc = 0;
	}

	single[0] = wc;
	next = single;

	produced = wcsnrtombs(s, &next, 1, MB_LEN_MAX, ps);
	if (produced == 0) {
		return 1;
	}
	return produced;
}
libc_hidden_def(wcrtomb)361 libc_hidden_def(wcrtomb)
362 
363 #endif
364 /**********************************************************************/
365 #ifdef L_mbsrtowcs
366 
367 
/* Convert a nul-terminated multibyte string to wide characters.
 *
 * Forwards to mbsnrtowcs() with no limit (SIZE_MAX) on the number of
 * input bytes.  When ps is NULL a function-private state object is used. */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t internal_state;	/* Zero-initialized static storage. */
	mbstate_t *state;

	state = ps;
	if (state == NULL) {
		state = &internal_state;
	}
	return mbsnrtowcs(dst, src, SIZE_MAX, len, state);
}
libc_hidden_def(mbsrtowcs)376 libc_hidden_def(mbsrtowcs)
377 
378 #endif
379 /**********************************************************************/
380 #ifdef L_wcsrtombs
381 
382 /* Note: We completely ignore ps in all currently supported conversions.
383 
384  * TODO: Check for valid state anyway? */
385 
386 
/* Convert a nul-terminated wide string to multibyte form.
 *
 * Forwards to wcsnrtombs() with no limit (SIZE_MAX) on the number of
 * wide characters read; ps is passed through unchanged. */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	size_t produced;

	produced = wcsnrtombs(dst, src, SIZE_MAX, len, ps);
	return produced;
}
libc_hidden_def(wcsrtombs)392 libc_hidden_def(wcsrtombs)
393 
394 #endif
395 /**********************************************************************/
396 #ifdef L__wchar_utf8sntowcs
397 
398 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
399  * UTF-8-test.txt stress test.
400  */
401 /*  #define DECODER */
402 
403 #ifdef DECODER
404 #ifndef KUHN
405 #define KUHN
406 #endif
407 #endif
408 
409 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
410 						  const char **__restrict src, size_t n,
411 						  mbstate_t *ps, int allow_continuation)
412 {
413 	register const char *s;
414 	__uwchar_t mask;
415 	__uwchar_t wc;
416 	wchar_t wcbuf[1];
417 	size_t count;
418 	int incr;
419 
420 	s = *src;
421 
422 	assert(s != NULL);
423 	assert(ps != NULL);
424 
425 	incr = 1;
426 	/* NOTE: The following is an AWFUL HACK!  In order to support %s in
427 	 * wprintf, we need to be able to compute the number of wchars needed
428 	 * for the mbs conversion, not to exceed the precision specified.
429 	 * But if dst is NULL, the return value is the length assuming a
430 	 * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
431 	 * as pwc in order to flag that we really want the length, subject
432 	 * to the restricted buffer size and no partial conversions.
433 	 * See mbsnrtowcs() as well. */
434 	if (!pwc || (pwc == ((wchar_t *)ps))) {
435 		if (!pwc) {
436 			wn = SIZE_MAX;
437 		}
438 		pwc = wcbuf;
439 		incr = 0;
440 	}
441 
442 	/* This is really here only to support the glibc extension function
443 	 * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
444 	 * check on the validity of the mbstate. */
445 	if (!(count = wn)) {
446 		return 0;
447 	}
448 
449 	if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
450 #ifdef DECODER
451 		wc = (__uwchar_t) ps->__wc;
452 		if (n) {
453 			goto CONTINUE;
454 		}
455 		goto DONE;
456 #else
457 		if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
458 			/* TODO: change error code here and below? */
459 			if (n) {
460 				goto CONTINUE;
461 			}
462 			goto DONE;
463 		}
464 		__set_errno(EILSEQ);
465 		return (size_t) -1;		/* We're in an error state. */
466 #endif
467 	}
468 
469 	do {
470 		if (!n) {
471 			goto DONE;
472 		}
473 		--n;
474 		if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
475 			mask = 0x40;
476 #ifdef __UCLIBC_MJN3_ONLY__
477 #warning TODO: Fix range for 16 bit wchar_t case.
478 #endif
479 			if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
480 			(((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
481 				goto START;
482 			}
483 		BAD:
484 #ifdef DECODER
485 			wc = 0xfffdU;
486 			goto COMPLETE;
487 #else
488 			ps->__mask = mask;
489 			ps->__wc = 0xffffU;
490 			__set_errno(EILSEQ);
491 			return (size_t) -1;	/* Illegal start byte! */
492 #endif
493 
494 		CONTINUE:
495 			while (n) {
496 				--n;
497 				if ((*s & 0xc0) != 0x80) {
498 					goto BAD;
499 				}
500 				mask <<= 5;
501 				wc <<= 6;
502 				wc += (*s & 0x3f);	/* keep seperate for bcc (smaller code) */
503 				++s;
504 			START:
505 				wc &= ~(mask << 1);
506 
507 				if ((wc & mask) == 0) {	/* Character completed. */
508 					if ((mask >>= 5) == 0x40) {
509 						mask += mask;
510 					}
511 					/* Check for invalid sequences (longer than necessary)
512 					 * and invalid chars.  */
513 					if ( (wc < mask) /* Sequence not minimal length. */
514 #ifdef KUHN
515 #if UTF_8_MAX_LEN == 3
516 #error broken since mask can overflow!!
517 						 /* For plane 0, these are the only defined values.*/
518 						 || (wc > 0xfffdU)
519 #else
520 						 /* Note that we don't need to worry about exceeding */
521 						 /* 31 bits as that is the most that UTF-8 provides. */
522 						 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
523 #endif
524 						 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
525 #endif /* KUHN */
526 						 ) {
527 						goto BAD;
528 					}
529 					goto COMPLETE;
530 				}
531 			}
532 			/* Character potentially valid but incomplete. */
533 			if (!allow_continuation) {
534 				if (count != wn) {
535 					return 0;
536 				}
537 				/* NOTE: The following can fail if you allow and then disallow
538 				 * continuation!!! */
539 #if UTF_8_MAX_LEN == 3
540 #error broken since mask can overflow!!
541 #endif
542 				/* Need to back up... */
543 				do {
544 					--s;
545 				} while ((mask >>= 5) >= 0x40);
546 				goto DONE;
547 			}
548 			ps->__mask = (wchar_t) mask;
549 			ps->__wc = (wchar_t) wc;
550 			*src = s;
551 			return (size_t) -2;
552 		}
553 	COMPLETE:
554 		*pwc = wc;
555 		pwc += incr;
556 	}
557 #ifdef DECODER
558 	while (--count);
559 #else
560 	while (wc && --count);
561 
562 	if (!wc) {
563 		s = NULL;
564 	}
565 #endif
566 
567  DONE:
568 	/* ps->__wc is irrelavent here. */
569 	ps->__mask = 0;
570 	if (pwc != wcbuf) {
571 		*src = s;
572 	}
573 
574 	return wn - count;
575 }
576 
577 #endif
578 /**********************************************************************/
579 #ifdef L__wchar_wcsntoutf8s
580 
_wchar_wcsntoutf8s(char * __restrict s,size_t n,const wchar_t ** __restrict src,size_t wn)581 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
582 						  const wchar_t **__restrict src, size_t wn)
583 {
584 	register char *p;
585 	size_t len, t;
586 	__uwchar_t wc;
587 	const __uwchar_t *swc;
588 	int store;
589 	char buf[MB_LEN_MAX];
590 	char m;
591 
592 	store = 1;
593 	/* NOTE: The following is an AWFUL HACK!  In order to support %ls in
594 	 * printf, we need to be able to compute the number of bytes needed
595 	 * for the mbs conversion, not to exceed the precision specified.
596 	 * But if dst is NULL, the return value is the length assuming a
597 	 * sufficiently sized buffer.  So, we allow passing of (char *) src
598 	 * as dst in order to flag that we really want the length, subject
599 	 * to the restricted buffer size and no partial conversions.
600 	 * See wcsnrtombs() as well. */
601 	if (!s || (s == ((char *) src))) {
602 		if (!s) {
603 			n = SIZE_MAX;
604 		}
605 		s = buf;
606 		store = 0;
607 	}
608 
609 	t = n;
610 	swc = (const __uwchar_t *) *src;
611 
612 	assert(swc != NULL);
613 
614 	while (wn && t) {
615 		wc = *swc;
616 
617 		*s = wc;
618 		len = 1;
619 
620 		if (wc >= 0x80) {
621 #ifdef KUHN
622 			if (
623 #if UTF_8_MAX_LEN == 3
624 				/* For plane 0, these are the only defined values.*/
625 				/* Note that we don't need to worry about exceeding */
626 				/* 31 bits as that is the most that UTF-8 provides. */
627 				(wc > 0xfffdU)
628 #else
629 				/* UTF_8_MAX_LEN == 6 */
630 				(wc > 0x7fffffffUL)
631 				|| ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
632 #endif
633 				|| ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
634 				) {
635 				__set_errno(EILSEQ);
636 				return (size_t) -1;
637 			}
638 #else  /* KUHN */
639 #if UTF_8_MAX_LEN != 3
640 			if (wc > 0x7fffffffUL) { /* Value too large. */
641 				__set_errno(EILSEQ);
642 				return (size_t) -1;
643 			}
644 #endif
645 #endif /* KUHN */
646 
647 			wc >>= 1;
648 			p = s;
649 			do {
650 				++p;
651 			} while (wc >>= 5);
652 			wc = *swc;
653 			if ((len = p - s) > t) { /* Not enough space. */
654 				break;
655 			}
656 
657 			m = 0x80;
658 			while( p>s ) {
659 				m = (m >> 1) | 0x80;
660 				*--p = (wc & 0x3f) | 0x80;
661 				wc >>= 6;
662 			}
663 			*s |= (m << 1);
664 		} else if (wc == 0) {	/* End of string. */
665 			swc = NULL;
666 			break;
667 		}
668 
669 		++swc;
670 		--wn;
671 		t -= len;
672 		if (store) {
673 			s += len;
674 		}
675 	}
676 
677 	if (store) {
678 		*src = (const wchar_t *) swc;
679 	}
680 
681 	return n - t;
682 }
683 
684 
685 #endif
686 /**********************************************************************/
687 #ifdef L_mbsnrtowcs
688 
689 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
690 
/* Convert at most NMC bytes of a multibyte string to at most len wide
 * characters.
 *
 * dst: destination.  NULL means "count only" with len treated as SIZE_MAX.
 *      Passing (wchar_t *) ps also means "count only" but keeps the
 *      caller's len limit (a hack used for %s in wprintf; see
 *      _wchar_utf8sntowcs() as well).
 * src: in/out pointer to the input.  In the single-byte paths it is
 *      updated only when a real destination was given, and becomes NULL
 *      when the nul character was converted.
 * ps:  conversion state; NULL selects a function-private static state.
 *
 * Returns the number of wide characters produced (the nul is not counted),
 * or (size_t)-1 with errno set to EILSEQ on an invalid byte.  In the UTF-8
 * encoding the work is delegated to _wchar_utf8sntowcs(), and its
 * (size_t)-2 result is reported as 0. */
size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
					size_t NMC, size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t internal_state;	/* Zero-initialized static storage. */
	wchar_t scratch[1];
	const char *in;
	size_t left;
	int step;

	if (ps == NULL) {
		ps = &internal_state;
	}

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	if (ENCODING == __ctype_encoding_utf8) {
		size_t r;

		r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1);
		if (r == (size_t) -2) {
			r = 0;
		}
		return r;
	}
#endif

	step = 1;
	/* Counting modes: write into the scratch cell and never advance. */
	if ((dst == NULL) || (dst == ((wchar_t *) ps))) {
		if (dst == NULL) {
			len = SIZE_MAX;
		}
		dst = scratch;
		step = 0;
	}

	/* Every remaining encoding is single-byte, so the number of input
	 * bytes also bounds the number of output characters. */
	if (NMC < len) {
		len = NMC;
	}

	in = *src;
	left = len;

#ifdef __CTYPE_HAS_8_BIT_LOCALES
	if (ENCODING == __ctype_encoding_8_bit) {
		wchar_t wc;

		for ( ; left; --left, ++in, dst += step) {
			wc = (unsigned char) *in;
			if (wc >= 0x80) {	/* Non-ASCII: look up in the locale table. */
				wc -= 0x80;
				wc = __UCLIBC_CURLOCALE->tbl8c2wc[
						  (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
						   << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
				if (wc == 0) {	/* A zero table entry is treated as invalid. */
					__set_errno(EILSEQ);
					return (size_t) -1;
				}
			}
			*dst = wc;
			if (wc == 0) {		/* The nul is stored but not counted. */
				in = NULL;
				break;
			}
		}
		if (dst != scratch) {
			*src = in;
		}
		return len - left;
	}
#endif

#ifdef __UCLIBC_HAS_LOCALE__
	assert(ENCODING == __ctype_encoding_7_bit);
#endif

	for ( ; left; --left, ++in, dst += step) {
		*dst = (unsigned char) *in;
		if (*dst == 0) {		/* The nul is stored but not counted. */
			in = NULL;
			break;
		}
		if (*dst >= 0x80) {		/* Outside the 7-bit range. */
			__set_errno(EILSEQ);
			return (size_t) -1;
		}
	}
	if (dst != scratch) {
		*src = in;
	}
	return len - left;
}
libc_hidden_def(mbsnrtowcs)789 libc_hidden_def(mbsnrtowcs)
790 
791 #endif
792 /**********************************************************************/
793 #ifdef L_wcsnrtombs
794 
795 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
796 
797 /* Note: We completely ignore ps in all currently supported conversions.
798  * TODO: Check for valid state anyway? */
799 
800 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
801 					size_t NWC, size_t len, mbstate_t *__restrict ps)
802 {
803 	const __uwchar_t *s;
804 	size_t count;
805 	int incr;
806 	char buf[MB_LEN_MAX];
807 
808 #ifdef __CTYPE_HAS_UTF_8_LOCALES
809 	if (ENCODING == __ctype_encoding_utf8) {
810 		return _wchar_wcsntoutf8s(dst, len, src, NWC);
811 	}
812 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
813 
814 	incr = 1;
815 	/* NOTE: The following is an AWFUL HACK!  In order to support %ls in
816 	 * printf, we need to be able to compute the number of bytes needed
817 	 * for the mbs conversion, not to exceed the precision specified.
818 	 * But if dst is NULL, the return value is the length assuming a
819 	 * sufficiently sized buffer.  So, we allow passing of (char *) src
820 	 * as dst in order to flag that we really want the length, subject
821 	 * to the restricted buffer size and no partial conversions.
822 	 * See _wchar_wcsntoutf8s() as well. */
823 	if (!dst || (dst == ((char *) src))) {
824 		if (!dst) {
825 			len = SIZE_MAX;
826 		}
827 		dst = buf;
828 		incr = 0;
829 	}
830 
831 	/* Since all the following encodings are single-byte encodings... */
832 	if (len > NWC) {
833 		len = NWC;
834 	}
835 
836 	count = len;
837 	s = (const __uwchar_t *) *src;
838 
839 #ifdef __CTYPE_HAS_8_BIT_LOCALES
840 	if (ENCODING == __ctype_encoding_8_bit) {
841 		__uwchar_t wc;
842 		__uwchar_t u;
843 		while (count) {
844 			if ((wc = *s) <= 0x7f) {
845 				if (!(*dst = (unsigned char) wc)) {
846 					s = NULL;
847 					break;
848 				}
849 			} else {
850 				u = 0;
851 				if (wc <= Cwc2c_DOMAIN_MAX) {
852 					u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
853 														+ Cwc2c_TT_SHIFT)];
854 					u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
855 									+ ((wc >> Cwc2c_TT_SHIFT)
856 									   & ((1 << Cwc2c_TI_SHIFT)-1))];
857 					u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
858 									+ (u << Cwc2c_TT_SHIFT)
859 									+ (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
860 				}
861 
862 #ifdef __WCHAR_REPLACEMENT_CHAR
863 				*dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
864 #else  /* __WCHAR_REPLACEMENT_CHAR */
865 				if (!u) {
866 					goto BAD;
867 				}
868 				*dst = (unsigned char) u;
869 #endif /* __WCHAR_REPLACEMENT_CHAR */
870 			}
871 			++s;
872 			dst += incr;
873 			--count;
874 		}
875 		if (dst != buf) {
876 			*src = (const wchar_t *) s;
877 		}
878 		return len - count;
879 	}
880 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
881 
882 #ifdef __UCLIBC_HAS_LOCALE__
883 	assert(ENCODING == __ctype_encoding_7_bit);
884 #endif
885 
886 	while (count) {
887 		if (*s >= 0x80) {
888 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
889 		BAD:
890 #endif
891 			__set_errno(EILSEQ);
892 			return (size_t) -1;
893 		}
894 		if ((*dst = (unsigned char) *s) == 0) {
895 			s = NULL;
896 			break;
897 		}
898 		++s;
899 		dst += incr;
900 		--count;
901 	}
902 	if (dst != buf) {
903 		*src = (const wchar_t *) s;
904 	}
905 	return len - count;
906 }
907 libc_hidden_def(wcsnrtombs)
908 
909 #endif
910 /**********************************************************************/
911 #ifdef L_wcswidth
912 
913 
914 #ifdef __UCLIBC_MJN3_ONLY__
915 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
916 #warning TODO: Update wcwidth to match latest by Kuhn.
917 #endif
918 
919 #if defined(__UCLIBC_HAS_LOCALE__) && \
920 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
921 
922 static const unsigned char new_idx[] = {
923 	0,    5,    5,    6,   10,   15,   28,   39,
924 	48,   48,   71,   94,  113,  128,  139,  154,
925 	175,  186,  188,  188,  188,  188,  188,  188,
926 	203,  208,  208,  208,  208,  208,  208,  208,
927 	208,  219,  219,  219,  222,  222,  222,  222,
928 	222,  222,  222,  222,  222,  222,  222,  224,
929 	224,  231,  231,  231,  231,  231,  231,  231,
930 	231,  231,  231,  231,  231,  231,  231,  231,
931 	231,  231,  231,  231,  231,  231,  231,  231,
932 	231,  231,  231,  231,  231,  231,  231,  231,
933 	231,  231,  231,  231,  231,  231,  231,  231,
934 	231,  231,  231,  231,  231,  231,  231,  231,
935 	231,  231,  231,  231,  231,  231,  231,  231,
936 	231,  231,  231,  231,  231,  231,  231,  231,
937 	231,  231,  231,  231,  231,  231,  231,  231,
938 	231,  231,  231,  231,  231,  231,  231,  231,
939 	231,  231,  231,  231,  231,  231,  231,  231,
940 	231,  231,  231,  231,  231,  231,  231,  231,
941 	231,  231,  231,  231,  231,  231,  231,  231,
942 	231,  231,  231,  231,  231,  231,  231,  231,
943 	231,  231,  231,  231,  231,  233,  233,  233,
944 	233,  233,  233,  233,  234,  234,  234,  234,
945 	234,  234,  234,  234,  234,  234,  234,  234,
946 	234,  234,  234,  234,  234,  234,  234,  234,
947 	234,  234,  234,  234,  234,  234,  234,  234,
948 	234,  234,  234,  234,  234,  234,  234,  234,
949 	234,  234,  234,  234,  234,  234,  234,  234,
950 	236,  236,  236,  236,  236,  236,  236,  236,
951 	236,  236,  236,  236,  236,  236,  236,  236,
952 	236,  236,  236,  236,  236,  236,  236,  236,
953 	236,  236,  236,  236,  236,  236,  236,  236,
954 	236,  237,  237,  238,  241,  241,  242,  249,
955 	255,
956 };
957 
958 static const unsigned char new_tbl[] = {
959 	0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
960 	0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
961 	0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
962 	0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
963 	0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
964 	0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
965 	0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
966 	0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
967 	0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
968 	0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
969 	0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
970 	0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
971 	0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
972 	0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
973 	0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
974 	0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
975 	0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
976 	0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
977 	0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
978 	0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
979 	0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
980 	0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
981 	0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
982 	0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
983 	0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
984 	0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
985 	0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
986 	0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
987 	0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
988 	0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
989 	0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
990 	0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
991 };
992 
/*
 * Column-width values used by wcswidth() for wide characters in the range
 * 0x0100..0xffff.  wcswidth() does a range search over new_tbl (bounded by
 * new_idx) and uses the index it ends on to read the width from this table.
 * wcswidth() adds the value directly to its running count and does not
 * expect to land on any of the -1 entries.
 */
static const signed char new_wtbl[] = {
	0,   -1,    1,   -1,    1,    1,    0,    1,
	0,    1,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    2,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    2,    1,    1,    2,
	2,    0,    2,    1,    2,    0,    2,    2,
	1,    1,    2,    1,    1,    2,    1,    0,
	1,    1,    0,    1,    0,    1,    2,    1,
	0,    2,    1,    2,    1,    0,    1,
};
1027 
1028 
wcswidth(const wchar_t * pwcs,size_t n)1029 int wcswidth(const wchar_t *pwcs, size_t n)
1030 {
1031 	int h, l, m, count;
1032 	wchar_t wc;
1033 	unsigned char b;
1034 
1035 	if (ENCODING == __ctype_encoding_7_bit) {
1036 		size_t i;
1037 
1038 		for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1039 			if (pwcs[i] != (pwcs[i] & 0x7f)) {
1040 				return -1;
1041 			}
1042 		}
1043 	}
1044 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1045 	else if (ENCODING == __ctype_encoding_8_bit) {
1046 		mbstate_t mbstate;
1047 
1048 		mbstate.__mask = 0;			/* Initialize the mbstate. */
1049 		if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1050 			return -1;
1051 		}
1052 	}
1053 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1054 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1055 	/* For stricter handling of allowed unicode values... see comments above. */
1056 	else if (ENCODING == __ctype_encoding_utf8) {
1057 		size_t i;
1058 
1059 		for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1060 			if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1061 				 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1062 				) {
1063 				return -1;
1064 			}
1065 		}
1066 	}
1067 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1068 
1069 	for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1070 		if (wc <= 0xff) {
1071 			/* If we're here, wc != 0. */
1072 			if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1073 				return -1;
1074 			}
1075 			++count;
1076 			continue;
1077 		}
1078 		if (((unsigned int) wc) <= 0xffff) {
1079 			b = wc & 0xff;
1080 			h = (wc >> 8);
1081 			l = new_idx[h];
1082 			h = new_idx[h+1];
1083 			while ((m = (l+h) >> 1) != l) {
1084 				if (b >= new_tbl[m]) {
1085 					l = m;
1086 				} else {		/* wc < tbl[m] */
1087 					h = m;
1088 				}
1089 			}
1090 			count += new_wtbl[l]; /* none should be -1. */
1091 			continue;
1092 		}
1093 
1094 		/* Redo this to minimize average number of compares?*/
1095 		if (wc >= 0x1d167) {
1096 			if (wc <= 0x1d1ad) {
1097 				if ((wc <= 0x1d169
1098 					 || (wc >= 0x1d173
1099 						 && (wc <= 0x1d182
1100 							 || (wc >= 0x1d185
1101 								 && (wc <= 0x1d18b
1102 									 || (wc >= 0x1d1aa))))))
1103 					) {
1104 					continue;
1105 				}
1106 			} else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1107 				continue;
1108 			} else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1109 				++count;		/* need 2.. add one here */
1110 			}
1111 #if (WCHAR_MAX > 0x7fffffffL)
1112 			else if (wc > 0x7fffffffL) {
1113 				return -1;
1114 			}
1115 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1116 		}
1117 
1118 		++count;
1119 	}
1120 
1121 	return count;
1122 }
1123 
1124 #else  /*  __UCLIBC_HAS_LOCALE__ */
1125 
/*
 * Locale-less variant: return the number of display columns needed for at
 * most n wide characters of pwcs (stopping early at a nul), or -1 if any of
 * them is outside the 7-bit range or is a control character.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	const wchar_t *scan;
	size_t left;
	wchar_t wc;
	int width = 0;

	/* First pass: every character examined must fit in 7 bits. */
	for (scan = pwcs, left = n ; left && *scan ; ++scan, --left) {
		if ((*scan & 0x7f) != *scan) {
			return -1;
		}
	}

	/* Second pass: one column per printable character. */
	for ( ; n ; n--) {
		wc = *pwcs++;
		if (!wc) {
			break;
		}
		if (wc > 0xff) {
			return -1;
		}
		/* wc is nonzero here, so this rejects the control ranges only. */
		if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
			return -1;
		}
		++width;
	}

	return width;
}
1153 
1154 #endif /*  __UCLIBC_HAS_LOCALE__ */
1155 
libc_hidden_def(wcswidth)1156 libc_hidden_def(wcswidth)
1157 
1158 #endif
1159 /**********************************************************************/
1160 #ifdef L_wcwidth
1161 
1162 
/* Column width of a single wide character, computed by handing it to
 * wcswidth() as a one-element string. */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
1167 
1168 #endif
1169 /**********************************************************************/
1170 
1171 
/* Conversion descriptor that iconv_open() allocates and hands out as an
 * iconv_t; iconv() casts the iconv_t back to this type. */
typedef struct {
	mbstate_t tostate;		/* output-side state; iconv() only resets it */
	mbstate_t fromstate;	/* input-side state, passed to the UTF-8 decoder */
	int tocodeset;			/* target codeset id as returned by find_codeset() */
	int fromcodeset;		/* source codeset id; bit 0 is toggled when a byte-swapped BOM is read */
	int frombom;			/* nonzero until the first input character has been checked for a BOM */
	int tobom;				/* nonzero while a BOM still has to be written to the output */
	int fromcodeset0;		/* initial fromcodeset, restored when iconv() resets the descriptor */
	int frombom0;			/* initial frombom, restored on reset */
	int tobom0;				/* initial tobom, restored on reset */
	int skip_invalid_input;		/* To support iconv -c option. */
} _UC_iconv_t;
1184 
/* For the multibyte codesets (codeset ids >= 0xe0), the id is a bit field:
 * bit 0 means swap endian
 * bit 1 means 2 byte
 * bit 2 means 4 byte
 * bit 4 means a byte order mark (BOM) is used
 */
1191 
1192 #if defined L_iconv && defined _LIBC
1193 /* Used externally only by iconv utility */
1194 extern const unsigned char __iconv_codesets[];
1195 libc_hidden_proto(__iconv_codesets)
1196 #endif
1197 
/*
 * Table of the built-in codeset names, stored as one string literal.
 * Each record is laid out as:
 *   byte 0   distance in bytes from this record to the next one
 *   byte 1   codeset id
 *   byte 2.. codeset name, nul terminated
 * find_codeset() steps through the records by adding byte 0 to its cursor
 * and stops when it reads a zero length byte.  The last record has no
 * explicit nul and its length byte (7) points at the string literal's own
 * terminator, which therefore also serves as the end-of-table marker.
 *
 * NOTE(review): in the big-endian branch the "UTF-32" and "UTF-16" ids
 * (0xe4, 0xea) do not have bit 4 set, whereas the little-endian branch uses
 * 0xf4 and 0xfa -- confirm whether that difference is intended.
 */
#if defined L_iconv || defined L_iconv_main
# ifdef L_iconv_main
static
# endif
const unsigned char __iconv_codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
#endif
1235 #if defined L_iconv && defined _LIBC
1236 libc_hidden_data_def(__iconv_codesets)
1237 #endif
1238 
1239 
1240 #ifdef L_iconv
1241 
1242 #include <iconv.h>
1243 #include <string.h>
1244 #include <endian.h>
1245 #include <byteswap.h>
1246 
1247 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1248 #error unsupported endianness for iconv
1249 #endif
1250 
1251 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1252 #error currently iconv requires 8 bit locales
1253 #endif
1254 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1255 #error currently iconv requires UTF-8 locales
1256 #endif
1257 
1258 
/*
 * Codeset ids that iconv() tests against.  The values correspond to the id
 * bytes stored in __iconv_codesets.  iconv() compares IC_WCHAR_T, IC_UTF_8
 * and IC_ASCII for equality, uses IC_MULTIBYTE as a lower bound
 * (id >= IC_MULTIBYTE selects the fixed-width 2/4 byte path), and uses
 * IC_UCS_4 and IC_UTF_16 as bit masks ((id & IC_x) == IC_x).
 */
enum {
	IC_WCHAR_T = 0xe0,
	IC_MULTIBYTE = 0xe0,	/* lowest id of the 2/4 byte codesets */
#if __BYTE_ORDER == __BIG_ENDIAN
	IC_UCS_4 =	0xec,
	IC_UTF_32 = 0xe4,
	IC_UCS_2 =	0xe2,
	IC_UTF_16 = 0xea,
#else
	IC_UCS_4 =	0xed,
	IC_UTF_32 = 0xe5,
	IC_UCS_2 =	0xe3,
	IC_UTF_16 = 0xeb,
#endif
	IC_UTF_8 = 2,
	IC_ASCII = 1
};
1276 
1277 
find_codeset(const char * name)1278 static int find_codeset(const char *name)
1279 {
1280 	const unsigned char *s;
1281 	int codeset;
1282 
1283 	for (s = __iconv_codesets; *s; s += *s) {
1284 		if (!strcasecmp((char*) (s + 2), name)) {
1285 			return s[1];
1286 		}
1287 	}
1288 
1289 	/* The following is ripped from find_locale in locale.c. */
1290 
1291 	/* TODO: maybe CODESET_LIST + *s ??? */
1292 	/* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1293 	codeset = 2;
1294 	s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1295 	do {
1296 		++codeset;		/* Increment codeset first. */
1297 		if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1298 			return codeset;
1299 		}
1300 	} while (*++s);
1301 
1302 	return 0;			/* No matching codeset! */
1303 }
1304 
iconv_open(const char * tocode,const char * fromcode)1305 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1306 {
1307 	register _UC_iconv_t *px;
1308 	int tocodeset, fromcodeset;
1309 
1310 	if (((tocodeset = find_codeset(tocode)) != 0)
1311 		&& ((fromcodeset = find_codeset(fromcode)) != 0)) {
1312 		if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1313 			px->tocodeset = tocodeset;
1314 			px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0;
1315 			px->fromcodeset0 = px->fromcodeset = fromcodeset;
1316 			px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0;
1317 			px->skip_invalid_input = px->tostate.__mask
1318 				= px->fromstate.__mask = 0;
1319 			return (iconv_t) px;
1320 		}
1321 	} else {
1322 		__set_errno(EINVAL);
1323 	}
1324 	return (iconv_t)(-1);
1325 }
1326 
iconv_close(iconv_t cd)1327 int weak_function iconv_close(iconv_t cd)
1328 {
1329 	free(cd);
1330 
1331 	return 0;
1332 }
1333 
iconv(iconv_t cd,char ** __restrict inbuf,size_t * __restrict inbytesleft,char ** __restrict outbuf,size_t * __restrict outbytesleft)1334 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1335 						   size_t *__restrict inbytesleft,
1336 						   char **__restrict outbuf,
1337 						   size_t *__restrict outbytesleft)
1338 {
1339 	_UC_iconv_t *px = (_UC_iconv_t *) cd;
1340 	size_t nrcount, r;
1341 	wchar_t wc, wc2;
1342 	int inci, inco;
1343 
1344 	assert(px != (_UC_iconv_t *)(-1));
1345 	assert(sizeof(wchar_t) == 4);
1346 
1347 	if (!inbuf || !*inbuf) {	/* Need to reinitialze conversion state. */
1348 		/* Note: For shift-state encodings we possibly need to output the
1349 		 * shift sequence to return to initial state! */
1350 		if ((px->fromcodeset & 0xf0) == 0xe0) {
1351 		}
1352 		px->tostate.__mask = px->fromstate.__mask = 0;
1353 		px->fromcodeset = px->fromcodeset0;
1354 		px->tobom = px->tobom0;
1355 		px->frombom = px->frombom0;
1356 		return 0;
1357 	}
1358 
1359 	nrcount = 0;
1360 	while (*inbytesleft) {
1361 		if (!*outbytesleft) {
1362 		TOO_BIG:
1363 			__set_errno(E2BIG);
1364 			return (size_t) -1;
1365 		}
1366 
1367 		inci = inco = 1;
1368 		if (px->fromcodeset >= IC_MULTIBYTE) {
1369 			inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1370 			if (*inbytesleft < inci) goto INVALID;
1371 			wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1372 				+ ((unsigned char)((*inbuf)[1]));
1373 			if (inci == 4) {
1374 				wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1375 					+ ((unsigned char)((*inbuf)[3])) + (wc << 16);
1376 				if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1377 			} else {
1378 				if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1379 				if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1380 					 && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1381 					) {			/* surrogate */
1382 					wc =- 0xd800U;
1383 					if (*inbytesleft < 4) goto INVALID;
1384 					wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1385 						+ ((unsigned char)((*inbuf)[3]));
1386 					if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1387 					if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1388 						goto ILLEGAL;
1389 					}
1390 					inci = 4;	/* Change inci here in case skipping illegals. */
1391 					wc = 0x10000UL + (wc << 10) + wc2;
1392 				}
1393 			}
1394 
1395 			if (px->frombom) {
1396 				px->frombom = 0;
1397 				if ((wc == 0xfeffU)
1398 					|| (wc == ((inci == 4)
1399 							   ? (((wchar_t) 0xfffe0000UL))
1400 							   : ((wchar_t)(0xfffeUL))))
1401 					) {
1402 					if (wc != 0xfeffU) {
1403 						px->fromcodeset ^= 1; /* toggle endianness */
1404 						wc = 0xfeffU;
1405 					}
1406 					if (!px->frombom) {
1407 						goto BOM_SKIP_OUTPUT;
1408 					}
1409 					goto GOT_BOM;
1410 				}
1411 			}
1412 
1413 			if (px->fromcodeset != IC_WCHAR_T) {
1414 				if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1415 										 ? 0x7fffffffUL : 0x10ffffUL)
1416 #ifdef KUHN
1417 					|| (((__uwchar_t)(wc - 0xfffeU)) < 2)
1418 					|| (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1419 #endif
1420 					) {
1421 					goto ILLEGAL;
1422 				}
1423 			}
1424 		} else if (px->fromcodeset == IC_UTF_8) {
1425 			const char *p = *inbuf;
1426 			r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1427 			if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1428 				if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1429 					assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1430 					if (r == (size_t)(-2)) {
1431 					INVALID:
1432 						__set_errno(EINVAL);
1433 					} else {
1434 						px->fromstate.__mask = 0;
1435 						inci = 1;
1436 					ILLEGAL:
1437 						if (px->skip_invalid_input) {
1438 							px->skip_invalid_input = 2;	/* flag for iconv utility */
1439 							goto BOM_SKIP_OUTPUT;
1440 						}
1441 						__set_errno(EILSEQ);
1442 					}
1443 					return (size_t)(-1);
1444 				}
1445 #ifdef __UCLIBC_MJN3_ONLY__
1446 #warning TODO: optimize this.
1447 #endif
1448 				if (p != NULL) { /* incomplete char case */
1449 					goto INVALID;
1450 				}
1451 				p = *inbuf + 1;	/* nul */
1452 			}
1453 			inci = p - *inbuf;
1454 		} else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) {	/* Non-ASCII... */
1455 			if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1456 				goto ILLEGAL;
1457 			} else {			/* some other 8-bit ascii-extension codeset */
1458 				const __codeset_8_bit_t *c8b
1459 					= __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1460 				wc -= 0x80;
1461 				wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1462 							 (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1463 							  << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1464 				if (!wc) {
1465 					goto ILLEGAL;
1466 				}
1467 			}
1468 		}
1469 
1470 
1471 		if (px->tobom) {
1472 			inci = 0;
1473 			wc = 0xfeffU;
1474 	GOT_BOM:
1475 			px->tobom = 0;
1476 		}
1477 
1478 		if (px->tocodeset >= IC_MULTIBYTE) {
1479 			inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1480 			if (*outbytesleft < inco) goto TOO_BIG;
1481 			if (px->tocodeset != IC_WCHAR_T) {
1482 				if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1483 										 ? 0x7fffffffUL : 0x10ffffUL)
1484 #ifdef KUHN
1485 					|| (((__uwchar_t)(wc - 0xfffeU)) < 2)
1486 					|| (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1487 #endif
1488 					) {
1489 				REPLACE_32:
1490 					wc = 0xfffd;
1491 					++nrcount;
1492 				}
1493 			}
1494 			if (inco == 4) {
1495 				if (px->tocodeset & 1) wc = bswap_32(wc);
1496 			} else {
1497 				if (((__uwchar_t)wc ) > 0xffffU) {
1498 					if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1499 						goto REPLACE_32;
1500 					}
1501 					if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1502 					wc2 = 0xdc00U + (wc & 0x3ff);
1503 					wc = 0xd800U + ((wc >> 10) & 0x3ff);
1504 					if (px->tocodeset & 1) {
1505 						wc = bswap_16(wc);
1506 						wc2 = bswap_16(wc2);
1507 					}
1508 					wc += (wc2 << 16);
1509 				} else if (px->tocodeset & 1) wc = bswap_16(wc);
1510 			}
1511 			(*outbuf)[0] = (char)((unsigned char)(wc));
1512 			(*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1513 			if (inco == 4) {
1514 				(*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1515 				(*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1516 			}
1517 		} else if (px->tocodeset == IC_UTF_8) {
1518 			const wchar_t *pw = &wc;
1519 			do {
1520 				r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1521 				if (r != (size_t)(-1)) {
1522 #ifdef __UCLIBC_MJN3_ONLY__
1523 #warning TODO: What happens for a nul?
1524 #endif
1525 					if (r == 0) {
1526 						if (wc != 0) {
1527 							goto TOO_BIG;
1528 						}
1529 						++r;
1530 					}
1531 					break;
1532 				}
1533 				wc = 0xfffdU;
1534 				++nrcount;
1535 			} while (1);
1536 			inco = r;
1537 		} else if (((__uwchar_t)(wc)) < 0x80) {
1538 		CHAR_GOOD:
1539 				**outbuf = wc;
1540 		} else {
1541 			if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1542 				const __codeset_8_bit_t *c8b
1543 					= __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1544 				__uwchar_t u;
1545 				u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1546 				u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1547 						 + ((wc >> Cwc2c_TT_SHIFT)
1548 							& ((1 << Cwc2c_TI_SHIFT)-1))];
1549 				wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1550 						 + (u << Cwc2c_TT_SHIFT)
1551 						 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1552 				if (wc) {
1553 					goto CHAR_GOOD;
1554 				}
1555 			}
1556 			**outbuf = '?';
1557 			++nrcount;
1558 		}
1559 
1560 		*outbuf += inco;
1561 		*outbytesleft -= inco;
1562 	BOM_SKIP_OUTPUT:
1563 		*inbuf += inci;
1564 		*inbytesleft -= inci;
1565 	}
1566 	return nrcount;
1567 }
1568 #endif
1569