1
2 /* Copyright (C) 2002, 2003, 2004 Manuel Novoa III
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, see
16 * <http://www.gnu.org/licenses/>.
17 */
18
19 /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
20 *
21 * Besides uClibc, I'm using this code in my libc for elks, which is
22 * a 16-bit environment with a fairly limited compiler. It would make
23 * things much easier for me if this file isn't modified unnecessarily.
24 * In particular, please put any new or replacement functions somewhere
25 * else, and modify the makefile to use your version instead.
26 * Thanks. Manuel
27 *
28 * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
29
30
31 /* May 23, 2002 Initial Notes:
32 *
33 * I'm still tweaking this stuff, but it passes the tests I've thrown
34 * at it, and Erik needs it for the gcc port. The glibc extension
35 * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36 * in the glibc source. I also need to fix the behavior of
37 * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
38 *
39 * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40 * file on my platform (x86) show about 5-10% faster conversion speed than
41 * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42 * individual mbrtowc()/wcrtomb() calls.
43 *
44 * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45 * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
46 * needs to deal gracefully with whatever is sent to it. In that mode,
47 * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
48 * an arg to force that behavior, so the interface will be changing.
49 *
50 * I need to fix the error checking for 16-bit wide chars. This isn't
51 * an issue for uClibc, but may be for ELKS. I'm currently not sure
52 * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
53 *
54 * July 1, 2002
55 *
56 * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57 * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
58 * locales.
59 * Enabled building of a C/POSIX-locale-only version, so full locale support
60 * no longer needs to be enabled.
61 *
62 * Nov 4, 2002
63 *
64 * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
65 * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66 * order to support %ls in printf. See comments below for details.
67 * Change behaviour of wc<->mb functions when in the C locale. Now they do
68 * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
69 * and consistency with the stds requirements that a printf format string be
70 * a valid multibyte string beginning and ending in its initial shift state.
71 *
72 * Nov 5, 2002
73 *
74 * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
75 *
76 * Nov 7, 2002
77 *
78 * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79 * Added some size/speed optimizations and integrated it into my locale
80 * framework. Minimally tested at the moment, but the stub C-locale
81 * version (which most people would probably be using) should be fine.
82 *
83 * Nov 21, 2002
84 *
85 * Revert the wc<->mb changes from earlier this month involving the C-locale.
86 * Add a couple of ugly hacks to support *wprintf.
87 * Add a mini iconv() and iconv implementation (requires locale support).
88 *
89 * Aug 1, 2003
90 * Bug fix for mbrtowc.
91 *
92 * Aug 18, 2003
93 * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
94 *
95 * Feb 11, 2004
96 * Bug fix: Fix size check for remaining output space in iconv().
97 *
98 * Manuel
99 */
100 #ifdef _LIBC
101 #include <errno.h>
102 #include <stddef.h>
103 #include <limits.h>
104 #include <stdint.h>
105 #include <inttypes.h>
106 #include <stdlib.h>
107 #include <stdio.h>
108 #include <assert.h>
109 #include <locale.h>
110 #include <wchar.h>
111 #include <bits/uClibc_uwchar.h>
112
113 /**********************************************************************/
114 #ifdef __UCLIBC_HAS_LOCALE__
115 #ifdef __UCLIBC_MJN3_ONLY__
116 #ifdef L_iswspace
117 /* generates one warning */
118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
119 #endif
120 #endif /* __UCLIBC_MJN3_ONLY__ */
121
122 #define ENCODING (__UCLIBC_CURLOCALE->encoding)
123
124 #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT
125 #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN
126 #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX
127 #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT
128 #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT
129 #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN
130
131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
133 #endif
134
135 #else /* __UCLIBC_HAS_LOCALE__ */
136
137 #ifdef __UCLIBC_MJN3_ONLY__
138 #ifdef L_btowc
139 /* emit only once */
140 #warning fix preprocessor logic testing locale settings
141 #endif
142 #endif
143
144 #define ENCODING (__ctype_encoding_7_bit)
145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
147 #endif
148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
150 #endif
151 #undef L__wchar_utf8sntowcs
152 #undef L__wchar_wcsntoutf8s
153
154 #endif /* __UCLIBC_HAS_LOCALE__ */
155 /**********************************************************************/
156
157 #if WCHAR_MAX > 0xffffUL
158 #define UTF_8_MAX_LEN 6
159 #else
160 #define UTF_8_MAX_LEN 3
161 #endif
162
163 #define KUHN 1
164
165 /* Implementation-specific work functions. */
166
167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
168 const char **__restrict src, size_t n,
169 mbstate_t *ps, int allow_continuation) attribute_hidden;
170
171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
172 const wchar_t **__restrict src, size_t wn) attribute_hidden;
173 #endif
174 /**********************************************************************/
175 #ifdef L_btowc
176
177
btowc(int c)178 wint_t btowc(int c)
179 {
180 #ifdef __CTYPE_HAS_8_BIT_LOCALES
181
182 wchar_t wc;
183 unsigned char buf[1];
184 mbstate_t mbstate;
185
186 if (c != EOF) {
187 *buf = (unsigned char) c;
188 mbstate.__mask = 0; /* Initialize the mbstate. */
189 if (mbrtowc(&wc, (char*) buf, 1, &mbstate) <= 1) {
190 return wc;
191 }
192 }
193 return WEOF;
194
195 #else /* !__CTYPE_HAS_8_BIT_LOCALES */
196
197 #ifdef __UCLIBC_HAS_LOCALE__
198 assert((ENCODING == __ctype_encoding_7_bit)
199 || (ENCODING == __ctype_encoding_utf8));
200 #endif
201
202 /* If we don't have 8-bit locale support, then this is trivial since
203 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
204 return (((unsigned int)c) < 0x80) ? c : WEOF;
205
206 #endif /* !__CTYPE_HAS_8_BIT_LOCALES */
207 }
libc_hidden_def(btowc)208 libc_hidden_def(btowc)
209
210 #endif
211 /**********************************************************************/
212 #ifdef L_wctob
213
214 /* Note: We completely ignore ps in all currently supported conversions. */
215
216
/*
 * wctob - convert a wide character to its single-byte representation.
 *
 * c: the wide character to convert.
 *
 * Returns the byte (as an int) when c maps to exactly one byte, otherwise EOF.
 */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char out[MB_LEN_MAX];

	/* Only a conversion that produces exactly one byte qualifies. */
	if (wcrtomb((char *) out, c, NULL) != 1) {
		return EOF;
	}
	return out[0];

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* Without 8-bit locale support only 0-0x7f has a one-byte form
	 * (C/POSIX and UTF-8 locales). */

	/* TODO: need unsigned version of wint_t... */
	if ((c < 0) || (c >= 0x80)) {
		return EOF;
	}
	return c;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
241
242 #endif
243 /**********************************************************************/
244 #ifdef L_mbsinit
245
mbsinit(const mbstate_t * ps)246 int mbsinit(const mbstate_t *ps)
247 {
248 return !ps || !ps->__mask;
249 }
libc_hidden_def(mbsinit)250 libc_hidden_def(mbsinit)
251
252 #endif
253 /**********************************************************************/
254 #ifdef L_mbrlen
255
256
257 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
258 {
259 static mbstate_t mbstate; /* Rely on bss 0-init. */
260
261 return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
262 }
libc_hidden_def(mbrlen)263 libc_hidden_def(mbrlen)
264
265 #endif
266 /**********************************************************************/
267 #ifdef L_mbrtowc
268
269
270 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
271 size_t n, mbstate_t *__restrict ps)
272 {
273 static mbstate_t mbstate; /* Rely on bss 0-init. */
274 wchar_t wcbuf[1];
275 const char *p;
276 size_t r;
277 char empty_string[1]; /* Avoid static to be fPIC friendly. */
278
279 if (!ps) {
280 ps = &mbstate;
281 }
282
283 if (!s) {
284 pwc = (wchar_t *) s; /* NULL */
285 empty_string[0] = 0; /* Init the empty string when necessary. */
286 s = empty_string;
287 n = 1;
288 } else if (*s == '\0') {
289 if (pwc)
290 *pwc = '\0';
291 /* According to the ISO C 89 standard this is the expected behaviour. */
292 return 0;
293 } else if (!n) {
294 /* TODO: change error code? */
295 #if 0
296 return (ps->__mask && (ps->__wc == 0xffffU))
297 ? ((size_t) -1) : ((size_t) -2);
298 #else
299 return 0;
300 #endif
301 }
302
303 p = s;
304
305 #ifdef __CTYPE_HAS_UTF_8_LOCALES
306 /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
307 if (ENCODING == __ctype_encoding_utf8) {
308 if (!pwc) {
309 pwc = wcbuf;
310 }
311 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
312 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
313 }
314 #endif
315
316 #ifdef __UCLIBC_MJN3_ONLY__
317 #warning TODO: This adds a trailing nul!
318 #endif /* __UCLIBC_MJN3_ONLY__ */
319
320 r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
321
322 if (((ssize_t) r) >= 0) {
323 if (pwc) {
324 *pwc = *wcbuf;
325 }
326 }
327 return (size_t) r;
328 }
libc_hidden_def(mbrtowc)329 libc_hidden_def(mbrtowc)
330
331 #endif
332 /**********************************************************************/
333 #ifdef L_wcrtomb
334
335
336 /* Note: We completely ignore ps in all currently supported conversions. */
337 /* TODO: Check for valid state anyway? */
338
339 size_t wcrtomb(register char *__restrict s, wchar_t wc,
340 mbstate_t *__restrict ps)
341 {
342 #ifdef __UCLIBC_MJN3_ONLY__
343 #warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc.
344 #endif /* __UCLIBC_MJN3_ONLY__ */
345 wchar_t wcbuf[1];
346 const wchar_t *pwc;
347 size_t r;
348 char buf[MB_LEN_MAX];
349
350 if (!s) {
351 s = buf;
352 wc = 0;
353 }
354
355 pwc = wcbuf;
356 wcbuf[0] = wc;
357
358 r = wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
359 return (r != 0) ? r : 1;
360 }
libc_hidden_def(wcrtomb)361 libc_hidden_def(wcrtomb)
362
363 #endif
364 /**********************************************************************/
365 #ifdef L_mbsrtowcs
366
367
368 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
369 size_t len, mbstate_t *__restrict ps)
370 {
371 static mbstate_t mbstate; /* Rely on bss 0-init. */
372
373 return mbsnrtowcs(dst, src, SIZE_MAX, len,
374 ((ps != NULL) ? ps : &mbstate));
375 }
libc_hidden_def(mbsrtowcs)376 libc_hidden_def(mbsrtowcs)
377
378 #endif
379 /**********************************************************************/
380 #ifdef L_wcsrtombs
381
382 /* Note: We completely ignore ps in all currently supported conversions.
383
384 * TODO: Check for valid state anyway? */
385
386
387 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
388 size_t len, mbstate_t *__restrict ps)
389 {
390 return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
391 }
libc_hidden_def(wcsrtombs)392 libc_hidden_def(wcsrtombs)
393
394 #endif
395 /**********************************************************************/
396 #ifdef L__wchar_utf8sntowcs
397
398 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
399 * UTF-8-test.txt stress test.
400 */
401 /* #define DECODER */
402
403 #ifdef DECODER
404 #ifndef KUHN
405 #define KUHN
406 #endif
407 #endif
408
/*
 * _wchar_utf8sntowcs - convert UTF-8 bytes to wide characters.
 *
 * pwc: destination for the wide characters.  NULL means "count only" and
 *      makes wn unlimited (SIZE_MAX).  Passing (wchar_t *) ps also means
 *      "count only", but wn is honoured (see the AWFUL HACK note below).
 * wn:  maximum number of wide characters to produce; 0 returns 0 at once.
 * src: address of the input pointer.  When storing, it is updated at DONE
 *      (set to NULL if a nul character ended the conversion in the
 *      non-DECODER build).  It is also updated when (size_t) -2 is returned.
 * n:   maximum number of input bytes to examine.
 * ps:  conversion state.  A nonzero __mask marks a partially decoded
 *      character held in __wc; __wc == 0xffffU marks the error state.
 * allow_continuation: nonzero lets an incomplete trailing sequence be saved
 *      in *ps; zero makes the function back up over it instead.
 *
 * Returns the number of wide characters converted (a terminating nul is
 * not counted), (size_t) -1 with errno set to EILSEQ for invalid input
 * (non-DECODER build), or (size_t) -2 for an incomplete sequence that was
 * saved in *ps.
 */
409 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
410 const char **__restrict src, size_t n,
411 mbstate_t *ps, int allow_continuation)
412 {
413 register const char *s;
414 __uwchar_t mask;
415 __uwchar_t wc;
416 wchar_t wcbuf[1];
417 size_t count;
418 int incr;
419
420 s = *src;
421
422 assert(s != NULL);
423 assert(ps != NULL);
424
425 incr = 1;
426 /* NOTE: The following is an AWFUL HACK! In order to support %s in
427 * wprintf, we need to be able to compute the number of wchars needed
428 * for the mbs conversion, not to exceed the precision specified.
429 * But if dst is NULL, the return value is the length assuming a
430 * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
431 * as pwc in order to flag that we really want the length, subject
432 * to the restricted buffer size and no partial conversions.
433 * See mbsnrtowcs() as well. */
434 if (!pwc || (pwc == ((wchar_t *)ps))) {
435 if (!pwc) {
436 wn = SIZE_MAX;
437 }
/* Count-only mode: every result lands in the one-element scratch buffer
 * and, with incr == 0, the output pointer never advances. */
438 pwc = wcbuf;
439 incr = 0;
440 }
441
442 /* This is really here only to support the glibc extension function
443 * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
444 * check on the validity of the mbstate. */
445 if (!(count = wn)) {
446 return 0;
447 }
448
449 if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
450 #ifdef DECODER
451 wc = (__uwchar_t) ps->__wc;
452 if (n) {
453 goto CONTINUE;
454 }
455 goto DONE;
456 #else
457 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
458 /* TODO: change error code here and below? */
459 if (n) {
460 goto CONTINUE;
461 }
462 goto DONE;
463 }
464 __set_errno(EILSEQ);
465 return (size_t) -1; /* We're in an error state. */
466 #endif
467 }
468
/* One iteration per output character.  Note that CONTINUE and START below
 * are entered by goto from outside the inner while loop. */
469 do {
470 if (!n) {
471 goto DONE;
472 }
473 --n;
474 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
475 mask = 0x40;
476 #ifdef __UCLIBC_MJN3_ONLY__
477 #warning TODO: Fix range for 16 bit wchar_t case.
478 #endif
/* Accept only lead bytes 0xc2..0xfd: the first test admits 0xc0..0xfd
 * and the second excludes 0xc0 and 0xc1. */
479 if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
480 (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
481 goto START;
482 }
483 BAD:
484 #ifdef DECODER
485 wc = 0xfffdU;
486 goto COMPLETE;
487 #else
/* Record the error state in *ps so that later calls fail as well. */
488 ps->__mask = mask;
489 ps->__wc = 0xffffU;
490 __set_errno(EILSEQ);
491 return (size_t) -1; /* Illegal start byte! */
492 #endif
493
494 CONTINUE:
495 while (n) {
496 --n;
497 if ((*s & 0xc0) != 0x80) {
498 goto BAD;
499 }
500 mask <<= 5;
501 wc <<= 6;
502 wc += (*s & 0x3f); /* keep separate for bcc (smaller code) */
503 ++s;
504 START:
/* Clear the marker bit just above mask; the character is complete once
 * the bit selected by mask is itself clear. */
505 wc &= ~(mask << 1);
506
507 if ((wc & mask) == 0) { /* Character completed. */
/* After this adjustment mask is the smallest value that needs a
 * sequence of this length (0x80 for two bytes, 0x800 for three, ...). */
508 if ((mask >>= 5) == 0x40) {
509 mask += mask;
510 }
511 /* Check for invalid sequences (longer than necessary)
512 * and invalid chars. */
513 if ( (wc < mask) /* Sequence not minimal length. */
514 #ifdef KUHN
515 #if UTF_8_MAX_LEN == 3
516 #error broken since mask can overflow!!
517 /* For plane 0, these are the only defined values.*/
518 || (wc > 0xfffdU)
519 #else
520 /* Note that we don't need to worry about exceeding */
521 /* 31 bits as that is the most that UTF-8 provides. */
522 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
523 #endif
524 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
525 #endif /* KUHN */
526 ) {
527 goto BAD;
528 }
529 goto COMPLETE;
530 }
531 }
532 /* Character potentially valid but incomplete. */
533 if (!allow_continuation) {
534 if (count != wn) {
535 return 0;
536 }
537 /* NOTE: The following can fail if you allow and then disallow
538 * continuation!!! */
539 #if UTF_8_MAX_LEN == 3
540 #error broken since mask can overflow!!
541 #endif
542 /* Need to back up... */
543 do {
544 --s;
545 } while ((mask >>= 5) >= 0x40);
546 goto DONE;
547 }
/* Save the partial character so that a later call can resume it. */
548 ps->__mask = (wchar_t) mask;
549 ps->__wc = (wchar_t) wc;
550 *src = s;
551 return (size_t) -2;
552 }
553 COMPLETE:
554 *pwc = wc;
555 pwc += incr;
556 }
557 #ifdef DECODER
558 while (--count);
559 #else
/* Stop at a nul character without decrementing count for it, so the
 * terminator is not included in the returned length. */
560 while (wc && --count);
561
562 if (!wc) {
563 s = NULL;
564 }
565 #endif
566
567 DONE:
568 /* ps->__wc is irrelevant here. */
569 ps->__mask = 0;
/* In count-only mode pwc still equals wcbuf and *src is left untouched. */
570 if (pwc != wcbuf) {
571 *src = s;
572 }
573
574 return wn - count;
575 }
576
577 #endif
578 /**********************************************************************/
579 #ifdef L__wchar_wcsntoutf8s
580
_wchar_wcsntoutf8s(char * __restrict s,size_t n,const wchar_t ** __restrict src,size_t wn)581 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
582 const wchar_t **__restrict src, size_t wn)
583 {
584 register char *p;
585 size_t len, t;
586 __uwchar_t wc;
587 const __uwchar_t *swc;
588 int store;
589 char buf[MB_LEN_MAX];
590 char m;
591
592 store = 1;
593 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
594 * printf, we need to be able to compute the number of bytes needed
595 * for the mbs conversion, not to exceed the precision specified.
596 * But if dst is NULL, the return value is the length assuming a
597 * sufficiently sized buffer. So, we allow passing of (char *) src
598 * as dst in order to flag that we really want the length, subject
599 * to the restricted buffer size and no partial conversions.
600 * See wcsnrtombs() as well. */
601 if (!s || (s == ((char *) src))) {
602 if (!s) {
603 n = SIZE_MAX;
604 }
605 s = buf;
606 store = 0;
607 }
608
609 t = n;
610 swc = (const __uwchar_t *) *src;
611
612 assert(swc != NULL);
613
614 while (wn && t) {
615 wc = *swc;
616
617 *s = wc;
618 len = 1;
619
620 if (wc >= 0x80) {
621 #ifdef KUHN
622 if (
623 #if UTF_8_MAX_LEN == 3
624 /* For plane 0, these are the only defined values.*/
625 /* Note that we don't need to worry about exceeding */
626 /* 31 bits as that is the most that UTF-8 provides. */
627 (wc > 0xfffdU)
628 #else
629 /* UTF_8_MAX_LEN == 6 */
630 (wc > 0x7fffffffUL)
631 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
632 #endif
633 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
634 ) {
635 __set_errno(EILSEQ);
636 return (size_t) -1;
637 }
638 #else /* KUHN */
639 #if UTF_8_MAX_LEN != 3
640 if (wc > 0x7fffffffUL) { /* Value too large. */
641 __set_errno(EILSEQ);
642 return (size_t) -1;
643 }
644 #endif
645 #endif /* KUHN */
646
647 wc >>= 1;
648 p = s;
649 do {
650 ++p;
651 } while (wc >>= 5);
652 wc = *swc;
653 if ((len = p - s) > t) { /* Not enough space. */
654 break;
655 }
656
657 m = 0x80;
658 while( p>s ) {
659 m = (m >> 1) | 0x80;
660 *--p = (wc & 0x3f) | 0x80;
661 wc >>= 6;
662 }
663 *s |= (m << 1);
664 } else if (wc == 0) { /* End of string. */
665 swc = NULL;
666 break;
667 }
668
669 ++swc;
670 --wn;
671 t -= len;
672 if (store) {
673 s += len;
674 }
675 }
676
677 if (store) {
678 *src = (const wchar_t *) swc;
679 }
680
681 return n - t;
682 }
683
684
685 #endif
686 /**********************************************************************/
687 #ifdef L_mbsnrtowcs
688
689 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
690
mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t NMC,size_t len,mbstate_t * __restrict ps)691 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
692 size_t NMC, size_t len, mbstate_t *__restrict ps)
693 {
694 static mbstate_t mbstate; /* Rely on bss 0-init. */
695 wchar_t wcbuf[1];
696 const char *s;
697 size_t count;
698 int incr;
699
700 if (!ps) {
701 ps = &mbstate;
702 }
703
704 #ifdef __CTYPE_HAS_UTF_8_LOCALES
705 if (ENCODING == __ctype_encoding_utf8) {
706 size_t r;
707 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
708 != (size_t) -2) ? r : 0;
709 }
710 #endif
711 incr = 1;
712 /* NOTE: The following is an AWFUL HACK! In order to support %s in
713 * wprintf, we need to be able to compute the number of wchars needed
714 * for the mbs conversion, not to exceed the precision specified.
715 * But if dst is NULL, the return value is the length assuming a
716 * sufficiently sized buffer. So, we allow passing of ((wchar_t *)ps)
717 * as dst in order to flag that we really want the length, subject
718 * to the restricted buffer size and no partial conversions.
719 * See _wchar_utf8sntowcs() as well. */
720 if (!dst || (dst == ((wchar_t *)ps))) {
721 if (!dst) {
722 len = SIZE_MAX;
723 }
724 dst = wcbuf;
725 incr = 0;
726 }
727
728 /* Since all the following encodings are single-byte encodings... */
729 if (len > NMC) {
730 len = NMC;
731 }
732
733 count = len;
734 s = *src;
735
736 #ifdef __CTYPE_HAS_8_BIT_LOCALES
737 if (ENCODING == __ctype_encoding_8_bit) {
738 wchar_t wc;
739 while (count) {
740 if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
741 wc -= 0x80;
742 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
743 (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
744 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
745 if (!wc) {
746 goto BAD;
747 }
748 }
749 if (!(*dst = wc)) {
750 s = NULL;
751 break;
752 }
753 dst += incr;
754 ++s;
755 --count;
756 }
757 if (dst != wcbuf) {
758 *src = s;
759 }
760 return len - count;
761 }
762 #endif
763
764 #ifdef __UCLIBC_HAS_LOCALE__
765 assert(ENCODING == __ctype_encoding_7_bit);
766 #endif
767
768 while (count) {
769 if ((*dst = (unsigned char) *s) == 0) {
770 s = NULL;
771 break;
772 }
773 if (*dst >= 0x80) {
774 #ifdef __CTYPE_HAS_8_BIT_LOCALES
775 BAD:
776 #endif
777 __set_errno(EILSEQ);
778 return (size_t) -1;
779 }
780 ++s;
781 dst += incr;
782 --count;
783 }
784 if (dst != wcbuf) {
785 *src = s;
786 }
787 return len - count;
788 }
libc_hidden_def(mbsnrtowcs)789 libc_hidden_def(mbsnrtowcs)
790
791 #endif
792 /**********************************************************************/
793 #ifdef L_wcsnrtombs
794
795 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
796
797 /* Note: We completely ignore ps in all currently supported conversions.
798 * TODO: Check for valid state anyway? */
799
800 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
801 size_t NWC, size_t len, mbstate_t *__restrict ps)
802 {
803 const __uwchar_t *s;
804 size_t count;
805 int incr;
806 char buf[MB_LEN_MAX];
807
808 #ifdef __CTYPE_HAS_UTF_8_LOCALES
809 if (ENCODING == __ctype_encoding_utf8) {
810 return _wchar_wcsntoutf8s(dst, len, src, NWC);
811 }
812 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
813
814 incr = 1;
815 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
816 * printf, we need to be able to compute the number of bytes needed
817 * for the mbs conversion, not to exceed the precision specified.
818 * But if dst is NULL, the return value is the length assuming a
819 * sufficiently sized buffer. So, we allow passing of (char *) src
820 * as dst in order to flag that we really want the length, subject
821 * to the restricted buffer size and no partial conversions.
822 * See _wchar_wcsntoutf8s() as well. */
823 if (!dst || (dst == ((char *) src))) {
824 if (!dst) {
825 len = SIZE_MAX;
826 }
827 dst = buf;
828 incr = 0;
829 }
830
831 /* Since all the following encodings are single-byte encodings... */
832 if (len > NWC) {
833 len = NWC;
834 }
835
836 count = len;
837 s = (const __uwchar_t *) *src;
838
839 #ifdef __CTYPE_HAS_8_BIT_LOCALES
840 if (ENCODING == __ctype_encoding_8_bit) {
841 __uwchar_t wc;
842 __uwchar_t u;
843 while (count) {
844 if ((wc = *s) <= 0x7f) {
845 if (!(*dst = (unsigned char) wc)) {
846 s = NULL;
847 break;
848 }
849 } else {
850 u = 0;
851 if (wc <= Cwc2c_DOMAIN_MAX) {
852 u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
853 + Cwc2c_TT_SHIFT)];
854 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
855 + ((wc >> Cwc2c_TT_SHIFT)
856 & ((1 << Cwc2c_TI_SHIFT)-1))];
857 u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
858 + (u << Cwc2c_TT_SHIFT)
859 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
860 }
861
862 #ifdef __WCHAR_REPLACEMENT_CHAR
863 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
864 #else /* __WCHAR_REPLACEMENT_CHAR */
865 if (!u) {
866 goto BAD;
867 }
868 *dst = (unsigned char) u;
869 #endif /* __WCHAR_REPLACEMENT_CHAR */
870 }
871 ++s;
872 dst += incr;
873 --count;
874 }
875 if (dst != buf) {
876 *src = (const wchar_t *) s;
877 }
878 return len - count;
879 }
880 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
881
882 #ifdef __UCLIBC_HAS_LOCALE__
883 assert(ENCODING == __ctype_encoding_7_bit);
884 #endif
885
886 while (count) {
887 if (*s >= 0x80) {
888 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
889 BAD:
890 #endif
891 __set_errno(EILSEQ);
892 return (size_t) -1;
893 }
894 if ((*dst = (unsigned char) *s) == 0) {
895 s = NULL;
896 break;
897 }
898 ++s;
899 dst += incr;
900 --count;
901 }
902 if (dst != buf) {
903 *src = (const wchar_t *) s;
904 }
905 return len - count;
906 }
907 libc_hidden_def(wcsnrtombs)
908
909 #endif
910 /**********************************************************************/
911 #ifdef L_wcswidth
912
913
914 #ifdef __UCLIBC_MJN3_ONLY__
915 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
916 #warning TODO: Update wcwidth to match latest by Kuhn.
917 #endif
918
919 #if defined(__UCLIBC_HAS_LOCALE__) && \
920 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
921
/* Page index used by wcswidth() for wide characters in 0x100..0xffff.
 * It is indexed by the high byte h = (wc >> 8): new_idx[h] and new_idx[h+1]
 * are the initial lower and upper bounds of the binary search in new_tbl.
 * The table has 257 entries so that new_idx[h+1] exists for h == 0xff. */
922 static const unsigned char new_idx[] = {
923 0, 5, 5, 6, 10, 15, 28, 39,
924 48, 48, 71, 94, 113, 128, 139, 154,
925 175, 186, 188, 188, 188, 188, 188, 188,
926 203, 208, 208, 208, 208, 208, 208, 208,
927 208, 219, 219, 219, 222, 222, 222, 222,
928 222, 222, 222, 222, 222, 222, 222, 224,
929 224, 231, 231, 231, 231, 231, 231, 231,
930 231, 231, 231, 231, 231, 231, 231, 231,
931 231, 231, 231, 231, 231, 231, 231, 231,
932 231, 231, 231, 231, 231, 231, 231, 231,
933 231, 231, 231, 231, 231, 231, 231, 231,
934 231, 231, 231, 231, 231, 231, 231, 231,
935 231, 231, 231, 231, 231, 231, 231, 231,
936 231, 231, 231, 231, 231, 231, 231, 231,
937 231, 231, 231, 231, 231, 231, 231, 231,
938 231, 231, 231, 231, 231, 231, 231, 231,
939 231, 231, 231, 231, 231, 231, 231, 231,
940 231, 231, 231, 231, 231, 231, 231, 231,
941 231, 231, 231, 231, 231, 231, 231, 231,
942 231, 231, 231, 231, 231, 231, 231, 231,
943 231, 231, 231, 231, 231, 233, 233, 233,
944 233, 233, 233, 233, 234, 234, 234, 234,
945 234, 234, 234, 234, 234, 234, 234, 234,
946 234, 234, 234, 234, 234, 234, 234, 234,
947 234, 234, 234, 234, 234, 234, 234, 234,
948 234, 234, 234, 234, 234, 234, 234, 234,
949 234, 234, 234, 234, 234, 234, 234, 234,
950 236, 236, 236, 236, 236, 236, 236, 236,
951 236, 236, 236, 236, 236, 236, 236, 236,
952 236, 236, 236, 236, 236, 236, 236, 236,
953 236, 236, 236, 236, 236, 236, 236, 236,
954 236, 237, 237, 238, 241, 241, 242, 249,
955 255,
956 };
957
/* Low-byte boundary values searched by wcswidth().  Within the bounds taken
 * from new_idx, wcswidth() binary-searches this table with b = (wc & 0xff),
 * moving its lower bound to an entry whenever b >= that entry.  The final
 * lower bound is then used as the index into new_wtbl. */
958 static const unsigned char new_tbl[] = {
959 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
960 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
961 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
962 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
963 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
964 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
965 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
966 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
967 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
968 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
969 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
970 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
971 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
972 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
973 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
974 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
975 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
976 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
977 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
978 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
979 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
980 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
981 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
982 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
983 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
984 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
985 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
986 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
987 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
988 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
989 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
990 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
991 };
992
/* Width values used by wcswidth().  The index that its binary search over
 * new_tbl ends on selects the entry added to the running column count.
 * The element type is signed because -1 occurs among the first entries;
 * the comment at the point of use in wcswidth() notes that no selected
 * entry should be -1. */
993 static const signed char new_wtbl[] = {
994 0, -1, 1, -1, 1, 1, 0, 1,
995 0, 1, 1, 0, 1, 0, 1, 1,
996 0, 1, 0, 1, 0, 1, 0, 1,
997 0, 1, 0, 1, 1, 0, 1, 0,
998 1, 0, 1, 0, 1, 0, 1, 1,
999 0, 1, 0, 1, 0, 1, 0, 1,
1000 1, 0, 1, 0, 1, 0, 1, 0,
1001 1, 0, 1, 0, 1, 0, 1, 0,
1002 1, 0, 1, 0, 1, 0, 1, 1,
1003 0, 1, 0, 1, 0, 1, 0, 1,
1004 0, 1, 0, 1, 0, 1, 0, 1,
1005 0, 1, 0, 1, 0, 1, 1, 0,
1006 1, 0, 1, 0, 1, 0, 1, 0,
1007 1, 0, 1, 0, 1, 0, 1, 0,
1008 1, 1, 0, 1, 0, 1, 0, 1,
1009 0, 1, 0, 1, 0, 1, 0, 1,
1010 1, 0, 1, 0, 1, 0, 1, 0,
1011 1, 0, 1, 1, 0, 1, 0, 1,
1012 0, 1, 0, 1, 0, 1, 0, 1,
1013 0, 1, 1, 0, 1, 0, 1, 0,
1014 1, 0, 1, 0, 1, 0, 1, 0,
1015 1, 0, 1, 0, 1, 0, 1, 1,
1016 0, 1, 0, 1, 0, 1, 0, 1,
1017 0, 1, 2, 0, 1, 0, 1, 0,
1018 1, 0, 1, 0, 1, 0, 1, 0,
1019 1, 0, 1, 1, 0, 1, 0, 1,
1020 1, 0, 1, 0, 1, 0, 1, 0,
1021 1, 0, 1, 1, 2, 1, 1, 2,
1022 2, 0, 2, 1, 2, 0, 2, 2,
1023 1, 1, 2, 1, 1, 2, 1, 0,
1024 1, 1, 0, 1, 0, 1, 2, 1,
1025 0, 2, 1, 2, 1, 0, 1,
1026 };
1027
1028
wcswidth(const wchar_t * pwcs,size_t n)1029 int wcswidth(const wchar_t *pwcs, size_t n)
1030 {
1031 int h, l, m, count;
1032 wchar_t wc;
1033 unsigned char b;
1034
1035 if (ENCODING == __ctype_encoding_7_bit) {
1036 size_t i;
1037
1038 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1039 if (pwcs[i] != (pwcs[i] & 0x7f)) {
1040 return -1;
1041 }
1042 }
1043 }
1044 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1045 else if (ENCODING == __ctype_encoding_8_bit) {
1046 mbstate_t mbstate;
1047
1048 mbstate.__mask = 0; /* Initialize the mbstate. */
1049 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1050 return -1;
1051 }
1052 }
1053 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1054 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1055 /* For stricter handling of allowed unicode values... see comments above. */
1056 else if (ENCODING == __ctype_encoding_utf8) {
1057 size_t i;
1058
1059 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1060 if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1061 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1062 ) {
1063 return -1;
1064 }
1065 }
1066 }
1067 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1068
1069 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1070 if (wc <= 0xff) {
1071 /* If we're here, wc != 0. */
1072 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1073 return -1;
1074 }
1075 ++count;
1076 continue;
1077 }
1078 if (((unsigned int) wc) <= 0xffff) {
1079 b = wc & 0xff;
1080 h = (wc >> 8);
1081 l = new_idx[h];
1082 h = new_idx[h+1];
1083 while ((m = (l+h) >> 1) != l) {
1084 if (b >= new_tbl[m]) {
1085 l = m;
1086 } else { /* wc < tbl[m] */
1087 h = m;
1088 }
1089 }
1090 count += new_wtbl[l]; /* none should be -1. */
1091 continue;
1092 }
1093
1094 /* Redo this to minimize average number of compares?*/
1095 if (wc >= 0x1d167) {
1096 if (wc <= 0x1d1ad) {
1097 if ((wc <= 0x1d169
1098 || (wc >= 0x1d173
1099 && (wc <= 0x1d182
1100 || (wc >= 0x1d185
1101 && (wc <= 0x1d18b
1102 || (wc >= 0x1d1aa))))))
1103 ) {
1104 continue;
1105 }
1106 } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1107 continue;
1108 } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1109 ++count; /* need 2.. add one here */
1110 }
1111 #if (WCHAR_MAX > 0x7fffffffL)
1112 else if (wc > 0x7fffffffL) {
1113 return -1;
1114 }
1115 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1116 }
1117
1118 ++count;
1119 }
1120
1121 return count;
1122 }
1123
1124 #else /* __UCLIBC_HAS_LOCALE__ */
1125
/*
 * Locale-less build: only printable 7-bit characters have a width.
 *
 * Examines at most n wide characters of pwcs, stopping at a nul.
 * Returns the number of characters examined (one column each), or -1
 * if any of them is outside 0x20..0x7e.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	int columns = 0;
	size_t idx;

	for (idx = 0 ; idx < n ; idx++) {
		const wchar_t ch = pwcs[idx];

		if (ch == 0) {
			break;
		}
		/* Below 32 is a control character (or negative); 0x7f is DEL and
		 * anything larger is not 7-bit. */
		if ((ch < 32) || (ch >= 0x7f)) {
			return -1;
		}
		++columns;
	}

	return columns;
}
1153
1154 #endif /* __UCLIBC_HAS_LOCALE__ */
1155
libc_hidden_def(wcswidth)1156 libc_hidden_def(wcswidth)
1157
1158 #endif
1159 /**********************************************************************/
1160 #ifdef L_wcwidth
1161
1162
/*
 * Column width of a single wide character.
 *
 * Implemented by treating wc as a one-character string and delegating to
 * wcswidth(), so the result (including -1 for rejected characters and 0
 * for the nul character, at which wcswidth() stops counting) is whatever
 * wcswidth() reports.
 */
int wcwidth(wchar_t wc)
{
	return wcswidth(&wc, 1);
}
1167
1168 #endif
1169 /**********************************************************************/
1170
1171
/*
 * State behind an iconv_t conversion descriptor.
 *
 * Allocated and initialised by iconv_open(), released by iconv_close().
 * The members ending in 0 hold the values set at open time; iconv()
 * copies them back when it is asked to reset the conversion state.
 */
typedef struct {
	mbstate_t tostate;		/* Output-side shift state (only zeroed by the code here). */
	mbstate_t fromstate;	/* Input-side shift state, used when decoding UTF-8 input. */
	int tocodeset;			/* Target codeset id as returned by find_codeset(). */
	int fromcodeset;		/* Source codeset id; bit 0 is toggled when a byte-swapped BOM is read. */
	int frombom;			/* Nonzero while a leading BOM may still be consumed from the input. */
	int tobom;				/* Nonzero while a BOM still has to be written to the output. */
	int fromcodeset0;		/* fromcodeset as set by iconv_open(). */
	int frombom0;			/* frombom as set by iconv_open(). */
	int tobom0;				/* tobom as set by iconv_open(). */
	int skip_invalid_input;	/* To support iconv -c option: when nonzero, illegal
							 * input is skipped and this is set to 2 as a flag
							 * for the iconv utility. */
} _UC_iconv_t;
1184
/* For the multibyte (fixed-width, value >= 0xe0) codeset ids:
 *   bit 0 means swap endian
 *   bit 1 means 2 byte
 *   bit 2 means 4 byte
 *   bit 4 means a byte order mark (BOM) is used
 */
1191
#if defined L_iconv && defined _LIBC
/* Used externally only by iconv utility */
extern const unsigned char __iconv_codesets[];
libc_hidden_proto(__iconv_codesets)
#endif

/*
 * Table of the built-in codeset names, stored as one string of
 * back-to-back records:
 *
 *   byte 0   total length of the record (find_codeset() steps by it)
 *   byte 1   codeset id returned for this name
 *   byte 2.. the codeset name, nul-terminated
 *
 * The last record has no explicit nul: the string literal's own
 * terminator ends the name, and the scan stops at the zero length byte.
 *
 * NOTE(review): the big-endian branch gives "UTF-32"/"UTF-16" the ids
 * 0xe4/0xea while the little-endian branch uses 0xf4/0xfa (bit 4 set,
 * which iconv_open() reads as the BOM flag) -- confirm whether the
 * missing BOM flag on big-endian is intentional.
 */
#if defined L_iconv || defined L_iconv_main
# ifdef L_iconv_main
static
# endif
const unsigned char __iconv_codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
#endif
#if defined L_iconv && defined _LIBC
libc_hidden_data_def(__iconv_codesets)
#endif
1238
1239
1240 #ifdef L_iconv
1241
1242 #include <iconv.h>
1243 #include <string.h>
1244 #include <endian.h>
1245 #include <byteswap.h>
1246
1247 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1248 #error unsupported endianness for iconv
1249 #endif
1250
1251 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1252 #error currently iconv requires 8 bit locales
1253 #endif
1254 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1255 #error currently iconv requires UTF-8 locales
1256 #endif
1257
1258
/*
 * Codeset ids / bit patterns tested by iconv().
 *
 * Ids >= IC_MULTIBYTE are the fixed-width 2- or 4-byte encodings listed
 * in __iconv_codesets; IC_UTF_8 and IC_ASCII match the ids stored there
 * for "UTF-8" and "ASCII"/"US-ASCII".  The UCS/UTF constants are chosen
 * per host byte order and are used as masks, i.e. tested as
 * (codeset & IC_x) == IC_x.
 */
enum {
	IC_WCHAR_T = 0xe0,		/* Host wchar_t: always 4 bytes per character. */
	IC_MULTIBYTE = 0xe0,	/* Lowest id of the fixed-width encodings. */
#if __BYTE_ORDER == __BIG_ENDIAN
	IC_UCS_4 = 0xec,
	IC_UTF_32 = 0xe4,
	IC_UCS_2 = 0xe2,
	IC_UTF_16 = 0xea,
#else
	IC_UCS_4 = 0xed,
	IC_UTF_32 = 0xe5,
	IC_UCS_2 = 0xe3,
	IC_UTF_16 = 0xeb,
#endif
	IC_UTF_8 = 2,
	IC_ASCII = 1
};
1276
1277
find_codeset(const char * name)1278 static int find_codeset(const char *name)
1279 {
1280 const unsigned char *s;
1281 int codeset;
1282
1283 for (s = __iconv_codesets; *s; s += *s) {
1284 if (!strcasecmp((char*) (s + 2), name)) {
1285 return s[1];
1286 }
1287 }
1288
1289 /* The following is ripped from find_locale in locale.c. */
1290
1291 /* TODO: maybe CODESET_LIST + *s ??? */
1292 /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1293 codeset = 2;
1294 s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1295 do {
1296 ++codeset; /* Increment codeset first. */
1297 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1298 return codeset;
1299 }
1300 } while (*++s);
1301
1302 return 0; /* No matching codeset! */
1303 }
1304
iconv_open(const char * tocode,const char * fromcode)1305 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1306 {
1307 register _UC_iconv_t *px;
1308 int tocodeset, fromcodeset;
1309
1310 if (((tocodeset = find_codeset(tocode)) != 0)
1311 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1312 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1313 px->tocodeset = tocodeset;
1314 px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0;
1315 px->fromcodeset0 = px->fromcodeset = fromcodeset;
1316 px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0;
1317 px->skip_invalid_input = px->tostate.__mask
1318 = px->fromstate.__mask = 0;
1319 return (iconv_t) px;
1320 }
1321 } else {
1322 __set_errno(EINVAL);
1323 }
1324 return (iconv_t)(-1);
1325 }
1326
iconv_close(iconv_t cd)1327 int weak_function iconv_close(iconv_t cd)
1328 {
1329 free(cd);
1330
1331 return 0;
1332 }
1333
iconv(iconv_t cd,char ** __restrict inbuf,size_t * __restrict inbytesleft,char ** __restrict outbuf,size_t * __restrict outbytesleft)1334 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1335 size_t *__restrict inbytesleft,
1336 char **__restrict outbuf,
1337 size_t *__restrict outbytesleft)
1338 {
1339 _UC_iconv_t *px = (_UC_iconv_t *) cd;
1340 size_t nrcount, r;
1341 wchar_t wc, wc2;
1342 int inci, inco;
1343
1344 assert(px != (_UC_iconv_t *)(-1));
1345 assert(sizeof(wchar_t) == 4);
1346
1347 if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
1348 /* Note: For shift-state encodings we possibly need to output the
1349 * shift sequence to return to initial state! */
1350 if ((px->fromcodeset & 0xf0) == 0xe0) {
1351 }
1352 px->tostate.__mask = px->fromstate.__mask = 0;
1353 px->fromcodeset = px->fromcodeset0;
1354 px->tobom = px->tobom0;
1355 px->frombom = px->frombom0;
1356 return 0;
1357 }
1358
1359 nrcount = 0;
1360 while (*inbytesleft) {
1361 if (!*outbytesleft) {
1362 TOO_BIG:
1363 __set_errno(E2BIG);
1364 return (size_t) -1;
1365 }
1366
1367 inci = inco = 1;
1368 if (px->fromcodeset >= IC_MULTIBYTE) {
1369 inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1370 if (*inbytesleft < inci) goto INVALID;
1371 wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1372 + ((unsigned char)((*inbuf)[1]));
1373 if (inci == 4) {
1374 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1375 + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1376 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1377 } else {
1378 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1379 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1380 && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1381 ) { /* surrogate */
1382 wc =- 0xd800U;
1383 if (*inbytesleft < 4) goto INVALID;
1384 wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1385 + ((unsigned char)((*inbuf)[3]));
1386 if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1387 if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1388 goto ILLEGAL;
1389 }
1390 inci = 4; /* Change inci here in case skipping illegals. */
1391 wc = 0x10000UL + (wc << 10) + wc2;
1392 }
1393 }
1394
1395 if (px->frombom) {
1396 px->frombom = 0;
1397 if ((wc == 0xfeffU)
1398 || (wc == ((inci == 4)
1399 ? (((wchar_t) 0xfffe0000UL))
1400 : ((wchar_t)(0xfffeUL))))
1401 ) {
1402 if (wc != 0xfeffU) {
1403 px->fromcodeset ^= 1; /* toggle endianness */
1404 wc = 0xfeffU;
1405 }
1406 if (!px->frombom) {
1407 goto BOM_SKIP_OUTPUT;
1408 }
1409 goto GOT_BOM;
1410 }
1411 }
1412
1413 if (px->fromcodeset != IC_WCHAR_T) {
1414 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1415 ? 0x7fffffffUL : 0x10ffffUL)
1416 #ifdef KUHN
1417 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1418 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1419 #endif
1420 ) {
1421 goto ILLEGAL;
1422 }
1423 }
1424 } else if (px->fromcodeset == IC_UTF_8) {
1425 const char *p = *inbuf;
1426 r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1427 if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1428 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1429 assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1430 if (r == (size_t)(-2)) {
1431 INVALID:
1432 __set_errno(EINVAL);
1433 } else {
1434 px->fromstate.__mask = 0;
1435 inci = 1;
1436 ILLEGAL:
1437 if (px->skip_invalid_input) {
1438 px->skip_invalid_input = 2; /* flag for iconv utility */
1439 goto BOM_SKIP_OUTPUT;
1440 }
1441 __set_errno(EILSEQ);
1442 }
1443 return (size_t)(-1);
1444 }
1445 #ifdef __UCLIBC_MJN3_ONLY__
1446 #warning TODO: optimize this.
1447 #endif
1448 if (p != NULL) { /* incomplete char case */
1449 goto INVALID;
1450 }
1451 p = *inbuf + 1; /* nul */
1452 }
1453 inci = p - *inbuf;
1454 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1455 if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1456 goto ILLEGAL;
1457 } else { /* some other 8-bit ascii-extension codeset */
1458 const __codeset_8_bit_t *c8b
1459 = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1460 wc -= 0x80;
1461 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1462 (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1463 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1464 if (!wc) {
1465 goto ILLEGAL;
1466 }
1467 }
1468 }
1469
1470
1471 if (px->tobom) {
1472 inci = 0;
1473 wc = 0xfeffU;
1474 GOT_BOM:
1475 px->tobom = 0;
1476 }
1477
1478 if (px->tocodeset >= IC_MULTIBYTE) {
1479 inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1480 if (*outbytesleft < inco) goto TOO_BIG;
1481 if (px->tocodeset != IC_WCHAR_T) {
1482 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1483 ? 0x7fffffffUL : 0x10ffffUL)
1484 #ifdef KUHN
1485 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1486 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1487 #endif
1488 ) {
1489 REPLACE_32:
1490 wc = 0xfffd;
1491 ++nrcount;
1492 }
1493 }
1494 if (inco == 4) {
1495 if (px->tocodeset & 1) wc = bswap_32(wc);
1496 } else {
1497 if (((__uwchar_t)wc ) > 0xffffU) {
1498 if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1499 goto REPLACE_32;
1500 }
1501 if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1502 wc2 = 0xdc00U + (wc & 0x3ff);
1503 wc = 0xd800U + ((wc >> 10) & 0x3ff);
1504 if (px->tocodeset & 1) {
1505 wc = bswap_16(wc);
1506 wc2 = bswap_16(wc2);
1507 }
1508 wc += (wc2 << 16);
1509 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1510 }
1511 (*outbuf)[0] = (char)((unsigned char)(wc));
1512 (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1513 if (inco == 4) {
1514 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1515 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1516 }
1517 } else if (px->tocodeset == IC_UTF_8) {
1518 const wchar_t *pw = &wc;
1519 do {
1520 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1521 if (r != (size_t)(-1)) {
1522 #ifdef __UCLIBC_MJN3_ONLY__
1523 #warning TODO: What happens for a nul?
1524 #endif
1525 if (r == 0) {
1526 if (wc != 0) {
1527 goto TOO_BIG;
1528 }
1529 ++r;
1530 }
1531 break;
1532 }
1533 wc = 0xfffdU;
1534 ++nrcount;
1535 } while (1);
1536 inco = r;
1537 } else if (((__uwchar_t)(wc)) < 0x80) {
1538 CHAR_GOOD:
1539 **outbuf = wc;
1540 } else {
1541 if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1542 const __codeset_8_bit_t *c8b
1543 = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1544 __uwchar_t u;
1545 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1546 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1547 + ((wc >> Cwc2c_TT_SHIFT)
1548 & ((1 << Cwc2c_TI_SHIFT)-1))];
1549 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1550 + (u << Cwc2c_TT_SHIFT)
1551 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1552 if (wc) {
1553 goto CHAR_GOOD;
1554 }
1555 }
1556 **outbuf = '?';
1557 ++nrcount;
1558 }
1559
1560 *outbuf += inco;
1561 *outbytesleft -= inco;
1562 BOM_SKIP_OUTPUT:
1563 *inbuf += inci;
1564 *inbytesleft -= inci;
1565 }
1566 return nrcount;
1567 }
1568 #endif
1569