1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  *  charset conversion utils
4  *
5  *  Copyright (c) 2017 Rob Clark
6  */
7 
8 #include <common.h>
9 #include <charset.h>
10 #include <capitalization.h>
11 #include <cp437.h>
12 #include <efi_loader.h>
13 #include <errno.h>
14 #include <malloc.h>
15 
/**
 * codepage_437 - Unicode to codepage 437 translation table
 *
 * The table has 128 entries and is initialized from the CP437 macro.
 * utf_to_cp() searches a table of this layout for a Unicode code point and
 * translates a match at index j to the code page character j + 0x80.
 */
const u16 codepage_437[128] = CP437;
20 
/*
 * Pairs of upper and lower case code points searched by utf_to_lower() and
 * utf_to_upper() for code points above 0x7f. Which initializer is used is
 * decided at build time by the configuration symbols tested below. The
 * lookups stop at the first entry whose upper (respectively lower) member
 * is zero.
 */
static struct capitalization_table capitalization_table[] =
#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
	UNICODE_CAPITALIZATION_TABLE;
#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
	CP1250_CAPITALIZATION_TABLE;
#else
	CP437_CAPITALIZATION_TABLE;
#endif
29 
30 /**
31  * get_code() - read Unicode code point from UTF-8 stream
32  *
33  * @read_u8:	- stream reader
34  * @src:	- string buffer passed to stream reader, optional
35  * Return:	- Unicode code point, or -1
36  */
get_code(u8 (* read_u8)(void * data),void * data)37 static int get_code(u8 (*read_u8)(void *data), void *data)
38 {
39 	s32 ch = 0;
40 
41 	ch = read_u8(data);
42 	if (!ch)
43 		return 0;
44 	if (ch >= 0xc2 && ch <= 0xf4) {
45 		int code = 0;
46 
47 		if (ch >= 0xe0) {
48 			if (ch >= 0xf0) {
49 				/* 0xf0 - 0xf4 */
50 				ch &= 0x07;
51 				code = ch << 18;
52 				ch = read_u8(data);
53 				if (ch < 0x80 || ch > 0xbf)
54 					goto error;
55 				ch &= 0x3f;
56 			} else {
57 				/* 0xe0 - 0xef */
58 				ch &= 0x0f;
59 			}
60 			code += ch << 12;
61 			if ((code >= 0xD800 && code <= 0xDFFF) ||
62 			    code >= 0x110000)
63 				goto error;
64 			ch = read_u8(data);
65 			if (ch < 0x80 || ch > 0xbf)
66 				goto error;
67 		}
68 		/* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
69 		ch &= 0x3f;
70 		code += ch << 6;
71 		ch = read_u8(data);
72 		if (ch < 0x80 || ch > 0xbf)
73 			goto error;
74 		ch &= 0x3f;
75 		ch += code;
76 	} else if (ch >= 0x80) {
77 		goto error;
78 	}
79 	return ch;
80 error:
81 	return -1;
82 }
83 
84 /**
85  * read_string() - read byte from character string
86  *
87  * @data:	- pointer to string
88  * Return:	- byte read
89  *
90  * The string pointer is incremented if it does not point to '\0'.
91  */
read_string(void * data)92 static u8 read_string(void *data)
93 
94 {
95 	const char **src = (const char **)data;
96 	u8 c;
97 
98 	if (!src || !*src || !**src)
99 		return 0;
100 	c = **src;
101 	++*src;
102 	return c;
103 }
104 
105 /**
106  * read_console() - read byte from console
107  *
108  * @data	- not used, needed to match interface
109  * Return:	- byte read or 0 on error
110  */
read_console(void * data)111 static u8 read_console(void *data)
112 {
113 	int ch;
114 
115 	ch = getchar();
116 	if (ch < 0)
117 		ch = 0;
118 	return ch;
119 }
120 
/**
 * console_read_unicode() - read a Unicode code point from the console
 *
 * Input is decoded as UTF-8 for as long as tstc() reports pending input.
 * NUL bytes and invalid sequences are skipped.
 *
 * @code:	- receives the code point, untouched if nothing was read
 * Return:	- 0 if a code point was stored, 1 if no input is available
 */
int console_read_unicode(s32 *code)
{
	while (tstc()) {
		/* Read Unicode code */
		s32 decoded = get_code(read_console, NULL);

		if (decoded <= 0)
			continue;
		*code = decoded;
		return 0;
	}
	/* No input available */
	return 1;
}
139 
utf8_get(const char ** src)140 s32 utf8_get(const char **src)
141 {
142 	return get_code(read_string, src);
143 }
144 
/**
 * utf8_put() - write a code point to a UTF-8 buffer
 *
 * Depending on its value the code point is stored as one to four bytes and
 * the buffer pointer is moved behind the bytes written. No terminating NUL
 * is appended.
 *
 * @code:	- Unicode code point to encode
 * @dst:	- address of the output pointer
 * Return:	- 0 on success, -1 if @dst or *@dst is NULL, or if @code is
 *		  a surrogate or not below 0x110000
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	if (code >= 0x110000 || (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		/* one byte */
		*out++ = code;
	} else if (code <= 0x07FF) {
		/* two bytes */
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		/* three bytes */
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		/* four bytes */
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
173 
utf8_utf16_strnlen(const char * src,size_t count)174 size_t utf8_utf16_strnlen(const char *src, size_t count)
175 {
176 	size_t len = 0;
177 
178 	for (; *src && count; --count)  {
179 		s32 code = utf8_get(&src);
180 
181 		if (!code)
182 			break;
183 		if (code < 0) {
184 			/* Reserve space for a replacement character */
185 			len += 1;
186 		} else if (code < 0x10000) {
187 			len += 1;
188 		} else {
189 			len += 2;
190 		}
191 	}
192 	return len;
193 }
194 
/**
 * utf8_utf16_strncpy() - convert a UTF-8 string to UTF-16
 *
 * Invalid UTF-8 sequences are replaced by '?'. The output is terminated by
 * a zero u16 and *@dst is left pointing at that terminator.
 *
 * @dst:	- address of the output pointer
 * @src:	- UTF-8 string
 * @count:	- maximum number of code points to convert
 * Return:	- 0 on success, -1 if any of the pointers is NULL
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!dst || !*dst || !src)
		return -1;

	while (count && *src) {
		s32 code = utf8_get(&src);

		utf16_put(code < 0 ? '?' : code, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
210 
utf16_get(const u16 ** src)211 s32 utf16_get(const u16 **src)
212 {
213 	s32 code, code2;
214 
215 	if (!src || !*src)
216 		return -1;
217 	if (!**src)
218 		return 0;
219 	code = **src;
220 	++*src;
221 	if (code >= 0xDC00 && code <= 0xDFFF)
222 		return -1;
223 	if (code >= 0xD800 && code <= 0xDBFF) {
224 		if (!**src)
225 			return -1;
226 		code &= 0x3ff;
227 		code <<= 10;
228 		code += 0x10000;
229 		code2 = **src;
230 		++*src;
231 		if (code2 <= 0xDC00 || code2 >= 0xDFFF)
232 			return -1;
233 		code2 &= 0x3ff;
234 		code += code2;
235 	}
236 	return code;
237 }
238 
/**
 * utf16_put() - write a code point to a UTF-16 buffer
 *
 * Code points below 0x10000 take one u16, all others are stored as a
 * surrogate pair. The buffer pointer is moved behind the units written and
 * no terminating zero is appended.
 *
 * @code:	- Unicode code point to encode
 * @dst:	- address of the output pointer
 * Return:	- 0 on success, -1 if @dst or *@dst is NULL, or if @code is
 *		  a surrogate or not below 0x110000
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	if (code >= 0x110000 || (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code >= 0x10000) {
		s32 offset = code - 0x10000;

		/* high surrogate followed by low surrogate */
		*out++ = offset >> 10 | 0xD800;
		*out++ = (offset & 0x3ff) | 0xDC00;
	} else {
		*out++ = code;
	}
	*dst = out;
	return 0;
}
256 
/**
 * utf16_strnlen() - count the code points of a UTF-16 string
 *
 * @src:	- UTF-16 string
 * @count:	- maximum number of code points to count
 * Return:	- number of code points; an illegal sequence is counted as
 *		  one so that room for a replacement character is reserved
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t chars = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		++chars;
		--count;
	}
	return chars;
}
274 
utf16_utf8_strnlen(const u16 * src,size_t count)275 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
276 {
277 	size_t len = 0;
278 
279 	for (; *src && count; --count)  {
280 		s32 code = utf16_get(&src);
281 
282 		if (!code)
283 			break;
284 		if (code < 0)
285 			/* Reserve space for a replacement character */
286 			len += 1;
287 		else if (code < 0x80)
288 			len += 1;
289 		else if (code < 0x800)
290 			len += 2;
291 		else if (code < 0x10000)
292 			len += 3;
293 		else
294 			len += 4;
295 	}
296 	return len;
297 }
298 
utf16_utf8_strncpy(char ** dst,const u16 * src,size_t count)299 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
300 {
301 	if (!src || !dst || !*dst)
302 		return -1;
303 
304 	for (; count && *src; --count) {
305 		s32 code = utf16_get(&src);
306 
307 		if (code < 0)
308 			code = '?';
309 		utf8_put(code, dst);
310 	}
311 	**dst = 0;
312 	return 0;
313 }
314 
utf_to_lower(const s32 code)315 s32 utf_to_lower(const s32 code)
316 {
317 	struct capitalization_table *pos = capitalization_table;
318 	s32 ret = code;
319 
320 	if (code <= 0x7f) {
321 		if (code >= 'A' && code <= 'Z')
322 			ret += 0x20;
323 		return ret;
324 	}
325 	for (; pos->upper; ++pos) {
326 		if (pos->upper == code) {
327 			ret = pos->lower;
328 			break;
329 		}
330 	}
331 	return ret;
332 }
333 
utf_to_upper(const s32 code)334 s32 utf_to_upper(const s32 code)
335 {
336 	struct capitalization_table *pos = capitalization_table;
337 	s32 ret = code;
338 
339 	if (code <= 0x7f) {
340 		if (code >= 'a' && code <= 'z')
341 			ret -= 0x20;
342 		return ret;
343 	}
344 	for (; pos->lower; ++pos) {
345 		if (pos->lower == code) {
346 			ret = pos->upper;
347 			break;
348 		}
349 	}
350 	return ret;
351 }
352 
353 /*
354  * u16_strcasecmp() - compare two u16 strings case insensitively
355  *
356  * @s1:		first string to compare
357  * @s2:		second string to compare
358  * @n:		maximum number of u16 to compare
359  * Return:	0  if the first n u16 are the same in s1 and s2
360  *		< 0 if the first different u16 in s1 is less than the
361  *		corresponding u16 in s2
362  *		> 0 if the first different u16 in s1 is greater than the
363  */
u16_strcasecmp(const u16 * s1,const u16 * s2)364 int u16_strcasecmp(const u16 *s1, const u16 *s2)
365 {
366 	int ret = 0;
367 	s32 c1, c2;
368 
369 	for (;;) {
370 		c1 = utf_to_upper(utf16_get(&s1));
371 		c2 = utf_to_upper(utf16_get(&s2));
372 		ret = c1 - c2;
373 		if (ret || !c1 || c1 == -1 || c2 == -1)
374 			break;
375 	}
376 	return ret;
377 }
378 
379 /*
380  * u16_strncmp() - compare two u16 string
381  *
382  * @s1:		first string to compare
383  * @s2:		second string to compare
384  * @n:		maximum number of u16 to compare
385  * Return:	0  if the first n u16 are the same in s1 and s2
386  *		< 0 if the first different u16 in s1 is less than the
387  *		corresponding u16 in s2
388  *		> 0 if the first different u16 in s1 is greater than the
389  *		corresponding u16 in s2
390  */
u16_strncmp(const u16 * s1,const u16 * s2,size_t n)391 int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
392 {
393 	int ret = 0;
394 
395 	for (; n; --n, ++s1, ++s2) {
396 		ret = *s1 - *s2;
397 		if (ret || !*s1)
398 			break;
399 	}
400 
401 	return ret;
402 }
403 
u16_strnlen(const u16 * in,size_t count)404 size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
405 {
406 	size_t i;
407 	for (i = 0; count-- && in[i]; i++);
408 	return i;
409 }
410 
u16_strsize(const void * in)411 size_t u16_strsize(const void *in)
412 {
413 	return (u16_strlen(in) + 1) * sizeof(u16);
414 }
415 
/*
 * u16_strcpy() - copy a u16 string
 *
 * @dest:	destination buffer
 * @src:	zero terminated source string
 * Return:	@dest
 */
u16 *u16_strcpy(u16 *dest, const u16 *src)
{
	size_t i = 0;

	/* Copy every unit up to and including the terminating zero */
	do {
		dest[i] = src[i];
	} while (src[i++]);

	return dest;
}
428 
u16_strdup(const void * src)429 u16 *u16_strdup(const void *src)
430 {
431 	u16 *new;
432 	size_t len;
433 
434 	if (!src)
435 		return NULL;
436 	len = u16_strsize(src);
437 	new = malloc(len);
438 	if (!new)
439 		return NULL;
440 	memcpy(new, src, len);
441 
442 	return new;
443 }
444 
/*
 * u16_strlcat() - append a u16 string to another one
 *
 * As much of @src as fits is appended to @dest and the result is zero
 * terminated. If @dest already holds count or more u16, nothing is written.
 *
 * @dest:	zero terminated destination string
 * @src:	zero terminated string to append
 * @count:	size of the destination buffer in u16
 * Return:	number of u16 needed for the complete result including the
 *		terminating zero
 */
size_t u16_strlcat(u16 *dest, const u16 *src, size_t count)
{
	size_t destlen = u16_strlen(dest);
	size_t srclen = u16_strlen(src);
	size_t needed = destlen + srclen + 1;
	size_t ncopy = srclen;

	if (destlen >= count)
		return needed;
	/* Truncate so that the terminator ends up in the last slot */
	if (needed > count)
		ncopy = count - destlen - 1;
	memcpy(dest + destlen, src, ncopy * 2);
	dest[destlen + ncopy] = 0x0000;

	return needed;
}
460 
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Exactly @size units are read from @src; a zero unit does not stop the
 * conversion. A surrogate that is not part of a valid pair (a low surrogate
 * on its own, a high surrogate followed by something else, or a high
 * surrogate at the very end of the input) is replaced by '?'. The output is
 * not NUL terminated. At most three bytes are written per input unit.
 *
 * @dest:	output buffer
 * @src:	UTF-16 input
 * @size:	number of 16 bit units to convert
 * Return:	pointer to the byte following the last byte written
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair.  */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Error...  */
				*dest++ = '?';
				/*
				 * *src may be valid. Don't eat it: step back
				 * and give the unit back to the budget, else
				 * the last input unit would be dropped.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Remember the high surrogate, wait for its partner */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Error... */
			*dest++ = '?';
		} else {
			/* A 16 bit unit is always below 0x10000 */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* Input ended in the middle of a surrogate pair */
	if (code_high)
		*dest++ = '?';

	return dest;
}
513 
/**
 * utf_to_cp() - translate a Unicode code point to a code page character
 *
 * Values below 0x80 are left untouched. Any other value is searched in the
 * first 0x80 entries of @codepage; a match at index j yields j + 0x80. If
 * there is no match the value is replaced by '?'.
 *
 * @c:		- code point on entry, code page character on exit
 * @codepage:	- Unicode values of the code page characters 0x80 - 0xff
 * Return:	- 0 on success, -ENOENT if the code point is not in the table
 */
int utf_to_cp(s32 *c, const u16 *codepage)
{
	int idx;

	if (*c < 0x80)
		return 0;

	/* Look up codepage translation */
	for (idx = 0; idx < 0x80; ++idx) {
		if (codepage[idx] == *c) {
			*c = idx + 0x80;
			return 0;
		}
	}
	*c = '?';
	return -ENOENT;
}
531 
/**
 * utf8_to_cp437_stream() - convert a UTF-8 byte stream to code page 437
 *
 * The function is fed one byte per call. Bytes of a sequence that is not
 * complete yet are kept as a NUL terminated string in @buffer between calls.
 * If the collected bytes turn out to be invalid they are dropped and
 * decoding restarts with @c alone.
 *
 * @c:		- next byte of the UTF-8 stream
 * @buffer:	- NUL terminated scratch buffer, empty before the first call;
 *		  it has to offer room for the pending bytes, @c and a NUL.
 *		  NOTE(review): the required size is set by the callers, which
 *		  are not visible here
 * Return:	- code page 437 character ('?' if there is no translation),
 *		  or 0 if no character is available yet
 */
int utf8_to_cp437_stream(u8 c, char *buffer)
{
	char *end;
	const char *pos;
	s32 s;

	/*
	 * A NUL byte neither completes nor starts a sequence. The loop below
	 * could never consume it and would spin forever, so discard any
	 * pending bytes and report that no character is available.
	 */
	if (!c) {
		*buffer = 0;
		return 0;
	}

	for (;;) {
		pos = buffer;
		end = buffer + strlen(buffer);
		*end++ = c;
		*end = 0;
		s = utf8_get(&pos);
		if (s > 0) {
			*buffer = 0;
			/* On failure utf_to_cp() has set s to '?' */
			utf_to_cp(&s, codepage_437);
			return s;
		}
		/* Everything consumed: the sequence is still incomplete */
		if (pos == end)
			return 0;
		/* Invalid sequence: drop it and retry with c alone */
		*buffer = 0;
	}
}
555 
/**
 * utf8_to_utf32_stream() - convert a UTF-8 byte stream to Unicode code points
 *
 * The function is fed one byte per call. Bytes of a sequence that is not
 * complete yet are kept as a NUL terminated string in @buffer between calls.
 * If the collected bytes turn out to be invalid they are dropped and
 * decoding restarts with @c alone.
 *
 * @c:		- next byte of the UTF-8 stream
 * @buffer:	- NUL terminated scratch buffer, empty before the first call;
 *		  it has to offer room for the pending bytes, @c and a NUL.
 *		  NOTE(review): the required size is set by the callers, which
 *		  are not visible here
 * Return:	- Unicode code point, or 0 if no character is available yet
 */
int utf8_to_utf32_stream(u8 c, char *buffer)
{
	char *end;
	const char *pos;
	s32 s;

	/*
	 * A NUL byte neither completes nor starts a sequence. The loop below
	 * could never consume it and would spin forever, so discard any
	 * pending bytes and report that no character is available.
	 */
	if (!c) {
		*buffer = 0;
		return 0;
	}

	for (;;) {
		pos = buffer;
		end = buffer + strlen(buffer);
		*end++ = c;
		*end = 0;
		s = utf8_get(&pos);
		if (s > 0) {
			*buffer = 0;
			return s;
		}
		/* Everything consumed: the sequence is still incomplete */
		if (pos == end)
			return 0;
		/* Invalid sequence: drop it and retry with c alone */
		*buffer = 0;
	}
}
577