1 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
2 Copyright (C) 2000-2001 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Usage example:
21 $ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
22 */
23
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <string.h>
28 #include <time.h>
29
/* This structure represents one line in the UnicodeData.txt file, i.e. the
   attributes of a single code point.  String members hold "" when the
   corresponding field of the line was empty.  An entry whose NAME is NULL
   has not been filled in; the mapping and classification functions in this
   file treat such an entry as an unassigned character.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored: nonzero if the field starts
                                 with 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping, parsed as hexadecimal;
                                 NONE if the field was empty */
  unsigned int lower;         /* Lowercase mapping, parsed as hexadecimal;
                                 NONE if the field was empty */
  unsigned int title;         /* Titlecase mapping, parsed as hexadecimal;
                                 NONE if the field was empty */
};
48
/* Missing fields are represented with "" for strings, and NONE for
   characters.  NONE is the unsigned int with all bits set, which is
   larger than any index accepted into the table below.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, indexed by code point
   (0 .. 0x10FFFF).  */
struct unicode_attribute unicode_attributes [0x110000];
55
56 /* Stores in unicode_attributes[i] the values from the given fields. */
57 static void
fill_attribute(unsigned int i,const char * field1,const char * field2,const char * field3,const char * field4,const char * field5,const char * field6,const char * field7,const char * field8,const char * field9,const char * field10,const char * field11,const char * field12,const char * field13,const char * field14)58 fill_attribute (unsigned int i,
59 const char *field1, const char *field2,
60 const char *field3, const char *field4,
61 const char *field5, const char *field6,
62 const char *field7, const char *field8,
63 const char *field9, const char *field10,
64 const char *field11, const char *field12,
65 const char *field13, const char *field14)
66 {
67 struct unicode_attribute * uni;
68
69 if (i >= 0x110000)
70 {
71 fprintf (stderr, "index too large\n");
72 exit (1);
73 }
74 if (strcmp (field2, "Cs") == 0)
75 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
76 return;
77 uni = &unicode_attributes[i];
78 /* Copy the strings. */
79 uni->name = strdup (field1);
80 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
81 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
82 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
83 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
85 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
86 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
87 uni->mirrored = (field9[0] == 'Y');
88 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
89 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
90 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
93 }
94
/* Size of the buffers that receive one field of the UnicodeData.txt
   file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM;
   DELIM itself is not stored.  Carriage returns are dropped, because the
   original unicode.org UnicodeData.txt file happens to have CR/LF line
   terminators.
   Returns 1 when a field was read and NUL-terminated, or 0 when the end
   of the stream was reached before DELIM (BUFFER is then left
   unterminated).  Exits with "field too long" if the field does not
   fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int ch;

  while ((ch = getc (stream)) != EOF && ch != delim)
    {
      if (ch != '\r')
        {
          /* Account for the character before storing it.  */
          stored++;
          if (stored >= FIELDLEN - 1)
            {
              fprintf (stderr, "field too long\n");
              exit (1);
            }
          *out++ = ch;
        }
    }

  if (ch == EOF)
    return 0;

  *out = '\0';
  return 1;
}
129
130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
131 file. */
132 static void
fill_attributes(const char * unicodedata_filename)133 fill_attributes (const char *unicodedata_filename)
134 {
135 unsigned int i, j;
136 FILE *stream;
137 char field0[FIELDLEN];
138 char field1[FIELDLEN];
139 char field2[FIELDLEN];
140 char field3[FIELDLEN];
141 char field4[FIELDLEN];
142 char field5[FIELDLEN];
143 char field6[FIELDLEN];
144 char field7[FIELDLEN];
145 char field8[FIELDLEN];
146 char field9[FIELDLEN];
147 char field10[FIELDLEN];
148 char field11[FIELDLEN];
149 char field12[FIELDLEN];
150 char field13[FIELDLEN];
151 char field14[FIELDLEN];
152 int lineno = 0;
153
154 for (i = 0; i < 0x110000; i++)
155 unicode_attributes[i].name = NULL;
156
157 stream = fopen (unicodedata_filename, "r");
158 if (stream == NULL)
159 {
160 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
161 exit (1);
162 }
163
164 for (;;)
165 {
166 int n;
167
168 lineno++;
169 n = getfield (stream, field0, ';');
170 n += getfield (stream, field1, ';');
171 n += getfield (stream, field2, ';');
172 n += getfield (stream, field3, ';');
173 n += getfield (stream, field4, ';');
174 n += getfield (stream, field5, ';');
175 n += getfield (stream, field6, ';');
176 n += getfield (stream, field7, ';');
177 n += getfield (stream, field8, ';');
178 n += getfield (stream, field9, ';');
179 n += getfield (stream, field10, ';');
180 n += getfield (stream, field11, ';');
181 n += getfield (stream, field12, ';');
182 n += getfield (stream, field13, ';');
183 n += getfield (stream, field14, '\n');
184 if (n == 0)
185 break;
186 if (n != 15)
187 {
188 fprintf (stderr, "short line in'%s':%d\n",
189 unicodedata_filename, lineno);
190 exit (1);
191 }
192 i = strtoul (field0, NULL, 16);
193 if (field1[0] == '<'
194 && strlen (field1) >= 9
195 && !strcmp (field1 + strlen(field1) - 8, ", First>"))
196 {
197 /* Deal with a range. */
198 lineno++;
199 n = getfield (stream, field0, ';');
200 n += getfield (stream, field1, ';');
201 n += getfield (stream, field2, ';');
202 n += getfield (stream, field3, ';');
203 n += getfield (stream, field4, ';');
204 n += getfield (stream, field5, ';');
205 n += getfield (stream, field6, ';');
206 n += getfield (stream, field7, ';');
207 n += getfield (stream, field8, ';');
208 n += getfield (stream, field9, ';');
209 n += getfield (stream, field10, ';');
210 n += getfield (stream, field11, ';');
211 n += getfield (stream, field12, ';');
212 n += getfield (stream, field13, ';');
213 n += getfield (stream, field14, '\n');
214 if (n != 15)
215 {
216 fprintf (stderr, "missing end range in '%s':%d\n",
217 unicodedata_filename, lineno);
218 exit (1);
219 }
220 if (!(field1[0] == '<'
221 && strlen (field1) >= 8
222 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
223 {
224 fprintf (stderr, "missing end range in '%s':%d\n",
225 unicodedata_filename, lineno);
226 exit (1);
227 }
228 field1[strlen (field1) - 7] = '\0';
229 j = strtoul (field0, NULL, 16);
230 for (; i <= j; i++)
231 fill_attribute (i, field1+1, field2, field3, field4, field5,
232 field6, field7, field8, field9, field10,
233 field11, field12, field13, field14);
234 }
235 else
236 {
237 /* Single character line */
238 fill_attribute (i, field1, field2, field3, field4, field5,
239 field6, field7, field8, field9, field10,
240 field11, field12, field13, field14);
241 }
242 }
243 if (ferror (stream) || fclose (stream))
244 {
245 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
246 exit (1);
247 }
248 }
249
250 /* Character mappings. */
251
252 static unsigned int
to_upper(unsigned int ch)253 to_upper (unsigned int ch)
254 {
255 if (unicode_attributes[ch].name != NULL
256 && unicode_attributes[ch].upper != NONE)
257 return unicode_attributes[ch].upper;
258 else
259 return ch;
260 }
261
262 static unsigned int
to_lower(unsigned int ch)263 to_lower (unsigned int ch)
264 {
265 if (unicode_attributes[ch].name != NULL
266 && unicode_attributes[ch].lower != NONE)
267 return unicode_attributes[ch].lower;
268 else
269 return ch;
270 }
271
272 static unsigned int
to_title(unsigned int ch)273 to_title (unsigned int ch)
274 {
275 if (unicode_attributes[ch].name != NULL
276 && unicode_attributes[ch].title != NONE)
277 return unicode_attributes[ch].title;
278 else
279 return ch;
280 }
281
282 /* Character class properties. */
283
/* A character is uppercase if lowercasing it yields a different
   character.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
289
/* A character is lowercase if uppercasing it yields a different
   character.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
297
298 static bool
is_alpha(unsigned int ch)299 is_alpha (unsigned int ch)
300 {
301 return (unicode_attributes[ch].name != NULL
302 && ((unicode_attributes[ch].category[0] == 'L'
303 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
304 <U0E2F>, <U0E46> should belong to is_punct. */
305 && (ch != 0x0E2F) && (ch != 0x0E46))
306 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
307 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
308 || (ch == 0x0E31)
309 || (ch >= 0x0E34 && ch <= 0x0E3A)
310 || (ch >= 0x0E47 && ch <= 0x0E4E)
311 /* Avoid warning for <U0345>. */
312 || (ch == 0x0345)
313 /* Avoid warnings for <U2160>..<U217F>. */
314 || (unicode_attributes[ch].category[0] == 'N'
315 && unicode_attributes[ch].category[1] == 'l')
316 /* Avoid warnings for <U24B6>..<U24E9>. */
317 || (unicode_attributes[ch].category[0] == 'S'
318 && unicode_attributes[ch].category[1] == 'o'
319 && strstr (unicode_attributes[ch].name, " LETTER ")
320 != NULL)
321 /* Consider all the non-ASCII digits as alphabetic.
322 ISO C 99 forbids us to have them in category "digit",
323 but we want iswalnum to return true on them. */
324 || (unicode_attributes[ch].category[0] == 'N'
325 && unicode_attributes[ch].category[1] == 'd'
326 && !(ch >= 0x0030 && ch <= 0x0039))));
327 }
328
/* Returns true if CH is one of the ten ASCII decimal digits.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The alternative of accepting every character of general category "Nd"
   is therefore not used.  Note that with it, U+0BE7..U+0BEF and
   U+1369..U+1371 are digit systems without a zero; a <0> would have to be
   added in front of them by hand.  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
350
/* Returns true if CH is one of the ASCII digits <U0030>..<U0039>, which
   form the "outdigit" class.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
356
357 static bool
is_blank(unsigned int ch)358 is_blank (unsigned int ch)
359 {
360 return (ch == 0x0009 /* '\t' */
361 /* Category Zs without mention of "<noBreak>" */
362 || (unicode_attributes[ch].name != NULL
363 && unicode_attributes[ch].category[0] == 'Z'
364 && unicode_attributes[ch].category[1] == 's'
365 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
366 }
367
368 static bool
is_space(unsigned int ch)369 is_space (unsigned int ch)
370 {
371 /* Don't make U+00A0 a space. Non-breaking space means that all programs
372 should treat it like a punctuation character, not like a space. */
373 return (ch == 0x0020 /* ' ' */
374 || ch == 0x000C /* '\f' */
375 || ch == 0x000A /* '\n' */
376 || ch == 0x000D /* '\r' */
377 || ch == 0x0009 /* '\t' */
378 || ch == 0x000B /* '\v' */
379 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
380 || (unicode_attributes[ch].name != NULL
381 && unicode_attributes[ch].category[0] == 'Z'
382 && (unicode_attributes[ch].category[1] == 'l'
383 || unicode_attributes[ch].category[1] == 'p'
384 || (unicode_attributes[ch].category[1] == 's'
385 && !strstr (unicode_attributes[ch].decomposition,
386 "<noBreak>")))));
387 }
388
389 static bool
is_cntrl(unsigned int ch)390 is_cntrl (unsigned int ch)
391 {
392 return (unicode_attributes[ch].name != NULL
393 && (!strcmp (unicode_attributes[ch].name, "<control>")
394 /* Categories Zl and Zp */
395 || (unicode_attributes[ch].category[0] == 'Z'
396 && (unicode_attributes[ch].category[1] == 'l'
397 || unicode_attributes[ch].category[1] == 'p'))));
398 }
399
/* Returns true if CH is an ASCII hexadecimal digit.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   The ranges are therefore spelled out here rather than built on top of
   is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  if (ch >= 0x0061 && ch <= 0x0066) /* a..f */
    return true;
  return false;
}
421
422 static bool
is_graph(unsigned int ch)423 is_graph (unsigned int ch)
424 {
425 return (unicode_attributes[ch].name != NULL
426 && strcmp (unicode_attributes[ch].name, "<control>")
427 && !is_space (ch));
428 }
429
430 static bool
is_print(unsigned int ch)431 is_print (unsigned int ch)
432 {
433 return (unicode_attributes[ch].name != NULL
434 && strcmp (unicode_attributes[ch].name, "<control>")
435 /* Categories Zl and Zp */
436 && !(unicode_attributes[ch].name != NULL
437 && unicode_attributes[ch].category[0] == 'Z'
438 && (unicode_attributes[ch].category[1] == 'l'
439 || unicode_attributes[ch].category[1] == 'p')));
440 }
441
/* Returns true if CH belongs to the "punct" class.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  That definition is used here instead of
   the Unicode general category 'P'.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;

  return !(is_alpha (ch) || is_digit (ch));
}
454
455 static bool
is_combining(unsigned int ch)456 is_combining (unsigned int ch)
457 {
458 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
459 file. In 3.0.1 it was identical to the union of the general categories
460 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
461 PropList.txt file, so we take the latter definition. */
462 return (unicode_attributes[ch].name != NULL
463 && unicode_attributes[ch].category[0] == 'M'
464 && (unicode_attributes[ch].category[1] == 'n'
465 || unicode_attributes[ch].category[1] == 'c'
466 || unicode_attributes[ch].category[1] == 'e'));
467 }
468
469 static bool
is_combining_level3(unsigned int ch)470 is_combining_level3 (unsigned int ch)
471 {
472 return is_combining (ch)
473 && !(unicode_attributes[ch].combining[0] != '\0'
474 && unicode_attributes[ch].combining[0] != '0'
475 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
476 }
477
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four hexadecimal digits below 0x10000, "<Uxxxxxxxx>" with eight digits
   otherwise.  The result lives in a static buffer that the next call
   overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);

  return buf;
}
487
/* Return the UCS symbol range string for a Unicode characters interval,
   i.e. the symbols of LOW and HIGH joined by "..".  The result lives in a
   static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  size_t used;

  /* ucs_symbol reuses one buffer, so each result is copied out before
     the next call.  */
  strcpy (buf, ucs_symbol (low));
  used = strlen (buf);
  buf[used++] = '.';
  buf[used++] = '.';
  strcpy (buf + used, ucs_symbol (high));

  return buf;
}
499
500 /* Output a character class (= property) table. */
501
/* Writes to STREAM the definition of the character class CLASSNAME: the
   class name followed by every code point for which FUNC returns true.
   Consecutive members are merged into a range, entries are separated by
   ';', and a line is continued with '/' before an entry would pass
   column 75.  FUNC is called exactly once for each code point
   0 .. 0x10FFFF.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  /* Static rather than automatic: the table occupies more than 1 MiB,
     too much to place on the stack safely.  It is rebuilt on every
     call.  */
  static char table[0x110000];
  const size_t max_column = 75;
  size_t column;
  bool need_semicolon;
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    table[i] = func (i) ? 1 : 0;

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start beyond the limit so that the first entry begins a new line.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      unsigned int low, high;
      char buf[25];
      size_t len;

      if (!table[i])
        {
          i++;
          continue;
        }

      /* Find the end of the run of members starting at I.  */
      low = i;
      do
        i++;
      while (i < 0x110000 && table[i]);
      high = i - 1;

      if (low == high)
        strcpy (buf, ucs_symbol (low));
      else
        strcpy (buf, ucs_symbol_range (low, high));
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
557
558 /* Output a character mapping table. */
559
/* Writes to STREAM the definition of the character mapping MAPNAME: the
   map name followed by a "(<from>,<to>)" pair for every code point that
   FUNC maps to a different code point.  Pairs are separated by ';', and a
   line is continued with '/' before a pair would pass column 75.  FUNC is
   called exactly once for each code point 0 .. 0x10FFFF.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  const size_t max_column = 75;
  size_t column;
  bool need_semicolon;
  unsigned int i;

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond the limit so that the first pair begins a new line.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    {
      /* The mapping is computed on the fly, so no table with one entry
         per code point is needed.  */
      unsigned int mapped = func (i);
      char buf[25+1];
      size_t len;

      if (mapped == i)
        continue;

      /* ucs_symbol returns a shared static buffer, so each symbol is
         appended before the next one is requested.  */
      strcpy (buf, "(");
      strcat (buf, ucs_symbol (i));
      strcat (buf, ",");
      strcat (buf, ucs_symbol (mapped));
      strcat (buf, ")");
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
605
606 /* Output the width table. */
607
/* Placeholder for the width table: currently writes nothing to
   STREAM.  */
static void
output_widthmap (FILE *stream)
{
  /* Intentionally empty.  */
  (void) stream;
}
612
613 /* Output the tables to the given file. */
614
615 static void
output_tables(const char * filename,const char * version)616 output_tables (const char *filename, const char *version)
617 {
618 FILE *stream;
619 unsigned int ch;
620
621 stream = fopen (filename, "w");
622 if (stream == NULL)
623 {
624 fprintf (stderr, "cannot open '%s' for writing\n", filename);
625 exit (1);
626 }
627
628 fprintf (stream, "escape_char /\n");
629 fprintf (stream, "comment_char %%\n");
630 fprintf (stream, "\n");
631 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
632 version);
633 fprintf (stream, "\n");
634
635 fprintf (stream, "LC_IDENTIFICATION\n");
636 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
637 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
638 fprintf (stream, "address \"\"\n");
639 fprintf (stream, "contact \"\"\n");
640 fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
641 fprintf (stream, "tel \"\"\n");
642 fprintf (stream, "fax \"\"\n");
643 fprintf (stream, "language \"\"\n");
644 fprintf (stream, "territory \"Earth\"\n");
645 fprintf (stream, "revision \"%s\"\n", version);
646 {
647 time_t now;
648 char date[11];
649 now = time (NULL);
650 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
651 fprintf (stream, "date \"%s\"\n", date);
652 }
653 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
654 fprintf (stream, "END LC_IDENTIFICATION\n");
655 fprintf (stream, "\n");
656
657 /* Verifications. */
658 for (ch = 0; ch < 0x110000; ch++)
659 {
660 /* toupper restriction: "Only characters specified for the keywords
661 lower and upper shall be specified. */
662 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
663 fprintf (stderr,
664 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
665 ucs_symbol (ch), ch, to_upper (ch));
666
667 /* tolower restriction: "Only characters specified for the keywords
668 lower and upper shall be specified. */
669 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
670 fprintf (stderr,
671 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
672 ucs_symbol (ch), ch, to_lower (ch));
673
674 /* alpha restriction: "Characters classified as either upper or lower
675 shall automatically belong to this class. */
676 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
677 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
678
679 /* alpha restriction: "No character specified for the keywords cntrl,
680 digit, punct or space shall be specified." */
681 if (is_alpha (ch) && is_cntrl (ch))
682 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
683 if (is_alpha (ch) && is_digit (ch))
684 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
685 if (is_alpha (ch) && is_punct (ch))
686 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
687 if (is_alpha (ch) && is_space (ch))
688 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
689
690 /* space restriction: "No character specified for the keywords upper,
691 lower, alpha, digit, graph or xdigit shall be specified."
692 upper, lower, alpha already checked above. */
693 if (is_space (ch) && is_digit (ch))
694 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
695 if (is_space (ch) && is_graph (ch))
696 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
697 if (is_space (ch) && is_xdigit (ch))
698 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
699
700 /* cntrl restriction: "No character specified for the keywords upper,
701 lower, alpha, digit, punct, graph, print or xdigit shall be
702 specified." upper, lower, alpha already checked above. */
703 if (is_cntrl (ch) && is_digit (ch))
704 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
705 if (is_cntrl (ch) && is_punct (ch))
706 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
707 if (is_cntrl (ch) && is_graph (ch))
708 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
709 if (is_cntrl (ch) && is_print (ch))
710 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
711 if (is_cntrl (ch) && is_xdigit (ch))
712 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
713
714 /* punct restriction: "No character specified for the keywords upper,
715 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
716 be specified." upper, lower, alpha, cntrl already checked above. */
717 if (is_punct (ch) && is_digit (ch))
718 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
719 if (is_punct (ch) && is_xdigit (ch))
720 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
721 if (is_punct (ch) && (ch == 0x0020))
722 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
723
724 /* graph restriction: "No character specified for the keyword cntrl
725 shall be specified." Already checked above. */
726
727 /* print restriction: "No character specified for the keyword cntrl
728 shall be specified." Already checked above. */
729
730 /* graph - print relation: differ only in the <space> character.
731 How is this possible if there are more than one space character?!
732 I think susv2/xbd/locale.html should speak of "space characters",
733 not "space character". */
734 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
735 fprintf (stderr,
736 "%s is print but not graph|<space>\n", ucs_symbol (ch));
737 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
738 fprintf (stderr,
739 "%s is graph|<space> but not print\n", ucs_symbol (ch));
740 }
741
742 fprintf (stream, "LC_CTYPE\n");
743 output_charclass (stream, "upper", is_upper);
744 output_charclass (stream, "lower", is_lower);
745 output_charclass (stream, "alpha", is_alpha);
746 output_charclass (stream, "digit", is_digit);
747 output_charclass (stream, "outdigit", is_outdigit);
748 output_charclass (stream, "blank", is_blank);
749 output_charclass (stream, "space", is_space);
750 output_charclass (stream, "cntrl", is_cntrl);
751 output_charclass (stream, "punct", is_punct);
752 output_charclass (stream, "xdigit", is_xdigit);
753 output_charclass (stream, "graph", is_graph);
754 output_charclass (stream, "print", is_print);
755 output_charclass (stream, "class \"combining\";", is_combining);
756 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
757 output_charmap (stream, "toupper", to_upper);
758 output_charmap (stream, "tolower", to_lower);
759 output_charmap (stream, "map \"totitle\";", to_title);
760 output_widthmap (stream);
761 fprintf (stream, "END LC_CTYPE\n");
762
763 if (ferror (stream) || fclose (stream))
764 {
765 fprintf (stderr, "error writing to '%s'\n", filename);
766 exit (1);
767 }
768 }
769
/* Entry point.  Expects two arguments: the path of the UnicodeData.txt
   file and the Unicode version string.  Reads the file and writes the
   generated tables to a file named "unicode" in the current directory.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_path;
  const char *unicode_version;

  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      return 1;
    }

  unicodedata_path = argv[1];
  unicode_version = argv[2];

  fill_attributes (unicodedata_path);
  output_tables ("unicode", unicode_version);

  return 0;
}
785