1 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
2    Copyright (C) 2000-2001 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 /* Usage example:
21      $ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <string.h>
28 #include <time.h>
29 
/* One record of the UnicodeData.txt file, i.e. the properties of a single
   code point.  String members hold "" when the field was empty; the case
   mapping members hold NONE when the field was empty.  A record whose
   `name' is NULL has never been filled in.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category, e.g. "Lu" or "Zs" */
  const char *combining;      /* Canonical combining class, as text */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* Nonzero when the mirrored field is 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping, or NONE */
  unsigned int lower;         /* Lowercase mapping, or NONE */
  unsigned int title;         /* Titlecase mapping, or NONE */
};

/* Marker for a missing character-valued field (all bits set).  Missing
   string-valued fields are represented by "" instead.  */
#define NONE (~(unsigned int)0)

/* The whole UnicodeData.txt file, indexed by code point
   (0 .. 0x10FFFF).  */
struct unicode_attribute unicode_attributes [0x110000];
55 
56 /* Stores in unicode_attributes[i] the values from the given fields.  */
57 static void
fill_attribute(unsigned int i,const char * field1,const char * field2,const char * field3,const char * field4,const char * field5,const char * field6,const char * field7,const char * field8,const char * field9,const char * field10,const char * field11,const char * field12,const char * field13,const char * field14)58 fill_attribute (unsigned int i,
59 		const char *field1, const char *field2,
60 		const char *field3, const char *field4,
61 		const char *field5, const char *field6,
62 		const char *field7, const char *field8,
63 		const char *field9, const char *field10,
64 		const char *field11, const char *field12,
65 		const char *field13, const char *field14)
66 {
67   struct unicode_attribute * uni;
68 
69   if (i >= 0x110000)
70     {
71       fprintf (stderr, "index too large\n");
72       exit (1);
73     }
74   if (strcmp (field2, "Cs") == 0)
75     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
76     return;
77   uni = &unicode_attributes[i];
78   /* Copy the strings.  */
79   uni->name          = strdup (field1);
80   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
81   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
82   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
83   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
85   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
86   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
87   uni->mirrored      = (field9[0] == 'Y');
88   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
89   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
90   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
93 }
94 
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which must have room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM; the
   delimiter itself is not stored and carriage returns are dropped.
   Returns 1 when a complete field was read (BUFFER is then
   NUL-terminated), or 0 when end of file was hit before DELIM (BUFFER is
   then left unterminated).  Exits with status 1 on an overlong field.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;

  for (;;)
    {
      int ch = getc (stream);

      if (ch == EOF)
        return 0;
      if (ch == delim)
        break;

      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (ch == '\r')
        continue;

      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = ch;
    }

  *out = '\0';
  return 1;
}
129 
130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
131    file.  */
132 static void
fill_attributes(const char * unicodedata_filename)133 fill_attributes (const char *unicodedata_filename)
134 {
135   unsigned int i, j;
136   FILE *stream;
137   char field0[FIELDLEN];
138   char field1[FIELDLEN];
139   char field2[FIELDLEN];
140   char field3[FIELDLEN];
141   char field4[FIELDLEN];
142   char field5[FIELDLEN];
143   char field6[FIELDLEN];
144   char field7[FIELDLEN];
145   char field8[FIELDLEN];
146   char field9[FIELDLEN];
147   char field10[FIELDLEN];
148   char field11[FIELDLEN];
149   char field12[FIELDLEN];
150   char field13[FIELDLEN];
151   char field14[FIELDLEN];
152   int lineno = 0;
153 
154   for (i = 0; i < 0x110000; i++)
155     unicode_attributes[i].name = NULL;
156 
157   stream = fopen (unicodedata_filename, "r");
158   if (stream == NULL)
159     {
160       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
161       exit (1);
162     }
163 
164   for (;;)
165     {
166       int n;
167 
168       lineno++;
169       n = getfield (stream, field0, ';');
170       n += getfield (stream, field1, ';');
171       n += getfield (stream, field2, ';');
172       n += getfield (stream, field3, ';');
173       n += getfield (stream, field4, ';');
174       n += getfield (stream, field5, ';');
175       n += getfield (stream, field6, ';');
176       n += getfield (stream, field7, ';');
177       n += getfield (stream, field8, ';');
178       n += getfield (stream, field9, ';');
179       n += getfield (stream, field10, ';');
180       n += getfield (stream, field11, ';');
181       n += getfield (stream, field12, ';');
182       n += getfield (stream, field13, ';');
183       n += getfield (stream, field14, '\n');
184       if (n == 0)
185 	break;
186       if (n != 15)
187 	{
188 	  fprintf (stderr, "short line in'%s':%d\n",
189 		   unicodedata_filename, lineno);
190 	  exit (1);
191 	}
192       i = strtoul (field0, NULL, 16);
193       if (field1[0] == '<'
194 	  && strlen (field1) >= 9
195 	  && !strcmp (field1 + strlen(field1) - 8, ", First>"))
196 	{
197 	  /* Deal with a range. */
198 	  lineno++;
199 	  n = getfield (stream, field0, ';');
200 	  n += getfield (stream, field1, ';');
201 	  n += getfield (stream, field2, ';');
202 	  n += getfield (stream, field3, ';');
203 	  n += getfield (stream, field4, ';');
204 	  n += getfield (stream, field5, ';');
205 	  n += getfield (stream, field6, ';');
206 	  n += getfield (stream, field7, ';');
207 	  n += getfield (stream, field8, ';');
208 	  n += getfield (stream, field9, ';');
209 	  n += getfield (stream, field10, ';');
210 	  n += getfield (stream, field11, ';');
211 	  n += getfield (stream, field12, ';');
212 	  n += getfield (stream, field13, ';');
213 	  n += getfield (stream, field14, '\n');
214 	  if (n != 15)
215 	    {
216 	      fprintf (stderr, "missing end range in '%s':%d\n",
217 		       unicodedata_filename, lineno);
218 	      exit (1);
219 	    }
220 	  if (!(field1[0] == '<'
221 		&& strlen (field1) >= 8
222 		&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
223 	    {
224 	      fprintf (stderr, "missing end range in '%s':%d\n",
225 		       unicodedata_filename, lineno);
226 	      exit (1);
227 	    }
228 	  field1[strlen (field1) - 7] = '\0';
229 	  j = strtoul (field0, NULL, 16);
230 	  for (; i <= j; i++)
231 	    fill_attribute (i, field1+1, field2, field3, field4, field5,
232 			       field6, field7, field8, field9, field10,
233 			       field11, field12, field13, field14);
234 	}
235       else
236 	{
237 	  /* Single character line */
238 	  fill_attribute (i, field1, field2, field3, field4, field5,
239 			     field6, field7, field8, field9, field10,
240 			     field11, field12, field13, field14);
241 	}
242     }
243   if (ferror (stream) || fclose (stream))
244     {
245       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
246       exit (1);
247     }
248 }
249 
250 /* Character mappings.  */
251 
252 static unsigned int
to_upper(unsigned int ch)253 to_upper (unsigned int ch)
254 {
255   if (unicode_attributes[ch].name != NULL
256       && unicode_attributes[ch].upper != NONE)
257     return unicode_attributes[ch].upper;
258   else
259     return ch;
260 }
261 
262 static unsigned int
to_lower(unsigned int ch)263 to_lower (unsigned int ch)
264 {
265   if (unicode_attributes[ch].name != NULL
266       && unicode_attributes[ch].lower != NONE)
267     return unicode_attributes[ch].lower;
268   else
269     return ch;
270 }
271 
272 static unsigned int
to_title(unsigned int ch)273 to_title (unsigned int ch)
274 {
275   if (unicode_attributes[ch].name != NULL
276       && unicode_attributes[ch].title != NONE)
277     return unicode_attributes[ch].title;
278   else
279     return ch;
280 }
281 
282 /* Character class properties.  */
283 
/* A character counts as uppercase when lowercasing changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
289 
/* A character counts as lowercase when uppercasing changes it.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
297 
298 static bool
is_alpha(unsigned int ch)299 is_alpha (unsigned int ch)
300 {
301   return (unicode_attributes[ch].name != NULL
302 	  && ((unicode_attributes[ch].category[0] == 'L'
303 	       /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
304 		  <U0E2F>, <U0E46> should belong to is_punct.  */
305 	       && (ch != 0x0E2F) && (ch != 0x0E46))
306 	      /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
307 		 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
308 	      || (ch == 0x0E31)
309 	      || (ch >= 0x0E34 && ch <= 0x0E3A)
310 	      || (ch >= 0x0E47 && ch <= 0x0E4E)
311 	      /* Avoid warning for <U0345>.  */
312 	      || (ch == 0x0345)
313 	      /* Avoid warnings for <U2160>..<U217F>.  */
314 	      || (unicode_attributes[ch].category[0] == 'N'
315 		  && unicode_attributes[ch].category[1] == 'l')
316 	      /* Avoid warnings for <U24B6>..<U24E9>.  */
317 	      || (unicode_attributes[ch].category[0] == 'S'
318 		  && unicode_attributes[ch].category[1] == 'o'
319 		  && strstr (unicode_attributes[ch].name, " LETTER ")
320 		     != NULL)
321 	      /* Consider all the non-ASCII digits as alphabetic.
322 		 ISO C 99 forbids us to have them in category "digit",
323 		 but we want iswalnum to return true on them.  */
324 	      || (unicode_attributes[ch].category[0] == 'N'
325 		  && unicode_attributes[ch].category[1] == 'd'
326 		  && !(ch >= 0x0030 && ch <= 0x0039))));
327 }
328 
/* Tells whether CH belongs to the "digit" class: exactly the ten ASCII
   digits.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The alternative of accepting every character of general category "Nd"
   is therefore not used.  (Note for that alternative: U+0BE7..U+0BEF and
   U+1369..U+1371 are digit systems without a zero; <0> would have to be
   added in front of them by hand.)  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
350 
/* Tells whether CH belongs to the "outdigit" class: the ASCII digits
   <U0030>..<U0039>.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
356 
357 static bool
is_blank(unsigned int ch)358 is_blank (unsigned int ch)
359 {
360   return (ch == 0x0009 /* '\t' */
361 	  /* Category Zs without mention of "<noBreak>" */
362 	  || (unicode_attributes[ch].name != NULL
363 	      && unicode_attributes[ch].category[0] == 'Z'
364 	      && unicode_attributes[ch].category[1] == 's'
365 	      && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
366 }
367 
368 static bool
is_space(unsigned int ch)369 is_space (unsigned int ch)
370 {
371   /* Don't make U+00A0 a space. Non-breaking space means that all programs
372      should treat it like a punctuation character, not like a space. */
373   return (ch == 0x0020 /* ' ' */
374 	  || ch == 0x000C /* '\f' */
375 	  || ch == 0x000A /* '\n' */
376 	  || ch == 0x000D /* '\r' */
377 	  || ch == 0x0009 /* '\t' */
378 	  || ch == 0x000B /* '\v' */
379 	  /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
380 	  || (unicode_attributes[ch].name != NULL
381 	      && unicode_attributes[ch].category[0] == 'Z'
382 	      && (unicode_attributes[ch].category[1] == 'l'
383 		  || unicode_attributes[ch].category[1] == 'p'
384 		  || (unicode_attributes[ch].category[1] == 's'
385 		      && !strstr (unicode_attributes[ch].decomposition,
386 				  "<noBreak>")))));
387 }
388 
389 static bool
is_cntrl(unsigned int ch)390 is_cntrl (unsigned int ch)
391 {
392   return (unicode_attributes[ch].name != NULL
393 	  && (!strcmp (unicode_attributes[ch].name, "<control>")
394 	      /* Categories Zl and Zp */
395 	      || (unicode_attributes[ch].category[0] == 'Z'
396 		  && (unicode_attributes[ch].category[1] == 'l'
397 		      || unicode_attributes[ch].category[1] == 'p'))));
398 }
399 
/* Tells whether CH belongs to the "xdigit" class: 0-9, A-F and a-f.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   Hence the digit range is spelled out here instead of deferring to
   is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039)
    return true;
  if (ch >= 0x0041 && ch <= 0x0046)
    return true;
  if (ch >= 0x0061 && ch <= 0x0066)
    return true;
  return false;
}
421 
422 static bool
is_graph(unsigned int ch)423 is_graph (unsigned int ch)
424 {
425   return (unicode_attributes[ch].name != NULL
426 	  && strcmp (unicode_attributes[ch].name, "<control>")
427 	  && !is_space (ch));
428 }
429 
430 static bool
is_print(unsigned int ch)431 is_print (unsigned int ch)
432 {
433   return (unicode_attributes[ch].name != NULL
434 	  && strcmp (unicode_attributes[ch].name, "<control>")
435 	  /* Categories Zl and Zp */
436 	  && !(unicode_attributes[ch].name != NULL
437 	       && unicode_attributes[ch].category[0] == 'Z'
438 	       && (unicode_attributes[ch].category[1] == 'l'
439 		   || unicode_attributes[ch].category[1] == 'p')));
440 }
441 
/* Tells whether CH belongs to the "punct" class.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  That definition is used here rather than
   the Unicode general category "P".  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
}
454 
455 static bool
is_combining(unsigned int ch)456 is_combining (unsigned int ch)
457 {
458   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
459      file. In 3.0.1 it was identical to the union of the general categories
460      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
461      PropList.txt file, so we take the latter definition.  */
462   return (unicode_attributes[ch].name != NULL
463 	  && unicode_attributes[ch].category[0] == 'M'
464 	  && (unicode_attributes[ch].category[1] == 'n'
465 	      || unicode_attributes[ch].category[1] == 'c'
466 	      || unicode_attributes[ch].category[1] == 'e'));
467 }
468 
469 static bool
is_combining_level3(unsigned int ch)470 is_combining_level3 (unsigned int ch)
471 {
472   return is_combining (ch)
473 	 && !(unicode_attributes[ch].combining[0] != '\0'
474 	      && unicode_attributes[ch].combining[0] != '0'
475 	      && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
476 }
477 
/* Return the UCS symbol string for a Unicode character: "<UXXXX>" for
   values below 0x10000, "<UXXXXXXXX>" otherwise.  The result lives in a
   static buffer that is overwritten by the next call.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
487 
/* Return the UCS symbol range string "<Ulow>..<Uhigh>" for a Unicode
   characters interval.  The result lives in a static buffer that is
   overwritten by the next call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  size_t used;

  /* ucs_symbol reuses one static buffer, so each half has to be copied
     out before the next one is produced.  */
  strcpy (buf, ucs_symbol (low));
  used = strlen (buf);
  buf[used++] = '.';
  buf[used++] = '.';
  strcpy (buf + used, ucs_symbol (high));
  return buf;
}
499 
/* Output a character class (= property) table.

   Writes to STREAM the line "CLASSNAME sym;sym;..." where each sym is
   either a single UCS symbol or a "low..high" range covering a maximal
   run of consecutive code points for which FUNC returns true.  Lines are
   continued with "/" so that no line exceeds 75 columns.

   FUNC is evaluated directly while scanning 0 .. 0x10FFFF; no
   per-code-point table is kept, so the function needs only a few bytes of
   stack instead of more than 1 MB.  */

static void
output_charclass (FILE *stream, const char *classname,
		  bool (*func) (unsigned int))
{
  const size_t max_column = 75;
  /* Start beyond max_column so that the first symbol always begins a
     continuation line.  */
  size_t column = 1000;
  bool need_semicolon = false;
  unsigned int i = 0;

  fprintf (stream, "%s ", classname);
  while (i < 0x110000)
    {
      unsigned int low, high;
      const char *sym;
      size_t len;

      if (!func (i))
        {
          i++;
          continue;
        }

      /* Extend the run as far as FUNC stays true.  */
      low = i;
      do
        i++;
      while (i < 0x110000 && func (i));
      high = i - 1;

      /* SYM points into a static buffer; it is consumed below before any
         further ucs_symbol call.  */
      sym = (low == high ? ucs_symbol (low) : ucs_symbol_range (low, high));
      len = strlen (sym);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n   ");
          column = 3;
        }

      fprintf (stream, "%s", sym);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
557 
/* Output a character mapping table.

   Writes to STREAM the line "MAPNAME (from,to);(from,to);..." with one
   pair for every code point in 0 .. 0x10FFFF that FUNC maps to a different
   value.  Lines are continued with "/" so that no line exceeds 75 columns.

   FUNC is called once per code point and its result reused; no
   per-code-point table is kept, so the function needs only a few bytes of
   stack instead of more than 1 MB.  */

static void
output_charmap (FILE *stream, const char *mapname,
		unsigned int (*func) (unsigned int))
{
  const size_t max_column = 75;
  /* Start beyond max_column so that the first pair always begins a
     continuation line.  */
  size_t column = 1000;
  bool need_semicolon = false;
  unsigned int i;

  fprintf (stream, "%s ", mapname);
  for (i = 0; i < 0x110000; i++)
    {
      unsigned int mapped = func (i);
      char buf[25+1];   /* "(" + 11 + "," + 11 + ")" + NUL */
      size_t len;

      if (mapped == i)
        continue;

      /* ucs_symbol reuses one static buffer, so the two symbols are
         formatted one after the other.  */
      snprintf (buf, sizeof (buf), "(%s,", ucs_symbol (i));
      len = strlen (buf);
      snprintf (buf + len, sizeof (buf) - len, "%s)", ucs_symbol (mapped));
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n   ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
605 
/* Output the width table.  Currently a placeholder: nothing is written to
   STREAM.  */

static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
612 
613 /* Output the tables to the given file.  */
614 
615 static void
output_tables(const char * filename,const char * version)616 output_tables (const char *filename, const char *version)
617 {
618   FILE *stream;
619   unsigned int ch;
620 
621   stream = fopen (filename, "w");
622   if (stream == NULL)
623     {
624       fprintf (stderr, "cannot open '%s' for writing\n", filename);
625       exit (1);
626     }
627 
628   fprintf (stream, "escape_char /\n");
629   fprintf (stream, "comment_char %%\n");
630   fprintf (stream, "\n");
631   fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
632 	   version);
633   fprintf (stream, "\n");
634 
635   fprintf (stream, "LC_IDENTIFICATION\n");
636   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
637   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
638   fprintf (stream, "address   \"\"\n");
639   fprintf (stream, "contact   \"\"\n");
640   fprintf (stream, "email     \"bug-glibc-locales@gnu.org\"\n");
641   fprintf (stream, "tel       \"\"\n");
642   fprintf (stream, "fax       \"\"\n");
643   fprintf (stream, "language  \"\"\n");
644   fprintf (stream, "territory \"Earth\"\n");
645   fprintf (stream, "revision  \"%s\"\n", version);
646   {
647     time_t now;
648     char date[11];
649     now = time (NULL);
650     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
651     fprintf (stream, "date      \"%s\"\n", date);
652   }
653   fprintf (stream, "category  \"unicode:2001\";LC_CTYPE\n");
654   fprintf (stream, "END LC_IDENTIFICATION\n");
655   fprintf (stream, "\n");
656 
657   /* Verifications. */
658   for (ch = 0; ch < 0x110000; ch++)
659     {
660       /* toupper restriction: "Only characters specified for the keywords
661 	 lower and upper shall be specified.  */
662       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
663 	fprintf (stderr,
664 		 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
665 		 ucs_symbol (ch), ch, to_upper (ch));
666 
667       /* tolower restriction: "Only characters specified for the keywords
668 	 lower and upper shall be specified.  */
669       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
670 	fprintf (stderr,
671 		 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
672 		 ucs_symbol (ch), ch, to_lower (ch));
673 
674       /* alpha restriction: "Characters classified as either upper or lower
675 	 shall automatically belong to this class.  */
676       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
677 	fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
678 
679       /* alpha restriction: "No character specified for the keywords cntrl,
680 	 digit, punct or space shall be specified."  */
681       if (is_alpha (ch) && is_cntrl (ch))
682 	fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
683       if (is_alpha (ch) && is_digit (ch))
684 	fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
685       if (is_alpha (ch) && is_punct (ch))
686 	fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
687       if (is_alpha (ch) && is_space (ch))
688 	fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
689 
690       /* space restriction: "No character specified for the keywords upper,
691 	 lower, alpha, digit, graph or xdigit shall be specified."
692 	 upper, lower, alpha already checked above.  */
693       if (is_space (ch) && is_digit (ch))
694 	fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
695       if (is_space (ch) && is_graph (ch))
696 	fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
697       if (is_space (ch) && is_xdigit (ch))
698 	fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
699 
700       /* cntrl restriction: "No character specified for the keywords upper,
701 	 lower, alpha, digit, punct, graph, print or xdigit shall be
702 	 specified."  upper, lower, alpha already checked above.  */
703       if (is_cntrl (ch) && is_digit (ch))
704 	fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
705       if (is_cntrl (ch) && is_punct (ch))
706 	fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
707       if (is_cntrl (ch) && is_graph (ch))
708 	fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
709       if (is_cntrl (ch) && is_print (ch))
710 	fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
711       if (is_cntrl (ch) && is_xdigit (ch))
712 	fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
713 
714       /* punct restriction: "No character specified for the keywords upper,
715 	 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
716 	 be specified."  upper, lower, alpha, cntrl already checked above.  */
717       if (is_punct (ch) && is_digit (ch))
718 	fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
719       if (is_punct (ch) && is_xdigit (ch))
720 	fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
721       if (is_punct (ch) && (ch == 0x0020))
722 	fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
723 
724       /* graph restriction: "No character specified for the keyword cntrl
725 	 shall be specified."  Already checked above.  */
726 
727       /* print restriction: "No character specified for the keyword cntrl
728 	 shall be specified."  Already checked above.  */
729 
730       /* graph - print relation: differ only in the <space> character.
731 	 How is this possible if there are more than one space character?!
732 	 I think susv2/xbd/locale.html should speak of "space characters",
733 	 not "space character".  */
734       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
735 	fprintf (stderr,
736 		 "%s is print but not graph|<space>\n", ucs_symbol (ch));
737       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
738 	fprintf (stderr,
739 		 "%s is graph|<space> but not print\n", ucs_symbol (ch));
740     }
741 
742   fprintf (stream, "LC_CTYPE\n");
743   output_charclass (stream, "upper", is_upper);
744   output_charclass (stream, "lower", is_lower);
745   output_charclass (stream, "alpha", is_alpha);
746   output_charclass (stream, "digit", is_digit);
747   output_charclass (stream, "outdigit", is_outdigit);
748   output_charclass (stream, "blank", is_blank);
749   output_charclass (stream, "space", is_space);
750   output_charclass (stream, "cntrl", is_cntrl);
751   output_charclass (stream, "punct", is_punct);
752   output_charclass (stream, "xdigit", is_xdigit);
753   output_charclass (stream, "graph", is_graph);
754   output_charclass (stream, "print", is_print);
755   output_charclass (stream, "class \"combining\";", is_combining);
756   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
757   output_charmap (stream, "toupper", to_upper);
758   output_charmap (stream, "tolower", to_lower);
759   output_charmap (stream, "map \"totitle\";", to_title);
760   output_widthmap (stream);
761   fprintf (stream, "END LC_CTYPE\n");
762 
763   if (ferror (stream) || fclose (stream))
764     {
765       fprintf (stderr, "error writing to '%s'\n", filename);
766       exit (1);
767     }
768 }
769 
/* Entry point.  Expects exactly two arguments: the path of the
   UnicodeData.txt file and the Unicode version string.  Reads the data
   file and writes the generated tables to the file "unicode" in the
   current directory.  */
int
main (int argc, char * argv[])
{
  const char *datafile;
  const char *version;

  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      return 1;
    }
  datafile = argv[1];
  version = argv[2];

  fill_attributes (datafile);
  output_tables ("unicode", version);

  return 0;
}
785