1#! /usr/bin/perl -w
2# -*- Perl -*-
3#
4# afblue.pl
5#
6# Process a blue zone character data file.
7#
8# Copyright 2013, 2014 by
9# David Turner, Robert Wilhelm, and Werner Lemberg.
10#
11# This file is part of the FreeType project, and may only be used,
12# modified, and distributed under the terms of the FreeType project
13# license, LICENSE.TXT.  By continuing to use, modify, or distribute
14# this file you indicate that you have read the license and
15# understand and accept it fully.
16
17use strict;
18use warnings;
19use English '-no_match_vars';
20use open ':std', ':encoding(UTF-8)';
21
22
23my $prog = $PROGRAM_NAME;
24$prog =~ s| .* / ||x;      # Remove path.
25
26die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0;
27
28
29my $datafile = $ARGV[0];
30
31my %diversions;        # The extracted and massaged data from `datafile'.
32my @else_stack;        # Booleans to track else-clauses.
33my @name_stack;        # Stack of integers used for names of aux. variables.
34
35my $curr_enum;         # Name of the current enumeration.
36my $curr_array;        # Name of the current array.
37my $curr_max;          # Name of the current maximum value.
38
39my $curr_enum_element; # Name of the current enumeration element.
40my $curr_offset;       # The offset relative to current aux. variable.
41my $curr_elem_size;    # The size of the current string or block.
42
43my $have_sections = 0; # Boolean; set if start of a section has been seen.
44my $have_strings;      # Boolean; set if current section contains strings.
45my $have_blocks;       # Boolean; set if current section contains blocks.
46
47my $have_enum_element; # Boolean; set if we have an enumeration element.
48my $in_string;         # Boolean; set if a string has been parsed.
49
50my $num_sections = 0;  # Number of sections seen so far.
51
52my $last_aux;          # Name of last auxiliary variable.
53
54
55# Regular expressions.
56
57# [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n'
58my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x;
59
60# [<ws>] <enum_element_name> [<ws>] '\n'
61my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x;
62
63# '#' <preprocessor directive> '\n'
64my $preprocessor_re = qr/ ^ \# /x;
65
66# '/' '/' <comment> '\n'
67my $comment_re = qr| ^ // |x;
68
69# empty line
70my $whitespace_only_re = qr/ ^ \s* $ /x;
71
72# [<ws>] '"' <string> '"' [<ws>] '\n'  (<string> doesn't contain newlines)
73my $string_re = qr/ ^ \s*
74                       " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) "
75                       \s* $ /x;
76
77# [<ws>] '{' <block> '}' [<ws>] '\n'  (<block> can contain newlines)
78my $block_start_re = qr/ ^ \s* \{ /x;
79
80# We need the capturing group for `split' to make it return the separator
81# tokens (i.e., the opening and closing brace) also.
82my $brace_re = qr/ ( [{}] ) /x;
83
84
85sub Warn
86{
87  my $message = shift;
88  warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n";
89}
90
91
92sub Die
93{
94  my $message = shift;
95  die "$datafile:$INPUT_LINE_NUMBER: error: $message\n";
96}
97
98
99my $warned_before = 0;
100
101sub warn_before
102{
103  Warn("data before first section gets ignored") unless $warned_before;
104  $warned_before = 1;
105}
106
107
108sub strip_newline
109{
110  chomp;
111  s/ \x0D $ //x;
112}
113
114
115sub end_curr_string
116{
117  # Append final null byte to string.
118  if ($have_strings)
119  {
120    push @{$diversions{$curr_array}}, "    '\\0',\n" if $in_string;
121
122    $curr_offset++;
123    $in_string = 0;
124  }
125}
126
127
128sub update_max_elem_size
129{
130  if ($curr_elem_size)
131  {
132    my $max = pop @{$diversions{$curr_max}};
133    $max = $curr_elem_size if $curr_elem_size > $max;
134    push @{$diversions{$curr_max}}, $max;
135  }
136}
137
138
139sub convert_non_ascii_char
140{
141  # A UTF-8 character outside of the printable ASCII range, with possibly a
142  # leading backslash character.
143  my $s = shift;
144
145  # Here we count characters, not bytes.
146  $curr_elem_size += length $s;
147
148  utf8::encode($s);
149  $s = uc unpack 'H*', $s;
150
151  $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg;
152
153  return $s;
154}
155
156
157sub convert_ascii_chars
158{
159  # A series of ASCII characters in the printable range.
160  my $s = shift;
161
162  my $count = $s =~ s/\G(.)/'$1', /g;
163  $curr_offset += $count;
164  $curr_elem_size += $count;
165
166  return $s;
167}
168
169
170sub convert_literal
171{
172  my $s = shift;
173  my $orig = $s;
174
175  # ASCII printables and space
176  my $safe_re = '\x20-\x7E';
177  # ASCII printables and space, no backslash
178  my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E';
179
180  $s =~ s{
181           (?: \\? ( [^$safe_re] )
182               | ( (?: [$safe_no_backslash_re]
183                       | \\ [$safe_re] )+ ) )
184         }
185         {
186           defined($1) ? convert_non_ascii_char($1)
187                       : convert_ascii_chars($2)
188         }egx;
189
190   # We assume that `$orig' doesn't contain `*/'
191   return $s . " /* $orig */";
192}
193
194
195sub aux_name
196{
197  return "af_blue_" . $num_sections. "_" . join('_', @name_stack);
198}
199
200
201sub aux_name_next
202{
203  $name_stack[$#name_stack]++;
204  my $name = aux_name();
205  $name_stack[$#name_stack]--;
206
207  return $name;
208}
209
210
211sub enum_val_string
212{
213  # Build string that holds code to save the current offset in an
214  # enumeration element.
215  my $aux = shift;
216
217  my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" )
218              ? ""
219              : "$last_aux + ";
220
221  return "    $aux = $add$curr_offset,\n";
222}
223
224
225
226# Process data file.
227
228open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n";
229
230while (<DATA>)
231{
232  strip_newline();
233
234  next if /$comment_re/;
235  next if /$whitespace_only_re/;
236
237  if (/$section_re/)
238  {
239    Warn("previous section is empty") if ($have_sections
240                                          && !$have_strings
241                                          && !$have_blocks);
242
243    end_curr_string();
244    update_max_elem_size();
245
246    # Save captured groups from `section_re'.
247    $curr_enum = $1;
248    $curr_array = $2;
249    $curr_max = $3;
250
251    $curr_enum_element = "";
252    $curr_offset = 0;
253
254    Warn("overwriting already defined enumeration \`$curr_enum'")
255      if exists($diversions{$curr_enum});
256    Warn("overwriting already defined array \`$curr_array'")
257      if exists($diversions{$curr_array});
258    Warn("overwriting already defined maximum value \`$curr_max'")
259      if exists($diversions{$curr_max});
260
261    $diversions{$curr_enum} = [];
262    $diversions{$curr_array} = [];
263    $diversions{$curr_max} = [];
264
265    push @{$diversions{$curr_max}}, 0;
266
267    @name_stack = ();
268    push @name_stack, 0;
269
270    $have_sections = 1;
271    $have_strings = 0;
272    $have_blocks = 0;
273
274    $have_enum_element = 0;
275    $in_string = 0;
276
277    $num_sections++;
278    $curr_elem_size = 0;
279
280    $last_aux = aux_name();
281
282    next;
283  }
284
285  if (/$preprocessor_re/)
286  {
287    if ($have_sections)
288    {
289      # Having preprocessor conditionals complicates the computation of
290      # correct offset values.  We have to introduce auxiliary enumeration
291      # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store
292      # offsets to be used in conditional clauses.  `<s>' is the number of
293      # sections seen so far, `<n1>' is the number of `#if' and `#endif'
294      # conditionals seen so far in the topmost level, `<n2>' the number of
295      # `#if' and `#endif' conditionals seen so far one level deeper, etc.
296      # As a consequence, uneven values are used within a clause, and even
297      # values after a clause, since the C standard doesn't allow the
298      # redefinition of an enumeration value.  For example, the name
299      # `af_blue_5_1_6' is used to construct enumeration values in the fifth
300      # section after the third (second-level) if-clause within the first
301      # (top-level) if-clause.  After the first top-level clause has
302      # finished, `af_blue_5_2' is used.  The current offset is then
303      # relative to the value stored in the current auxiliary element.
304
305      if (/ ^ \# \s* if /x)
306      {
307        push @else_stack, 0;
308
309        $name_stack[$#name_stack]++;
310
311        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
312        $last_aux = aux_name();
313
314        push @name_stack, 0;
315
316        $curr_offset = 0;
317      }
318      elsif (/ ^ \# \s* elif /x)
319      {
320        Die("unbalanced #elif") unless @else_stack;
321
322        pop @name_stack;
323
324        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
325        $last_aux = aux_name();
326
327        push @name_stack, 0;
328
329        $curr_offset = 0;
330      }
331      elsif (/ ^ \# \s* else /x)
332      {
333        my $prev_else = pop @else_stack;
334        Die("unbalanced #else") unless defined($prev_else);
335        Die("#else already seen") if $prev_else;
336        push @else_stack, 1;
337
338        pop @name_stack;
339
340        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
341        $last_aux = aux_name();
342
343        push @name_stack, 0;
344
345        $curr_offset = 0;
346      }
347      elsif (/ ^ (\# \s*) endif /x)
348      {
349        my $prev_else = pop @else_stack;
350        Die("unbalanced #endif") unless defined($prev_else);
351
352        pop @name_stack;
353
354        # If there is no else-clause for an if-clause, we add one.  This is
355        # necessary to have correct offsets.
356        if (!$prev_else)
357        {
358          # Use amount of whitespace from `endif'.
359          push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next())
360                                           . $1 . "else\n";
361          $last_aux = aux_name();
362
363          $curr_offset = 0;
364        }
365
366        $name_stack[$#name_stack]++;
367
368        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
369        $last_aux = aux_name();
370
371        $curr_offset = 0;
372      }
373
374      # Handle (probably continued) preprocessor lines.
375    CONTINUED_LOOP:
376      {
377        do
378        {
379          strip_newline();
380
381          push @{$diversions{$curr_enum}}, $ARG . "\n";
382          push @{$diversions{$curr_array}}, $ARG . "\n";
383
384          last CONTINUED_LOOP unless / \\ $ /x;
385
386        } while (<DATA>);
387      }
388    }
389    else
390    {
391      warn_before();
392    }
393
394    next;
395  }
396
397  if (/$enum_element_re/)
398  {
399    end_curr_string();
400    update_max_elem_size();
401
402    $curr_enum_element = $1;
403    $have_enum_element = 1;
404    $curr_elem_size = 0;
405
406    next;
407  }
408
409  if (/$string_re/)
410  {
411    if ($have_sections)
412    {
413      Die("strings and blocks can't be mixed in a section") if $have_blocks;
414
415      # Save captured group from `string_re'.
416      my $string = $1;
417
418      if ($have_enum_element)
419      {
420        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
421        $have_enum_element = 0;
422      }
423
424      $string = convert_literal($string);
425
426      push @{$diversions{$curr_array}}, "    $string\n";
427
428      $have_strings = 1;
429      $in_string = 1;
430    }
431    else
432    {
433      warn_before();
434    }
435
436    next;
437  }
438
439  if (/$block_start_re/)
440  {
441    if ($have_sections)
442    {
443      Die("strings and blocks can't be mixed in a section") if $have_strings;
444
445      my $depth = 0;
446      my $block = "";
447      my $block_end = 0;
448
449      # Count braces while getting the block.
450    BRACE_LOOP:
451      {
452        do
453        {
454          strip_newline();
455
456          foreach my $substring (split(/$brace_re/))
457          {
458            if ($block_end)
459            {
460              Die("invalid data after last matching closing brace")
461                if $substring !~ /$whitespace_only_re/;
462            }
463
464            $block .= $substring;
465
466            if ($substring eq '{')
467            {
468              $depth++;
469            }
470            elsif ($substring eq '}')
471            {
472              $depth--;
473
474              $block_end = 1 if $depth == 0;
475            }
476          }
477
478          # If we are here, we have run out of substrings, so get next line
479          # or exit.
480          last BRACE_LOOP if $block_end;
481
482          $block .= "\n";
483
484        } while (<DATA>);
485      }
486
487      if ($have_enum_element)
488      {
489        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
490        $have_enum_element = 0;
491      }
492
493      push @{$diversions{$curr_array}}, $block . ",\n";
494
495      $curr_offset++;
496      $curr_elem_size++;
497
498      $have_blocks = 1;
499    }
500    else
501    {
502      warn_before();
503    }
504
505    next;
506  }
507
508  # Garbage.  We weren't able to parse the data.
509  Die("syntax error");
510}
511
512# Finalize data.
513end_curr_string();
514update_max_elem_size();
515
516
517# Filter stdin to stdout, replacing `@...@' templates.
518
519sub emit_diversion
520{
521  my $diversion_name = shift;
522  return (exists($diversions{$1})) ? "@{$diversions{$1}}"
523                                   : "@" . $diversion_name . "@";
524}
525
526
527$LIST_SEPARATOR = '';
528
529my $s1 = "This file has been generated by the Perl script \`$prog',";
530my $s1len = length $s1;
531my $s2 = "using data from file \`$datafile'.";
532my $s2len = length $s2;
533my $slen = ($s1len > $s2len) ? $s1len : $s2len;
534
535print "/* " . $s1 . " " x ($slen - $s1len) . " */\n"
536      . "/* " . $s2 . " " x ($slen - $s2len) . " */\n"
537      . "\n";
538
539while (<STDIN>)
540{
541  s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx;
542  print;
543}
544
545# EOF
546