1#! /usr/bin/perl -w 2# -*- Perl -*- 3# 4# afblue.pl 5# 6# Process a blue zone character data file. 7# 8# Copyright 2013, 2014 by 9# David Turner, Robert Wilhelm, and Werner Lemberg. 10# 11# This file is part of the FreeType project, and may only be used, 12# modified, and distributed under the terms of the FreeType project 13# license, LICENSE.TXT. By continuing to use, modify, or distribute 14# this file you indicate that you have read the license and 15# understand and accept it fully. 16 17use strict; 18use warnings; 19use English '-no_match_vars'; 20use open ':std', ':encoding(UTF-8)'; 21 22 23my $prog = $PROGRAM_NAME; 24$prog =~ s| .* / ||x; # Remove path. 25 26die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0; 27 28 29my $datafile = $ARGV[0]; 30 31my %diversions; # The extracted and massaged data from `datafile'. 32my @else_stack; # Booleans to track else-clauses. 33my @name_stack; # Stack of integers used for names of aux. variables. 34 35my $curr_enum; # Name of the current enumeration. 36my $curr_array; # Name of the current array. 37my $curr_max; # Name of the current maximum value. 38 39my $curr_enum_element; # Name of the current enumeration element. 40my $curr_offset; # The offset relative to current aux. variable. 41my $curr_elem_size; # The size of the current string or block. 42 43my $have_sections = 0; # Boolean; set if start of a section has been seen. 44my $have_strings; # Boolean; set if current section contains strings. 45my $have_blocks; # Boolean; set if current section contains blocks. 46 47my $have_enum_element; # Boolean; set if we have an enumeration element. 48my $in_string; # Boolean; set if a string has been parsed. 49 50my $num_sections = 0; # Number of sections seen so far. 51 52my $last_aux; # Name of last auxiliary variable. 53 54 55# Regular expressions. 56 57# [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n' 58my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x; 59 60# [<ws>] <enum_element_name> [<ws>] '\n' 61my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x; 62 63# '#' <preprocessor directive> '\n' 64my $preprocessor_re = qr/ ^ \# /x; 65 66# '/' '/' <comment> '\n' 67my $comment_re = qr| ^ // |x; 68 69# empty line 70my $whitespace_only_re = qr/ ^ \s* $ /x; 71 72# [<ws>] '"' <string> '"' [<ws>] '\n' (<string> doesn't contain newlines) 73my $string_re = qr/ ^ \s* 74 " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) " 75 \s* $ /x; 76 77# [<ws>] '{' <block> '}' [<ws>] '\n' (<block> can contain newlines) 78my $block_start_re = qr/ ^ \s* \{ /x; 79 80# We need the capturing group for `split' to make it return the separator 81# tokens (i.e., the opening and closing brace) also. 82my $brace_re = qr/ ( [{}] ) /x; 83 84 85sub Warn 86{ 87 my $message = shift; 88 warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n"; 89} 90 91 92sub Die 93{ 94 my $message = shift; 95 die "$datafile:$INPUT_LINE_NUMBER: error: $message\n"; 96} 97 98 99my $warned_before = 0; 100 101sub warn_before 102{ 103 Warn("data before first section gets ignored") unless $warned_before; 104 $warned_before = 1; 105} 106 107 108sub strip_newline 109{ 110 chomp; 111 s/ \x0D $ //x; 112} 113 114 115sub end_curr_string 116{ 117 # Append final null byte to string. 118 if ($have_strings) 119 { 120 push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string; 121 122 $curr_offset++; 123 $in_string = 0; 124 } 125} 126 127 128sub update_max_elem_size 129{ 130 if ($curr_elem_size) 131 { 132 my $max = pop @{$diversions{$curr_max}}; 133 $max = $curr_elem_size if $curr_elem_size > $max; 134 push @{$diversions{$curr_max}}, $max; 135 } 136} 137 138 139sub convert_non_ascii_char 140{ 141 # A UTF-8 character outside of the printable ASCII range, with possibly a 142 # leading backslash character. 143 my $s = shift; 144 145 # Here we count characters, not bytes. 146 $curr_elem_size += length $s; 147 148 utf8::encode($s); 149 $s = uc unpack 'H*', $s; 150 151 $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg; 152 153 return $s; 154} 155 156 157sub convert_ascii_chars 158{ 159 # A series of ASCII characters in the printable range. 160 my $s = shift; 161 162 my $count = $s =~ s/\G(.)/'$1', /g; 163 $curr_offset += $count; 164 $curr_elem_size += $count; 165 166 return $s; 167} 168 169 170sub convert_literal 171{ 172 my $s = shift; 173 my $orig = $s; 174 175 # ASCII printables and space 176 my $safe_re = '\x20-\x7E'; 177 # ASCII printables and space, no backslash 178 my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E'; 179 180 $s =~ s{ 181 (?: \\? ( [^$safe_re] ) 182 | ( (?: [$safe_no_backslash_re] 183 | \\ [$safe_re] )+ ) ) 184 } 185 { 186 defined($1) ? convert_non_ascii_char($1) 187 : convert_ascii_chars($2) 188 }egx; 189 190 # We assume that `$orig' doesn't contain `*/' 191 return $s . " /* $orig */"; 192} 193 194 195sub aux_name 196{ 197 return "af_blue_" . $num_sections. "_" . join('_', @name_stack); 198} 199 200 201sub aux_name_next 202{ 203 $name_stack[$#name_stack]++; 204 my $name = aux_name(); 205 $name_stack[$#name_stack]--; 206 207 return $name; 208} 209 210 211sub enum_val_string 212{ 213 # Build string that holds code to save the current offset in an 214 # enumeration element. 215 my $aux = shift; 216 217 my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" ) 218 ? "" 219 : "$last_aux + "; 220 221 return " $aux = $add$curr_offset,\n"; 222} 223 224 225 226# Process data file. 227 228open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n"; 229 230while (<DATA>) 231{ 232 strip_newline(); 233 234 next if /$comment_re/; 235 next if /$whitespace_only_re/; 236 237 if (/$section_re/) 238 { 239 Warn("previous section is empty") if ($have_sections 240 && !$have_strings 241 && !$have_blocks); 242 243 end_curr_string(); 244 update_max_elem_size(); 245 246 # Save captured groups from `section_re'. 247 $curr_enum = $1; 248 $curr_array = $2; 249 $curr_max = $3; 250 251 $curr_enum_element = ""; 252 $curr_offset = 0; 253 254 Warn("overwriting already defined enumeration \`$curr_enum'") 255 if exists($diversions{$curr_enum}); 256 Warn("overwriting already defined array \`$curr_array'") 257 if exists($diversions{$curr_array}); 258 Warn("overwriting already defined maximum value \`$curr_max'") 259 if exists($diversions{$curr_max}); 260 261 $diversions{$curr_enum} = []; 262 $diversions{$curr_array} = []; 263 $diversions{$curr_max} = []; 264 265 push @{$diversions{$curr_max}}, 0; 266 267 @name_stack = (); 268 push @name_stack, 0; 269 270 $have_sections = 1; 271 $have_strings = 0; 272 $have_blocks = 0; 273 274 $have_enum_element = 0; 275 $in_string = 0; 276 277 $num_sections++; 278 $curr_elem_size = 0; 279 280 $last_aux = aux_name(); 281 282 next; 283 } 284 285 if (/$preprocessor_re/) 286 { 287 if ($have_sections) 288 { 289 # Having preprocessor conditionals complicates the computation of 290 # correct offset values. We have to introduce auxiliary enumeration 291 # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store 292 # offsets to be used in conditional clauses. `<s>' is the number of 293 # sections seen so far, `<n1>' is the number of `#if' and `#endif' 294 # conditionals seen so far in the topmost level, `<n2>' the number of 295 # `#if' and `#endif' conditionals seen so far one level deeper, etc. 296 # As a consequence, uneven values are used within a clause, and even 297 # values after a clause, since the C standard doesn't allow the 298 # redefinition of an enumeration value. For example, the name 299 # `af_blue_5_1_6' is used to construct enumeration values in the fifth 300 # section after the third (second-level) if-clause within the first 301 # (top-level) if-clause. After the first top-level clause has 302 # finished, `af_blue_5_2' is used. The current offset is then 303 # relative to the value stored in the current auxiliary element. 304 305 if (/ ^ \# \s* if /x) 306 { 307 push @else_stack, 0; 308 309 $name_stack[$#name_stack]++; 310 311 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 312 $last_aux = aux_name(); 313 314 push @name_stack, 0; 315 316 $curr_offset = 0; 317 } 318 elsif (/ ^ \# \s* elif /x) 319 { 320 Die("unbalanced #elif") unless @else_stack; 321 322 pop @name_stack; 323 324 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 325 $last_aux = aux_name(); 326 327 push @name_stack, 0; 328 329 $curr_offset = 0; 330 } 331 elsif (/ ^ \# \s* else /x) 332 { 333 my $prev_else = pop @else_stack; 334 Die("unbalanced #else") unless defined($prev_else); 335 Die("#else already seen") if $prev_else; 336 push @else_stack, 1; 337 338 pop @name_stack; 339 340 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 341 $last_aux = aux_name(); 342 343 push @name_stack, 0; 344 345 $curr_offset = 0; 346 } 347 elsif (/ ^ (\# \s*) endif /x) 348 { 349 my $prev_else = pop @else_stack; 350 Die("unbalanced #endif") unless defined($prev_else); 351 352 pop @name_stack; 353 354 # If there is no else-clause for an if-clause, we add one. This is 355 # necessary to have correct offsets. 356 if (!$prev_else) 357 { 358 # Use amount of whitespace from `endif'. 359 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()) 360 . $1 . "else\n"; 361 $last_aux = aux_name(); 362 363 $curr_offset = 0; 364 } 365 366 $name_stack[$#name_stack]++; 367 368 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 369 $last_aux = aux_name(); 370 371 $curr_offset = 0; 372 } 373 374 # Handle (probably continued) preprocessor lines. 375 CONTINUED_LOOP: 376 { 377 do 378 { 379 strip_newline(); 380 381 push @{$diversions{$curr_enum}}, $ARG . "\n"; 382 push @{$diversions{$curr_array}}, $ARG . "\n"; 383 384 last CONTINUED_LOOP unless / \\ $ /x; 385 386 } while (<DATA>); 387 } 388 } 389 else 390 { 391 warn_before(); 392 } 393 394 next; 395 } 396 397 if (/$enum_element_re/) 398 { 399 end_curr_string(); 400 update_max_elem_size(); 401 402 $curr_enum_element = $1; 403 $have_enum_element = 1; 404 $curr_elem_size = 0; 405 406 next; 407 } 408 409 if (/$string_re/) 410 { 411 if ($have_sections) 412 { 413 Die("strings and blocks can't be mixed in a section") if $have_blocks; 414 415 # Save captured group from `string_re'. 416 my $string = $1; 417 418 if ($have_enum_element) 419 { 420 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 421 $have_enum_element = 0; 422 } 423 424 $string = convert_literal($string); 425 426 push @{$diversions{$curr_array}}, " $string\n"; 427 428 $have_strings = 1; 429 $in_string = 1; 430 } 431 else 432 { 433 warn_before(); 434 } 435 436 next; 437 } 438 439 if (/$block_start_re/) 440 { 441 if ($have_sections) 442 { 443 Die("strings and blocks can't be mixed in a section") if $have_strings; 444 445 my $depth = 0; 446 my $block = ""; 447 my $block_end = 0; 448 449 # Count braces while getting the block. 450 BRACE_LOOP: 451 { 452 do 453 { 454 strip_newline(); 455 456 foreach my $substring (split(/$brace_re/)) 457 { 458 if ($block_end) 459 { 460 Die("invalid data after last matching closing brace") 461 if $substring !~ /$whitespace_only_re/; 462 } 463 464 $block .= $substring; 465 466 if ($substring eq '{') 467 { 468 $depth++; 469 } 470 elsif ($substring eq '}') 471 { 472 $depth--; 473 474 $block_end = 1 if $depth == 0; 475 } 476 } 477 478 # If we are here, we have run out of substrings, so get next line 479 # or exit. 480 last BRACE_LOOP if $block_end; 481 482 $block .= "\n"; 483 484 } while (<DATA>); 485 } 486 487 if ($have_enum_element) 488 { 489 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 490 $have_enum_element = 0; 491 } 492 493 push @{$diversions{$curr_array}}, $block . ",\n"; 494 495 $curr_offset++; 496 $curr_elem_size++; 497 498 $have_blocks = 1; 499 } 500 else 501 { 502 warn_before(); 503 } 504 505 next; 506 } 507 508 # Garbage. We weren't able to parse the data. 509 Die("syntax error"); 510} 511 512# Finalize data. 513end_curr_string(); 514update_max_elem_size(); 515 516 517# Filter stdin to stdout, replacing `@...@' templates. 518 519sub emit_diversion 520{ 521 my $diversion_name = shift; 522 return (exists($diversions{$1})) ? "@{$diversions{$1}}" 523 : "@" . $diversion_name . "@"; 524} 525 526 527$LIST_SEPARATOR = ''; 528 529my $s1 = "This file has been generated by the Perl script \`$prog',"; 530my $s1len = length $s1; 531my $s2 = "using data from file \`$datafile'."; 532my $s2len = length $s2; 533my $slen = ($s1len > $s2len) ? $s1len : $s2len; 534 535print "/* " . $s1 . " " x ($slen - $s1len) . " */\n" 536 . "/* " . $s2 . " " x ($slen - $s2len) . " */\n" 537 . "\n"; 538 539while (<STDIN>) 540{ 541 s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx; 542 print; 543} 544 545# EOF 546