1#!/usr/bin/perl -w 2# 3# Written with reference to pandoc_markdown from Debian jessie 4# We require atx-style headers 5# 6# usage: 7# pandoc -t json SUPPORT.md >j-unstable 8# git cat-file blob origin/staging-4.11:SUPPORT.md | pandoc -t json >j-4.11 9# docs/parse-support-md \ 10# j-unstable https://xenbits/unstable/SUPPORT.html 11# j-4.11 https://xenbits/4.11/SUPPORT.html 12# or equivalent 13 14use strict; 15use JSON; 16use Tie::IxHash; 17use IO::File; 18use CGI qw(escapeHTML); 19use Data::Dumper; 20use POSIX; 21 22#---------- accumulating input/output ---------- 23 24# This combines information from all of the input files. 25 26sub new_sectlist () { { } }; 27our $toplevel_sectlist = new_sectlist(); 28# an $sectlist is 29# { } nothing seen yet 30# a tied hashref something seen 31# (tied $sectlist) is an object of type Tie::IxHash 32# $sectlist->{KEY} a $sectnode: 33# $sectlist->{KEY}{Status}[VI] = absent or string or markdown content 34# $sectlist->{KEY}{Children} = a further $sectlist 35# $sectlist->{KEY}{Key} = KEY 36# $sectlist->{KEY}{RealSectNode} = us, or our parent 37# $sectlist->{KEY}{RealSectNode}{HasCaveat}[VI] = trueish iff other in a Para 38# $sectlist->{KEY}{RealInSect} = containing real section in @insections, so 39# $sectlist->{KEY}{RealInSect}{HasDescription} = VI for some Emph in Para 40# $sectlist->{KEY}{RealInSect}{Anchor} = value for < id="" > in the pandoc html 41# A $sectnode represents a single section from the original markdown 42# document. Its subsections are in Children. 43# 44# Also, the input syntax: 45# Status, something or other: Supported 46# is treated as a $sectnode, is as if it were a subsection - 47# one called `something or other'. That is not a `real' section. 48# 49# KEY is the Anchor, or derived from the `something or other'. 50# It is used to match up identical features in different versions. 51 52#---------- state for this input file ---------- 53 54our $version_index; 55our @version_urls; 56 57our @insections; 58# $insections[]{Key} = string 59# $insections[]{Headline} = markdown content 60# these next are only defined for real sections, not Status elements 61# $insections[]{Anchor} = string 62# $insections[]{HasDescription} VI, likewise 63 64our $had_unknown; 65our $had_feature; 66# adding new variable ? it must be reset in r_toplevel 67 68#---------- parsing ---------- 69 70sub find_current_sectnode () { 71 die unless @insections; 72 73 my $sectnode; 74 my $realinsect; 75 my $realsectnode; 76 foreach my $s (@insections) { 77 my $sectlist = $sectnode 78 ? $sectnode->{Children} : $toplevel_sectlist; 79 my $key = $s->{Key}; 80 $realinsect = $s if $s->{Anchor}; 81 tie %$sectlist, 'Tie::IxHash' unless tied %$sectlist; 82#print STDERR "FIND_CURRENT_SECTNODE ", Dumper($s); 83 $sectlist->{$key} //= 84 { 85 Children => new_sectlist(), 86 Headline => $s->{Headline}, 87 Key => $key, 88 RealInSect => $realinsect, 89 HasCaveat => [], 90 }; 91 $sectnode = $sectlist->{$key}; 92 $realsectnode = $sectnode if $s->{Anchor}; 93 $sectnode->{RealSectNode} = $realsectnode; 94 } 95 die unless $sectnode; 96 return $sectnode; 97} 98 99sub ri_Header { 100 my ($c) = @_; 101 my ($level, $infos, $hl) = @$c; 102#print STDERR 'RI_HEADER ', Dumper($c, \@c); 103 my ($id) = @$infos; 104 die unless $level >= 1; 105 die unless $level-2 <= $#insections; 106 $#insections = $level-2; 107 push @insections, 108 { 109 Key => $id, 110 Anchor => $id, 111 Headline => $hl, 112 HasDescription => undef, 113 }; 114#print STDERR Dumper(\@insections); 115 $had_feature = 0; 116} 117 118sub ri_Para { 119 return unless @insections; 120 my $insection = $insections[$#insections]; 121# print DEBUG "ri_Para ", 122# Dumper($version_index, $had_feature, $insection); 123 124 if ($had_feature) { 125 my $sectnode = find_current_sectnode(); 126 $sectnode->{RealSectNode}{HasCaveat}[$version_index] = 1; 127 } else { 128 $insection->{HasDescription} //= $version_index; 129 } 130}; 131 132sub parse_feature_entry ($) { 133 my ($value) = @_; 134 135 $had_feature = 1; 136 my $sectnode = find_current_sectnode(); 137 $sectnode->{Status}[$version_index] = $value; 138} 139 140sub descr2key ($) { 141 my ($descr) = @_; 142 143 die unless @insections; 144 my $insection = $insections[$#insections]; 145 146 my $key = lc $descr; 147 $key =~ y/ /-/; 148 $key =~ y/-0-9A-Za-z//cd; 149 $key = $insection->{Anchor}.'--'.$key; 150 return $key; 151} 152 153sub ri_CodeBlock { 154 my ($c) = @_; 155 my ($infos, $text) = @$c; 156 157 if ($text =~ m{^(?: Functional\ completeness 158 | Functional\ stability 159 | Interface\ stability 160 | Security\ supported ) \:}x) { 161 # ignore this 162 return; 163 } 164 die "$had_unknown / $text ?" if $had_unknown; 165 166 my $toplevel = $text =~ m{^Xen-Version:}; 167 168 foreach my $l (split /\n/, $text) { 169 $l =~ s/\s*$//; 170 next unless $l =~ m/\S/; 171 172 my ($descr, $value) = 173 $toplevel 174 ? $l =~ m{^([A-Z][-A-Z0-9a-z]+)\:\s+(\S.*)$} 175 : $l =~ m{^(?:Status|Supported)(?:\,\s*([^:]+))?\:\s+(\S.*)$} 176 or die ("$text\n^ cannot parse status codeblock line:". 177 ($toplevel and 'top'). 178 "\n$l\n ?"); 179 180 if (length $descr) { 181 push @insections, 182 { 183 Key => descr2key($descr), 184 Headline => [{ t => 'Str', c => $descr }], 185 }; 186 } 187 parse_feature_entry $value; 188 if (length $descr) { 189 pop @insections; 190 } 191 } 192} 193 194sub ri_DefinitionList { 195 my ($c) = @_; 196 foreach my $defent (@$c) { 197 my ($term, $defns) = @$defent; 198 my $descr = 199 join ' ', 200 map { $_->{c} } 201 grep { $_->{t} eq 'Str' } 202 @$term; 203 push @insections, 204 { 205 Key => descr2key($descr), 206 Headline => $term, 207 }; 208 die "multiple definitions in definition list definition" 209 if @$defns > 1; 210 my $defn = $defns->[0]; 211 die "multiple paras in definition list definition" 212 if @$defn > 1; 213 my $defnp = $defn->[0]; 214 die "only understand plain definition not $defnp->{t} ?" 215 unless $defnp->{t} eq 'Plain'; 216 parse_feature_entry $defnp->{c}; 217 pop @insections; 218 } 219} 220 221sub ri_BulletList { 222 # Assume a paragraph introduce this bullet list, which would mean that 223 # ri_Para() has already been called, and there's nothing else to do about 224 # the caveat. 225 return; 226} 227 228sub process_unknown { 229 my ($c, $e) = @_; 230 $had_unknown = Dumper($e); 231} 232 233sub r_content ($) { 234 my ($i) = @_; 235 foreach my $e (@$i) { 236 my $f = ${*::}{"ri_$e->{t}"}; 237 $f //= \&process_unknown; 238 $f->($e->{c}, $e); 239 } 240} 241 242our $pandoc_toplevel_constructor; 243 244sub r_toplevel ($) { 245 my ($i) = @_; 246 247 die unless defined $version_index; 248 249 @insections = (); 250 $had_unknown = undef; 251 $had_feature = undef; 252 253 # Pandoc's JSON output changed some time between 1.17.2 (stretch) 254 # and 2.2.1 (buster). I can find no documentation about this 255 # change or about the compatibility rules. (It seems that 256 # processing the parse tree *is* supported upstream: they offer 257 # many libraries to do this inside the pandoc process.) 258 # Empirically, what has changed is just the top level structure. 259 # Also pandoc wants the same structure back that it spat out, 260 # when we ask it to format snippets. 261 262 my $blocks; 263 if (ref $i eq 'ARRAY') { 264 $pandoc_toplevel_constructor = sub { 265 my ($blocks) = @_; 266 return [ 267 { unMeta => { } }, 268 $blocks, 269 ]; 270 }; 271 foreach my $e (@$i) { 272 next unless ref $e eq 'ARRAY'; 273 r_content $e; 274 } 275 } elsif (ref $i eq 'HASH') { 276 my $api_version = $i->{'pandoc-api-version'}; 277 $pandoc_toplevel_constructor = sub { 278 my ($blocks) = @_; 279 return { 280 blocks => $blocks, 281 meta => { }, 282 'pandoc-api-version' => $api_version, 283 }; 284 }; 285 r_content $i->{blocks}; 286 } else { 287 die; 288 } 289} 290 291sub read_inputs () { 292 $version_index = 0; 293 294 local $/; 295 undef $/; 296 297 while (my $f = shift @ARGV) { 298 push @version_urls, shift @ARGV; 299 eval { 300 open F, '<', $f or die $!; 301 my $input_toplevel = decode_json <F>; 302 r_toplevel $input_toplevel; 303 }; 304 die "$@\nwhile processing input file $f\n" if $@; 305 $version_index++; 306 } 307} 308 309#---------- reprocessing ---------- 310 311# variables generated by analyse_reprocess: 312our $maxdepth; 313 314sub pandoc2html_inline ($) { 315 my ($content) = @_; 316 317 my $json_fh = IO::File::new_tmpfile or die $!; 318 319 my $blocks = [{ t => 'Para', c => $content }]; 320 my $data = $pandoc_toplevel_constructor->($blocks); 321 my $j = to_json($data) or die $!; 322 print $json_fh $j; 323 flush $json_fh or die $!; 324 seek $json_fh,0,0 or die $!; 325 326 my $c = open PD, "-|" // die $!; 327 if (!$c) { 328 open STDIN, "<&", $json_fh; 329 exec qw(pandoc -f json) or die $!; 330 } 331 332 local $/; 333 undef $/; 334 my $html = <PD>; 335 $?=$!=0; 336 if (!close PD) { 337 eval { 338 seek $json_fh,0,0 or die $!; 339 open STDIN, '<&', $json_fh or die $!; 340 system 'json_pp'; 341 }; 342 die "$j \n $? $!"; 343 } 344 345 $html =~ s{^\<p\>}{} or die "$html ?"; 346 $html =~ s{\</p\>$}{} or die "$html ?"; 347 $html =~ s{\n$}{}; 348 return $html; 349} 350 351sub reprocess_sectlist ($$); 352 353sub reprocess_sectnode ($$) { 354 my ($sectnode, $d) = @_; 355 356 $sectnode->{Depth} = $d; 357 358 if ($sectnode->{Status}) { 359 $maxdepth = $d if $d > $maxdepth; 360 } 361 362 if ($sectnode->{Headline}) { 363# print STDERR Dumper($sectnode); 364 $sectnode->{Headline} = 365 pandoc2html_inline $sectnode->{Headline}; 366 } 367 368 reprocess_sectlist $sectnode->{Children}, $d; 369} 370 371sub reprocess_sectlist ($$) { 372 my ($sectlist, $d) = @_; 373 $d++; 374 375 foreach my $sectnode (values %$sectlist) { 376 reprocess_sectnode $sectnode, $d; 377 } 378} 379 380sub count_rows_sectlist ($); 381 382sub count_rows_sectnode ($) { 383 my ($sectnode) = @_; 384 my $rows = 0; 385 $sectnode->{RealInSect}{OwnRows} //= 0; 386 if ($sectnode->{Status}) { 387 $rows++; 388 $sectnode->{RealInSect}{OwnRows}++; 389 } 390 $rows += count_rows_sectlist $sectnode->{Children}; 391 $sectnode->{Rows} = $rows; 392 $sectnode->{RealInSect}{Rows} = $rows; 393 return $rows; 394} 395 396# Now we have 397# $sectnode->{Rows} 398# $sectnode->{RealInSect}{Rows} 399# $sectnode->{RealInSect}{OwnRows} 400 401sub count_rows_sectlist ($) { 402 my ($sectlist) = @_; 403 my $rows = 0; 404 foreach my $sectnode (values %$sectlist) { 405 $rows += count_rows_sectnode $sectnode; 406 } 407 return $rows; 408} 409 410# After reprocess_sectlist, 411# ->{Headline} is in html 412# ->{Status} is (still) string or markdown content 413 414sub analyse_reprocess () { 415 $maxdepth = 0; 416 reprocess_sectlist $toplevel_sectlist, 0; 417} 418 419#---------- output ---------- 420 421sub o { print @_ or die $!; } 422 423our @pending_headings; 424 425sub docref_a ($$) { 426 my ($i, $realinsect) = @_; 427 return sprintf '<a href="%s#%s">', 428 $version_urls[$i], $realinsect->{Anchor}; 429} 430 431sub write_output_row ($) { 432 my ($sectnode) = @_; 433# print STDERR 'WOR ', Dumper($d, $sectnode); 434 o('<tr>'); 435 my $span = sub { 436 my ($rowcol, $n) = @_; 437 o(sprintf ' %sspan="%d"', $rowcol, $n) if $n != 1; 438 }; 439 # This is all a bit tricky because (i) the input is hierarchical 440 # with variable depth, whereas the output has to have a fixed 441 # number of heading columns on the LHS; (ii) the HTML 442 # colspan/rowspan system means that when we are writing out, we 443 # have to not write table elements for table entries which have 444 # already been written with a span instruction that covers what we 445 # would write now. 446 while (my $heading = shift @pending_headings) { 447 o('<th valign="top"'); 448 o(sprintf ' id="%s"', $heading->{Key}); 449 $span->('row', $heading->{Rows}); 450 $span->('col', $maxdepth - $heading->{Depth} + 1) 451 if !%{ $heading->{Children} }; 452 o(' align="left">'); 453 my $end_a = ''; 454 my $desc_i = $heading->{RealInSect}{HasDescription}; 455 if (defined $desc_i) { 456 o(docref_a $desc_i, $heading->{RealInSect}); 457 $end_a= '</a>'; 458 } 459 o($heading->{Headline}); 460 o($end_a); 461 o('</th>'); 462 } 463 if (%{ $sectnode->{Children} }) { 464 # we suppressed the colspan above, but we do need to make the gap 465 my $n = $maxdepth - $sectnode->{Depth}; 466 die 'XX '. Dumper($n, $sectnode) if $n<0; 467 if ($n) { 468 o('<td'); 469 $span->('col', $n); 470 o('></td>'); 471 } 472 } 473 for (my $i=0; $i<@version_urls; $i++) { 474 my $st = $sectnode->{Status}[$i]; 475 476 my $colspan = $sectnode->{RealInSect}{ColSpan}[$i]; 477 my $nextcell = ''; 478 if (!defined $colspan) { # first row of this RealInSect 479 $colspan= ' colspan="2"'; 480 if ($sectnode->{RealSectNode}{HasCaveat}[$i] && $st 481 && $sectnode->{RealInSect}{Anchor}) { 482 my $rows = $sectnode->{RealInSect}{OwnRows}; 483 $nextcell = '<td'; 484 $nextcell .= sprintf ' rowspan=%d', $rows if $rows>1; 485 $nextcell .= '>'; 486 $nextcell .= docref_a $i, $sectnode->{RealInSect}; 487 $nextcell .= '[*]</a>'; 488 $nextcell .= '</td>'; 489 $colspan = ''; 490 } 491 $sectnode->{RealInSect}{ColSpan}[$i] = $colspan; 492 } 493 494 $st //= '-'; 495 o("\n<td$colspan>"); 496 my $end_a = ''; 497 if ($sectnode->{Key} eq 'release-support--xen-version') { 498 o(sprintf '<a href="%s">', $version_urls[$i]); 499 $end_a = '</a>'; 500 } 501 if (ref $st) { 502 $st = pandoc2html_inline $st; 503 } else { 504 $st = escapeHTML($st); 505 } 506 o($st); 507 o($end_a); 508 o('</td>'); 509 o($nextcell); 510 } 511 o("</tr>\n"); 512} 513 514sub write_output_sectlist ($); 515sub write_output_sectlist ($) { 516 my ($sectlist) = @_; 517 foreach my $key (keys %$sectlist) { 518 my $sectnode = $sectlist->{$key}; 519 push @pending_headings, $sectnode; 520 write_output_row $sectnode if $sectnode->{Status}; 521 write_output_sectlist $sectnode->{Children}; 522 } 523} 524 525sub write_output () { 526 o('<table rules="all">'); 527 write_output_sectlist $toplevel_sectlist; 528 o('</table>'); 529} 530 531#---------- main program ---------- 532 533open DEBUG, '>', '/dev/null' or die $!; 534if (@ARGV && $ARGV[0] eq '-D') { 535 shift @ARGV; 536 open DEBUG, '>&2' or die $!; 537} 538 539die unless @ARGV; 540die if $ARGV[0] =~ m/^-/; 541die if @ARGV % 2; 542 543read_inputs(); 544 545#use Data::Dumper; 546#print DEBUG Dumper($toplevel_sectlist); 547 548analyse_reprocess(); 549# Now Headline is in HTML 550 551count_rows_sectlist($toplevel_sectlist); 552 553#use Data::Dumper; 554print DEBUG Dumper($toplevel_sectlist); 555 556write_output(); 557