1#!/usr/bin/perl -w
2#
3# Written with reference to pandoc_markdown from Debian jessie
4# We require atx-style headers
5#
6# usage:
7#   pandoc -t json SUPPORT.md >j-unstable
8#   git cat-file blob origin/staging-4.11:SUPPORT.md | pandoc -t json >j-4.11
9#   docs/parse-support-md \
10#            j-unstable https://xenbits/unstable/SUPPORT.html
11#            j-4.11 https://xenbits/4.11/SUPPORT.html
12# or equivalent
13
14use strict;
15use JSON;
16use Tie::IxHash;
17use IO::File;
18use CGI qw(escapeHTML);
19use Data::Dumper;
20use POSIX;
21
22#---------- accumulating input/output ----------
23
24# This combines information from all of the input files.
25
26sub new_sectlist () { { } };
27our $toplevel_sectlist = new_sectlist();
28# an $sectlist is
29#   { }                 nothing seen yet
30#   a tied hashref      something seen
31# (tied $sectlist)    is an object of type Tie::IxHash
32# $sectlist->{KEY} a $sectnode:
33# $sectlist->{KEY}{Status}[VI] = absent or string or markdown content
34# $sectlist->{KEY}{Children} = a further $sectlist
35# $sectlist->{KEY}{Key} = KEY
36# $sectlist->{KEY}{RealSectNode} = us, or our parent
37# $sectlist->{KEY}{RealSectNode}{HasCaveat}[VI] = trueish iff other in a Para
38# $sectlist->{KEY}{RealInSect} = containing real section in @insections, so
39# $sectlist->{KEY}{RealInSect}{HasDescription} = VI for some Emph in Para
40# $sectlist->{KEY}{RealInSect}{Anchor} = value for < id="" > in the pandoc html
41# A $sectnode represents a single section from the original markdown
42# document.  Its subsections are in Children.
43#
44# Also, the input syntax:
45#    Status, something or other: Supported
46# is treated as a $sectnode, is as if it were a subsection -
47# one called `something or other'.  That is not a `real' section.
48#
49# KEY is the Anchor, or derived from the `something or other'.
50# It is used to match up identical features in different versions.
51
52#---------- state for this input file ----------
53
54our $version_index;
55our @version_urls;
56
57our @insections;
58# $insections[]{Key} = string
59# $insections[]{Headline} = markdown content
60# these next are only defined for real sections, not Status elements
61# $insections[]{Anchor} = string
62# $insections[]{HasDescription} VI, likewise
63
64our $had_unknown;
65our $had_feature;
66# adding new variable ?  it must be reset in r_toplevel
67
68#---------- parsing ----------
69
70sub find_current_sectnode () {
71    die unless @insections;
72
73    my $sectnode;
74    my $realinsect;
75    my $realsectnode;
76    foreach my $s (@insections) {
77        my $sectlist = $sectnode
78            ? $sectnode->{Children} : $toplevel_sectlist;
79        my $key = $s->{Key};
80        $realinsect = $s if $s->{Anchor};
81        tie %$sectlist, 'Tie::IxHash' unless tied %$sectlist;
82#print STDERR "FIND_CURRENT_SECTNODE ", Dumper($s);
83        $sectlist->{$key} //=
84            {
85             Children => new_sectlist(),
86             Headline => $s->{Headline},
87             Key => $key,
88             RealInSect => $realinsect,
89             HasCaveat => [],
90            };
91        $sectnode = $sectlist->{$key};
92        $realsectnode = $sectnode if $s->{Anchor};
93        $sectnode->{RealSectNode} = $realsectnode;
94    }
95    die unless $sectnode;
96    return $sectnode;
97}
98
99sub ri_Header {
100    my ($c) = @_;
101    my ($level, $infos, $hl) = @$c;
102#print STDERR 'RI_HEADER ', Dumper($c, \@c);
103    my ($id) = @$infos;
104    die unless $level >= 1;
105    die unless $level-2 <= $#insections;
106    $#insections = $level-2;
107    push @insections,
108        {
109         Key => $id,
110         Anchor => $id,
111         Headline => $hl,
112         HasDescription => undef,
113        };
114#print STDERR Dumper(\@insections);
115    $had_feature = 0;
116}
117
118sub ri_Para {
119    return unless @insections;
120    my $insection = $insections[$#insections];
121#    print DEBUG "ri_Para ",
122#        Dumper($version_index, $had_feature, $insection);
123
124    if ($had_feature) {
125        my $sectnode = find_current_sectnode();
126        $sectnode->{RealSectNode}{HasCaveat}[$version_index] = 1;
127    } else {
128        $insection->{HasDescription} //= $version_index;
129    }
130};
131
132sub parse_feature_entry ($) {
133    my ($value) = @_;
134
135    $had_feature = 1;
136    my $sectnode = find_current_sectnode();
137    $sectnode->{Status}[$version_index] = $value;
138}
139
140sub descr2key ($) {
141    my ($descr) = @_;
142
143    die unless @insections;
144    my $insection = $insections[$#insections];
145
146    my $key = lc $descr;
147    $key =~ y/ /-/;
148    $key =~ y/-0-9A-Za-z//cd;
149    $key = $insection->{Anchor}.'--'.$key;
150    return $key;
151}
152
153sub ri_CodeBlock {
154    my ($c) = @_;
155    my ($infos, $text) = @$c;
156
157    if ($text =~ m{^(?: Functional\ completeness
158                   | Functional\ stability
159                   | Interface\ stability
160                   | Security\ supported ) \:}x) {
161        # ignore this
162        return;
163    }
164    die "$had_unknown / $text ?" if $had_unknown;
165
166    my $toplevel = $text =~ m{^Xen-Version:};
167
168    foreach my $l (split /\n/, $text) {
169        $l =~ s/\s*$//;
170        next unless $l =~ m/\S/;
171
172        my ($descr, $value) =
173            $toplevel
174            ? $l =~ m{^([A-Z][-A-Z0-9a-z]+)\:\s+(\S.*)$}
175            : $l =~ m{^(?:Status|Supported)(?:\,\s*([^:]+))?\:\s+(\S.*)$}
176            or die ("$text\n^ cannot parse status codeblock line:".
177                    ($toplevel and 'top').
178                    "\n$l\n ?");
179
180        if (length $descr) {
181            push @insections,
182                {
183                 Key => descr2key($descr),
184                 Headline => [{ t => 'Str', c => $descr }],
185                };
186        }
187        parse_feature_entry $value;
188        if (length $descr) {
189            pop @insections;
190        }
191    }
192}
193
194sub ri_DefinitionList {
195    my ($c) = @_;
196    foreach my $defent (@$c) {
197        my ($term, $defns) = @$defent;
198        my $descr =
199            join ' ',
200            map { $_->{c} }
201            grep { $_->{t} eq 'Str' }
202            @$term;
203        push @insections,
204            {
205             Key => descr2key($descr),
206             Headline => $term,
207            };
208        die "multiple definitions in definition list definition"
209            if @$defns > 1;
210        my $defn = $defns->[0];
211        die "multiple paras in definition list definition"
212            if @$defn > 1;
213        my $defnp = $defn->[0];
214        die "only understand plain definition not $defnp->{t} ?"
215            unless $defnp->{t} eq 'Plain';
216        parse_feature_entry $defnp->{c};
217        pop @insections;
218    }
219}
220
221sub ri_BulletList {
222    # Assume a paragraph introduce this bullet list, which would mean that
223    # ri_Para() has already been called, and there's nothing else to do about
224    # the caveat.
225    return;
226}
227
228sub process_unknown {
229    my ($c, $e) = @_;
230    $had_unknown = Dumper($e);
231}
232
233sub r_content ($) {
234    my ($i) = @_;
235    foreach my $e (@$i) {
236        my $f = ${*::}{"ri_$e->{t}"};
237        $f //= \&process_unknown;
238        $f->($e->{c}, $e);
239    }
240}
241
242our $pandoc_toplevel_constructor;
243
244sub r_toplevel ($) {
245    my ($i) = @_;
246
247    die unless defined $version_index;
248
249    @insections = ();
250    $had_unknown = undef;
251    $had_feature = undef;
252
253    # Pandoc's JSON output changed some time between 1.17.2 (stretch)
254    # and 2.2.1 (buster).  I can find no documentation about this
255    # change or about the compatibility rules.  (It seems that
256    # processing the parse tree *is* supported upstream: they offer
257    # many libraries to do this inside the pandoc process.)
258    # Empirically, what has changed is just the top level structure.
259    # Also pandoc wants the same structure back that it spat out,
260    # when we ask it to format snippets.
261
262    my $blocks;
263    if (ref $i eq 'ARRAY') {
264	$pandoc_toplevel_constructor = sub {
265	    my ($blocks) = @_;
266	    return [
267		    { unMeta => { } },
268		    $blocks,
269		   ];
270	};
271	foreach my $e (@$i) {
272	    next unless ref $e eq 'ARRAY';
273	    r_content $e;
274	}
275    } elsif (ref $i eq 'HASH') {
276	my $api_version = $i->{'pandoc-api-version'};
277	$pandoc_toplevel_constructor = sub {
278	    my ($blocks) = @_;
279	    return {
280		    blocks => $blocks,
281		    meta => { },
282		    'pandoc-api-version' => $api_version,
283		   };
284	};
285	r_content $i->{blocks};
286    } else {
287	die;
288    }
289}
290
291sub read_inputs () {
292    $version_index = 0;
293
294    local $/;
295    undef $/;
296
297    while (my $f = shift @ARGV) {
298        push @version_urls, shift @ARGV;
299        eval {
300            open F, '<', $f or die $!;
301            my $input_toplevel = decode_json <F>;
302            r_toplevel $input_toplevel;
303        };
304        die "$@\nwhile processing input file $f\n" if $@;
305        $version_index++;
306    }
307}
308
309#---------- reprocessing ----------
310
311# variables generated by analyse_reprocess:
312our $maxdepth;
313
314sub pandoc2html_inline ($) {
315    my ($content) = @_;
316
317    my $json_fh = IO::File::new_tmpfile or die $!;
318
319    my $blocks = [{ t => 'Para', c => $content }];
320    my $data = $pandoc_toplevel_constructor->($blocks);
321    my $j = to_json($data) or die $!;
322    print $json_fh $j;
323    flush $json_fh or die $!;
324    seek $json_fh,0,0 or die $!;
325
326    my $c = open PD, "-|" // die $!;
327    if (!$c) {
328        open STDIN, "<&", $json_fh;
329        exec qw(pandoc -f json) or die $!;
330    }
331
332    local $/;
333    undef $/;
334    my $html = <PD>;
335    $?=$!=0;
336    if (!close PD) {
337        eval {
338            seek $json_fh,0,0 or die $!;
339            open STDIN, '<&', $json_fh or die $!;
340            system 'json_pp';
341        };
342        die "$j \n $? $!";
343    }
344
345    $html =~ s{^\<p\>}{} or die "$html ?";
346    $html =~ s{\</p\>$}{} or die "$html ?";
347    $html =~ s{\n$}{};
348    return $html;
349}
350
351sub reprocess_sectlist ($$);
352
353sub reprocess_sectnode ($$) {
354    my ($sectnode, $d) = @_;
355
356    $sectnode->{Depth} = $d;
357
358    if ($sectnode->{Status}) {
359        $maxdepth = $d if $d > $maxdepth;
360    }
361
362    if ($sectnode->{Headline}) {
363#            print STDERR Dumper($sectnode);
364        $sectnode->{Headline} =
365            pandoc2html_inline $sectnode->{Headline};
366    }
367
368    reprocess_sectlist $sectnode->{Children}, $d;
369}
370
371sub reprocess_sectlist ($$) {
372    my ($sectlist, $d) = @_;
373    $d++;
374
375    foreach my $sectnode (values %$sectlist) {
376        reprocess_sectnode $sectnode, $d;
377    }
378}
379
380sub count_rows_sectlist ($);
381
382sub count_rows_sectnode ($) {
383    my ($sectnode) = @_;
384    my $rows = 0;
385    $sectnode->{RealInSect}{OwnRows} //= 0;
386    if ($sectnode->{Status}) {
387        $rows++;
388        $sectnode->{RealInSect}{OwnRows}++;
389    }
390    $rows += count_rows_sectlist $sectnode->{Children};
391    $sectnode->{Rows} = $rows;
392    $sectnode->{RealInSect}{Rows} = $rows;
393    return $rows;
394}
395
396# Now we have
397#   $sectnode->{Rows}
398#   $sectnode->{RealInSect}{Rows}
399#   $sectnode->{RealInSect}{OwnRows}
400
401sub count_rows_sectlist ($) {
402    my ($sectlist) = @_;
403    my $rows = 0;
404    foreach my $sectnode (values %$sectlist) {
405        $rows += count_rows_sectnode $sectnode;
406    }
407    return $rows;
408}
409
410# After reprocess_sectlist,
411#    ->{Headline}   is in html
412#    ->{Status}     is (still) string or markdown content
413
414sub analyse_reprocess () {
415    $maxdepth = 0;
416    reprocess_sectlist $toplevel_sectlist, 0;
417}
418
419#---------- output ----------
420
421sub o { print @_ or die $!; }
422
423our @pending_headings;
424
425sub docref_a ($$) {
426    my ($i, $realinsect) = @_;
427    return sprintf '<a href="%s#%s">',
428        $version_urls[$i], $realinsect->{Anchor};
429}
430
431sub write_output_row ($) {
432    my ($sectnode) = @_;
433#    print STDERR 'WOR ', Dumper($d, $sectnode);
434    o('<tr>');
435    my $span = sub {
436        my ($rowcol, $n) = @_;
437        o(sprintf ' %sspan="%d"', $rowcol, $n) if $n != 1;
438    };
439    # This is all a bit tricky because (i) the input is hierarchical
440    # with variable depth, whereas the output has to have a fixed
441    # number of heading columns on the LHS; (ii) the HTML
442    # colspan/rowspan system means that when we are writing out, we
443    # have to not write table elements for table entries which have
444    # already been written with a span instruction that covers what we
445    # would write now.
446    while (my $heading = shift @pending_headings) {
447        o('<th valign="top"');
448        o(sprintf ' id="%s"', $heading->{Key});
449        $span->('row', $heading->{Rows});
450        $span->('col', $maxdepth - $heading->{Depth} + 1)
451            if !%{ $heading->{Children} };
452        o(' align="left">');
453        my $end_a = '';
454        my $desc_i = $heading->{RealInSect}{HasDescription};
455        if (defined $desc_i) {
456            o(docref_a $desc_i, $heading->{RealInSect});
457            $end_a= '</a>';
458        }
459        o($heading->{Headline});
460        o($end_a);
461        o('</th>');
462    }
463    if (%{ $sectnode->{Children} }) {
464        # we suppressed the colspan above, but we do need to make the gap
465        my $n = $maxdepth - $sectnode->{Depth};
466        die 'XX '. Dumper($n, $sectnode) if $n<0;
467        if ($n) {
468            o('<td');
469            $span->('col', $n);
470            o('></td>');
471        }
472    }
473    for (my $i=0; $i<@version_urls; $i++) {
474        my $st = $sectnode->{Status}[$i];
475
476        my $colspan = $sectnode->{RealInSect}{ColSpan}[$i];
477        my $nextcell = '';
478        if (!defined $colspan) { # first row of this RealInSect
479            $colspan= ' colspan="2"';
480            if ($sectnode->{RealSectNode}{HasCaveat}[$i] && $st
481                && $sectnode->{RealInSect}{Anchor}) {
482                my $rows = $sectnode->{RealInSect}{OwnRows};
483                $nextcell = '<td';
484                $nextcell .= sprintf ' rowspan=%d', $rows if $rows>1;
485                $nextcell .= '>';
486                $nextcell .= docref_a $i, $sectnode->{RealInSect};
487                $nextcell .= '[*]</a>';
488                $nextcell .= '</td>';
489                $colspan = '';
490            }
491            $sectnode->{RealInSect}{ColSpan}[$i] = $colspan;
492        }
493
494        $st //= '-';
495        o("\n<td$colspan>");
496        my $end_a = '';
497        if ($sectnode->{Key} eq 'release-support--xen-version') {
498            o(sprintf '<a href="%s">', $version_urls[$i]);
499            $end_a = '</a>';
500        }
501        if (ref $st) {
502            $st = pandoc2html_inline $st;
503        } else {
504            $st = escapeHTML($st);
505        }
506        o($st);
507        o($end_a);
508        o('</td>');
509        o($nextcell);
510    }
511    o("</tr>\n");
512}
513
514sub write_output_sectlist ($);
515sub write_output_sectlist ($) {
516    my ($sectlist) = @_;
517    foreach my $key (keys %$sectlist) {
518        my $sectnode = $sectlist->{$key};
519        push @pending_headings, $sectnode;
520        write_output_row $sectnode if $sectnode->{Status};
521        write_output_sectlist $sectnode->{Children};
522    }
523}
524
525sub write_output () {
526    o('<table rules="all">');
527    write_output_sectlist $toplevel_sectlist;
528    o('</table>');
529}
530
531#---------- main program ----------
532
533open DEBUG, '>', '/dev/null' or die $!;
534if (@ARGV && $ARGV[0] eq '-D') {
535    shift @ARGV;
536    open DEBUG, '>&2' or die $!;
537}
538
539die unless @ARGV;
540die if $ARGV[0] =~ m/^-/;
541die if @ARGV % 2;
542
543read_inputs();
544
545#use Data::Dumper;
546#print DEBUG Dumper($toplevel_sectlist);
547
548analyse_reprocess();
549# Now Headline is in HTML
550
551count_rows_sectlist($toplevel_sectlist);
552
553#use Data::Dumper;
554print DEBUG Dumper($toplevel_sectlist);
555
556write_output();
557