1#!/usr/bin/env python3
2
3"""Assemble Mbed TLS change log entries into the change log file.
4
5Add changelog entries to the first level-2 section.
6Create a new level-2 section for unreleased changes if needed.
7Remove the input files unless --keep-entries is specified.
8
9In each level-3 section, entries are sorted in chronological order
10(oldest first). From oldest to newest:
11* Merged entry files are sorted according to their merge date (date of
12  the merge commit that brought the commit that created the file into
13  the target branch).
14* Committed but unmerged entry files are sorted according to the date
15  of the commit that adds them.
16* Uncommitted entry files are sorted according to their modification time.
17
18You must run this program from within a git working directory.
19"""
20
21# Copyright The Mbed TLS Contributors
22# SPDX-License-Identifier: Apache-2.0
23#
24# Licensed under the Apache License, Version 2.0 (the "License"); you may
25# not use this file except in compliance with the License.
26# You may obtain a copy of the License at
27#
28# http://www.apache.org/licenses/LICENSE-2.0
29#
30# Unless required by applicable law or agreed to in writing, software
31# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
32# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33# See the License for the specific language governing permissions and
34# limitations under the License.
35
36import argparse
37from collections import OrderedDict, namedtuple
38import datetime
39import functools
40import glob
41import os
42import re
43import subprocess
44import sys
45
class InputFormatError(Exception):
    """Error in the content of an input file, tagged with its location.

    The exception text has the form ``FILENAME:LINE_NUMBER: DETAIL``, where
    DETAIL is `message` expanded with ``str.format`` using the remaining
    positional and keyword arguments.
    """

    def __init__(self, filename, line_number, message, *args, **kwargs):
        detail = message.format(*args, **kwargs)
        location = '{}:{}'.format(filename, line_number)
        super().__init__('{}: {}'.format(location, detail))
51
class CategoryParseError(Exception):
    """Failure to split a text into categories.

    Attributes:
    * ``line_offset``: line position of the problem, relative to the start
      of the text that was being parsed.
    * ``error_message``: description of the problem.

    The exception text is ``LINE_OFFSET: ERROR_MESSAGE``.
    """

    def __init__(self, line_offset, error_message):
        super().__init__('{}: {}'.format(line_offset, error_message))
        self.error_message = error_message
        self.line_offset = line_offset
57
class LostContent(Exception):
    """A line of an input file is missing from the generated output.

    The exception text is ``Lost content from FILENAME: "LINE"``.
    """

    def __init__(self, filename, line):
        super().__init__('Lost content from {}: "{}"'.format(filename, line))
62
# The category names we use in the changelog.
# If you edit this, update ChangeLog.d/README.md.
# The order matters: ChangeLog seeds its ordered category table from this
# tuple and writes the categories out in that order.
STANDARD_CATEGORIES = (
    'API changes',
    'Default behavior changes',
    'Requirement changes',
    'New deprecations',
    'Removals',
    'Features',
    'Security',
    'Bugfix',
    'Changes',
)

# The maximum line length for an entry, in characters.
# ChangeLog.add_categories_from_text rejects longer lines, except for lines
# that consist solely of a URL.
MAX_LINE_LENGTH = 80

# One category subsection as returned by ChangelogFormat.split_categories.
# The line fields are positions relative to the start of the text that was
# split (TextChangelogFormat counts the newlines preceding the position).
CategoryContent = namedtuple('CategoryContent', [
    'name', 'title_line', # Title text and line number of the title
    'body', 'body_line', # Body text and starting line number of the body
])
84
class ChangelogFormat:
    """Virtual class documenting how to write a changelog format class.

    A concrete format must override every class method below; the
    versions defined here only raise ``NotImplementedError``.
    """

    @classmethod
    def extract_top_version(cls, changelog_file_content):
        """Split out the top version section.

        If the top version is already released, create a new top
        version section for an unreleased version.

        Return ``(header, top_version_title, top_version_body, trailer)``
        where the "top version" is the existing top version section if it's
        for unreleased changes, and a newly created section otherwise.
        To assemble the changelog after modifying top_version_body,
        concatenate the four pieces.
        """
        raise NotImplementedError

    @classmethod
    def version_title_text(cls, version_title):
        """Return the text of a formatted version section title."""
        raise NotImplementedError

    @classmethod
    def split_categories(cls, version_body):
        """Split a changelog version section body into categories.

        Return a list of `CategoryContent`. In each element, the name is
        the category title without any formatting.
        """
        raise NotImplementedError

    @classmethod
    def format_category(cls, title, body):
        """Construct the text of a category section from its title and body."""
        raise NotImplementedError
121
class TextChangelogFormat(ChangelogFormat):
    """The traditional Mbed TLS changelog format.

    Version sections start with a line beginning with '='. Inside a version
    section, a category starts with a title line whose first character is
    a word character in column 0.
    """

    # Title used when a new section for unreleased changes must be created.
    _unreleased_version_text = '= mbed TLS x.x.x branch released xxxx-xx-xx'

    @classmethod
    def is_released_version(cls, title):
        """Return True if the version title does not carry a placeholder date.

        A title is considered unreleased when it contains a date-like field
        (``YYYY-MM-DD``) whose last character is still the placeholder 'x'.
        """
        # Look for an incomplete release date
        return not re.search(r'[0-9x]{4}-[0-9x]{2}-[0-9x]?x', title)

    # Group 1: the first line starting with '=' plus the blank lines that
    # follow it. Group 2: everything up to (not including) the next line
    # starting with '=', or up to the end of the text.
    _top_version_re = re.compile(r'(?:\A|\n)(=[^\n]*\n+)(.*?\n)(?:=|$)',
                                 re.DOTALL)

    @classmethod
    def extract_top_version(cls, changelog_file_content):
        """Split the changelog around its top version section.

        A version section starts with a line starting with '='.

        Return ``(header, top_version_title, top_version_body, trailer)``.
        If the top version is already released, the title is a freshly
        created "unreleased" title, the body is empty, and the existing
        top section becomes the start of the trailer.

        Raise ValueError if the content has no version section at all.
        """
        m = cls._top_version_re.search(changelog_file_content)
        if m is None:
            # Without this check the failure would surface as an opaque
            # AttributeError on None.
            raise ValueError('No version section (a line starting with "=" '
                             'followed by a body) found in the changelog')
        top_version_start = m.start(1)
        top_version_end = m.end(2)
        top_version_title = m.group(1)
        top_version_body = m.group(2)
        if cls.is_released_version(top_version_title):
            # Insert a new, empty section just before the released one.
            top_version_end = top_version_start
            top_version_title = cls._unreleased_version_text + '\n\n'
            top_version_body = ''
        return (changelog_file_content[:top_version_start],
                top_version_title, top_version_body,
                changelog_file_content[top_version_end:])

    @classmethod
    def version_title_text(cls, version_title):
        """Return the title text: the first line, without any newline."""
        # Drop everything from the first newline onwards. The subject string
        # is the third argument of re.sub and the flags must be passed by
        # keyword (the fourth positional argument is the count).
        return re.sub(r'\n.*', '', version_title, flags=re.DOTALL)

    _category_title_re = re.compile(r'(^\w.*)\n+', re.MULTILINE)

    @classmethod
    def split_categories(cls, version_body):
        """Split a version section body into its categories.

        A category title is a line with the title in column 0.

        Return a list of `CategoryContent`, empty if version_body is empty.
        Line positions are counts of newlines preceding the title or body
        within version_body. Each body ends with exactly one newline.

        Raise CategoryParseError if version_body does not start with a
        category title.
        """
        if not version_body:
            return []
        title_matches = list(cls._category_title_re.finditer(version_body))
        if not title_matches or title_matches[0].start() != 0:
            # There is junk before the first category.
            raise CategoryParseError(0, 'Junk found where category expected')
        title_starts = [m.start(1) for m in title_matches]
        body_starts = [m.end(0) for m in title_matches]
        # Each body extends up to the next title, or to the end of the text.
        body_ends = title_starts[1:] + [len(version_body)]
        bodies = [version_body[body_start:body_end].rstrip('\n') + '\n'
                  for (body_start, body_end) in zip(body_starts, body_ends)]
        title_lines = [version_body[:pos].count('\n') for pos in title_starts]
        body_lines = [version_body[:pos].count('\n') for pos in body_starts]
        return [CategoryContent(title_match.group(1), title_line,
                                body, body_line)
                for title_match, title_line, body, body_line
                in zip(title_matches, title_lines, bodies, body_lines)]

    @classmethod
    def format_category(cls, title, body):
        """Return the title line followed by the body and a blank line."""
        # `split_categories` ensures that each body ends with a newline.
        # Make sure that there is additionally a blank line between categories.
        if not body.endswith('\n\n'):
            body += '\n'
        return title + '\n' + body
182
class ChangeLog:
    """An Mbed TLS changelog.

    A changelog file consists of some header text followed by one or
    more version sections. The version sections are in reverse
    chronological order. Each version section consists of a title and a body.

    The body of a version section consists of zero or more category
    subsections. Each category subsection consists of a title and a body.

    A changelog entry file has the same format as the body of a version section.

    A `ChangelogFormat` object defines the concrete syntax of the changelog.
    Entry files must have the same format as the changelog file.
    """

    # Only accept dotted version numbers (e.g. "3.1", not "3").
    # Refuse ".x" in a version number where x is a letter: this indicates
    # a version that is not yet released. Something like "3.1a" is accepted.
    _version_number_re = re.compile(r'[0-9]+\.[0-9A-Za-z.]+')
    _incomplete_version_number_re = re.compile(r'.*\.[A-Za-z]')
    # A line consisting solely of a URL (exempt from the length limit).
    _only_url_re = re.compile(r'^\s*\w+://\S+\s*$')
    # A line containing something that looks like a URL.
    _has_url_re = re.compile(r'.*://.*')

    def add_categories_from_text(self, filename, line_offset,
                                 text, allow_unknown_category):
        """Parse a version section or entry file.

        Append the body of each category found in `text` to the matching
        entry of ``self.categories``.

        * ``filename``: name used in error messages.
        * ``line_offset``: line number, in ``filename``, of the first line
          of ``text``.
        * ``allow_unknown_category``: if false, a category that is not
          already known is an error. If true, such a category is added
          after the ones that are already known.

        Raise InputFormatError if the text cannot be split into categories,
        if a category is unknown and not allowed, or if a line that is not
        a lone URL is longer than MAX_LINE_LENGTH.
        """
        try:
            categories = self.format.split_categories(text)
        except CategoryParseError as e:
            raise InputFormatError(filename, line_offset + e.line_offset,
                                   e.error_message) from e
        for category in categories:
            if not allow_unknown_category and \
               category.name not in self.categories:
                raise InputFormatError(filename,
                                       line_offset + category.title_line,
                                       'Unknown category: "{}"',
                                       category.name)

            body_split = category.body.splitlines()

            for line_index, line in enumerate(body_split):
                if not self._only_url_re.match(line) and \
                   len(line) > MAX_LINE_LENGTH:
                    long_url_msg = '. URL exceeding length limit must be alone in its line.' \
                        if self._has_url_re.match(line) else ""
                    # body_line and line_index are both relative to `text`,
                    # so line_offset is needed to get a line number that is
                    # valid in the file.
                    raise InputFormatError(filename,
                                           line_offset + category.body_line
                                           + line_index,
                                           'Line is longer than allowed: '
                                           'Length {} (Max {}){}',
                                           len(line), MAX_LINE_LENGTH,
                                           long_url_msg)

            # Use get() so that an allowed unknown category starts a new
            # entry instead of failing with KeyError.
            self.categories[category.name] = \
                self.categories.get(category.name, '') + category.body

    def __init__(self, input_stream, changelog_format):
        """Create a changelog object.

        Populate the changelog object from the content of the file
        input_stream. The stream must have a ``name`` attribute, which is
        used in error messages.
        """
        self.format = changelog_format
        whole_file = input_stream.read()
        (self.header,
         self.top_version_title, top_version_body,
         self.trailer) = self.format.extract_top_version(whole_file)
        # Split the top version section into categories.
        # Seed the table with the standard categories so that they are
        # written out in the standard order.
        self.categories = OrderedDict()
        for category in STANDARD_CATEGORIES:
            self.categories[category] = ''
        # Line number of the first line of the top version body.
        offset = (self.header + self.top_version_title).count('\n') + 1
        self.add_categories_from_text(input_stream.name, offset,
                                      top_version_body, True)

    def add_file(self, input_stream):
        """Add changelog entries from a file.

        Only categories that are already known are accepted.
        """
        self.add_categories_from_text(input_stream.name, 1,
                                      input_stream.read(), False)

    def write(self, filename):
        """Write the changelog to the specified file.

        Categories with an empty body are omitted.
        """
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(self.header)
            out.write(self.top_version_title)
            for title, body in self.categories.items():
                if not body:
                    continue
                out.write(self.format.format_category(title, body))
            out.write(self.trailer)
275
276
@functools.total_ordering
class EntryFileSortKey:
    """This class defines an ordering on changelog entry files: older < newer.

    * Merged entry files are sorted according to their merge date (date of
      the merge commit that brought the commit that created the file into
      the target branch).
    * Committed but unmerged entry files are sorted according to the date
      of the commit that adds them.
    * Uncommitted entry files are sorted according to their modification time.

    This class assumes that the file is in a git working directory with
    the target branch checked out.
    """

    # Categories of files. A lower number is considered older.
    MERGED = 0
    COMMITTED = 1
    LOCAL = 2

    @staticmethod
    def creation_hash(filename):
        """Return the git commit id at which the given file was created.

        Return None if the file was never checked into git.
        """
        hashes = subprocess.check_output(['git', 'log', '--format=%H',
                                          '--follow',
                                          '--', filename])
        # Without re.MULTILINE, this matches the last line of the output.
        m = re.search('(.+)$', hashes.decode('ascii'))
        if not m:
            # The git output is empty. This means that the file was
            # never checked in.
            return None
        # The last commit in the log is the oldest one, which is when the
        # file was created.
        return m.group(0)

    @staticmethod
    def list_merges(some_hash, target, *options):
        """List merge commits from some_hash to target.

        Pass options to git to select which commits are included.

        Return a list of commit ids, empty if there is no such merge.
        """
        text = subprocess.check_output(['git', 'rev-list',
                                        '--merges', *options,
                                        '..'.join([some_hash, target])])
        # split() without argument yields [] for empty output, whereas
        # splitting on '\n' would yield a bogus [''] entry.
        return text.decode('ascii').split()

    @classmethod
    def merge_hash(cls, some_hash):
        """Return the git commit id at which the given commit was merged.

        Return None if the given commit was never merged.
        """
        target = 'HEAD'
        # List the merges from some_hash to the target in two ways.
        # The ancestry list is the ones that are both descendants of
        # some_hash and ancestors of the target.
        ancestry = frozenset(cls.list_merges(some_hash, target,
                                             '--ancestry-path'))
        # The first_parents list only contains merges that are directly
        # on the target branch. We want it in reverse order (oldest first).
        first_parents = cls.list_merges(some_hash, target,
                                        '--first-parent', '--reverse')
        # Look for the oldest merge commit that's both on the direct path
        # and directly on the target branch. That's the place where some_hash
        # was merged on the target branch. See
        # https://stackoverflow.com/questions/8475448/find-merge-commit-which-include-a-specific-commit
        for commit in first_parents:
            if commit in ancestry:
                return commit
        return None

    @staticmethod
    def commit_timestamp(commit_id):
        """Return the timestamp of the given commit as a naive UTC datetime."""
        text = subprocess.check_output(['git', 'show', '-s',
                                        '--format=%ct',
                                        commit_id])
        # Same value as the deprecated datetime.utcfromtimestamp().
        aware = datetime.datetime.fromtimestamp(int(text),
                                                datetime.timezone.utc)
        return aware.replace(tzinfo=None)

    @staticmethod
    def file_timestamp(filename):
        """Return the modification timestamp of the given file.

        The result is a naive datetime in local time. It is only compared
        with other file timestamps, since uncommitted files have a
        category of their own.
        """
        mtime = os.stat(filename).st_mtime
        return datetime.datetime.fromtimestamp(mtime)

    def __init__(self, filename):
        """Determine position of the file in the changelog entry order.

        This constructor returns an object that can be used with comparison
        operators, with `sort` and `sorted`, etc. Older entries are sorted
        before newer entries.
        """
        self.filename = filename
        creation_hash = self.creation_hash(filename)
        if not creation_hash:
            self.category = self.LOCAL
            self.datetime = self.file_timestamp(filename)
            return
        merge_hash = self.merge_hash(creation_hash)
        if not merge_hash:
            self.category = self.COMMITTED
            self.datetime = self.commit_timestamp(creation_hash)
            return
        self.category = self.MERGED
        self.datetime = self.commit_timestamp(merge_hash)

    def sort_key(self):
        """Return a concrete sort key for this entry file sort key object.

        ``ts1 < ts2`` is implemented as ``ts1.sort_key() < ts2.sort_key()``.
        The filename is the final tie-breaker, which makes the order total.
        """
        return (self.category, self.datetime, self.filename)

    def __eq__(self, other):
        if not isinstance(other, EntryFileSortKey):
            # Let Python try the reflected operation or fall back to
            # identity instead of failing on a missing sort_key attribute.
            return NotImplemented
        return self.sort_key() == other.sort_key()

    def __lt__(self, other):
        if not isinstance(other, EntryFileSortKey):
            return NotImplemented
        return self.sort_key() < other.sort_key()
398
399
def check_output(generated_output_file, main_input_file, merged_files):
    """Make sanity checks on the generated output.

    The intent of these sanity checks is to have reasonable confidence
    that no content has been lost.

    The sanity check is that every line that is present in an input file
    is also present in an output file. This is not perfect but good enough
    for now.

    Raise LostContent for the first input line that is missing from the
    generated output.
    """
    # Use context managers so that every file is closed promptly, including
    # when LostContent is raised.
    with open(generated_output_file, 'r', encoding='utf-8') as generated:
        generated_output = set(generated)
    with open(main_input_file, 'r', encoding='utf-8') as main_input:
        for line in main_input:
            if line not in generated_output:
                raise LostContent('original file', line)
    for merged_file in merged_files:
        with open(merged_file, 'r', encoding='utf-8') as entries:
            for line in entries:
                if line not in generated_output:
                    raise LostContent(merged_file, line)
418
def finish_output(changelog, output_file, input_file, merged_files):
    """Write the changelog to the output file.

    The input file and the list of merged files are used only for sanity
    checks on the output.

    If the sanity checks fail, the exception from `check_output` propagates
    and a regular output file is left untouched.
    """
    if os.path.exists(output_file) and not os.path.isfile(output_file):
        # The output is a non-regular file (e.g. pipe). Write to it directly.
        output_temp = output_file
    else:
        # The output is a regular file. Write to a temporary file,
        # then move it into place atomically.
        output_temp = output_file + '.tmp'
    changelog.write(output_temp)
    check_output(output_temp, input_file, merged_files)
    if output_temp != output_file:
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename fails on Windows if the destination exists
        # (which is the normal case when updating the changelog in place).
        os.replace(output_temp, output_file)
436
def remove_merged_entries(files_to_remove):
    """Delete each of the given entry files, in the order given."""
    for entry_path in files_to_remove:
        os.remove(entry_path)
440
def list_files_to_merge(options):
    """Return the entry files to merge, sorted from oldest to newest.

    The entry files are the ``*.txt`` files directly inside the directory
    ``options.dir``. "Oldest" is defined by `EntryFileSortKey`.
    """
    entry_pattern = os.path.join(options.dir, '*.txt')
    return sorted(glob.glob(entry_pattern), key=EntryFileSortKey)
449
def merge_entries(options):
    """Merge changelog entries into the changelog file.

    * The existing changelog is read from options.input.
    * The entries to merge are read from the directory options.dir.
    * The new changelog is written to options.output.
    * The merged entry files are removed unless options.keep_entries is true.

    If there are no entry files, print a notice on standard error and
    write nothing.
    """
    with open(options.input, 'r', encoding='utf-8') as changelog_stream:
        changelog = ChangeLog(changelog_stream, TextChangelogFormat)
    pending = list_files_to_merge(options)
    if not pending:
        sys.stderr.write('There are no pending changelog entries.\n')
        return
    for entry_path in pending:
        with open(entry_path, 'r', encoding='utf-8') as entry_stream:
            changelog.add_file(entry_stream)
    finish_output(changelog, options.output, options.input, pending)
    if options.keep_entries:
        return
    remove_merged_entries(pending)
470
def show_file_timestamps(options):
    """Print each file to merge with its sort category and timestamp.

    One line is printed per file, in merge order.
    This is only intended for debugging purposes.
    """
    for entry_path in list_files_to_merge(options):
        key = EntryFileSortKey(entry_path)
        print(key.category, key.datetime, entry_path)
480
def set_defaults(options):
    """Fill in the options that were not given on the command line.

    * ``output`` defaults to ``input`` (update the changelog in place).
    * ``keep_entries`` defaults to true if an output file was specified
      explicitly, and to false otherwise.
    """
    has_explicit_output = getattr(options, 'output', None) is not None
    if not has_explicit_output:
        options.output = options.input
    if getattr(options, 'keep_entries', None) is None:
        options.keep_entries = has_explicit_output
488
def main():
    """Command line entry point.

    Parse the command line, fill in the defaults, then either list the
    entry files (``--list-files-only``) or merge them into the changelog.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    add_option = parser.add_argument
    add_option('--dir', '-d', metavar='DIR',
               default='ChangeLog.d',
               help='Directory to read entries from'
                    ' (default: ChangeLog.d)')
    add_option('--input', '-i', metavar='FILE',
               default='ChangeLog',
               help='Existing changelog file to read from and augment'
                    ' (default: ChangeLog)')
    # --keep-entries and --no-keep-entries share one destination. Its
    # default is None so that set_defaults can tell "not specified" apart
    # from an explicit choice.
    add_option('--keep-entries',
               action='store_true', dest='keep_entries', default=None,
               help='Keep the files containing entries'
                    ' (default: remove them if --output/-o is not specified)')
    add_option('--no-keep-entries',
               action='store_false', dest='keep_entries',
               help='Remove the files containing entries after they are merged'
                    ' (default: remove them if --output/-o is not specified)')
    add_option('--output', '-o', metavar='FILE',
               help='Output changelog file'
                    ' (default: overwrite the input)')
    add_option('--list-files-only',
               action='store_true',
               help=('Only list the files that would be processed '
                     '(with some debugging information)'))
    options = parser.parse_args()
    set_defaults(options)
    if options.list_files_only:
        show_file_timestamps(options)
    else:
        merge_entries(options)
521
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
524