1#!/usr/bin/env python3
2
3# Copyright The Mbed TLS Contributors
4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5
6"""
7This script checks the current state of the source code for minor issues,
8including incorrect file permissions, presence of tabs, non-Unix line endings,
9trailing whitespace, and presence of UTF-8 BOM.
10Note: requires python 3, must be run from Mbed TLS root.
11"""
12
13import argparse
14import codecs
15import inspect
16import logging
17import os
18import re
19import subprocess
20import sys
21try:
22    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
23except ImportError:
24    pass
25
26import scripts_path # pylint: disable=unused-import
27from mbedtls_dev import build_tree
28
29
class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class and implement `check_file_for_issue` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
     will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        # Maps a file path to the list of line numbers where the issue was
        # found. A falsy value (e.g. ``None``) is reported as an issue with
        # the file as a whole, without line numbers.
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator.

        The path is first passed through ``os.path.normpath``, then every
        platform directory separator is replaced by ``/``.
        """
        normalized = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator individually: str.split() with a
        # multi-character argument would only split on that exact sequence,
        # not on any one of its characters.
        for separator in (os.path.sep, os.path.altsep):
            if separator is not None and separator != '/':
                normalized = normalized.replace(separator, '/')
        return normalized

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        The path exemption is matched with ``re.match``, i.e. anchored at
        the start of the normalized path.
        """
        if any(filepath.endswith(suffix) for suffix in self.suffix_exemptions):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file in sorted order (with the line
        numbers if any were recorded), then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                logger.info(filename)
        logger.info("")
105
# Regular expressions for the paths (relative to the root of the source
# tree, with ``/`` separators) of files that are treated as binary.
# The list is kept separate from the compiled expression so that it can be
# combined with other pattern lists.
BINARY_FILE_PATH_RE_LIST = [
    # Documentation assets.
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    # Fuzzing corpus entries whose path contains no dot after the directory.
    r'programs/fuzz/corpuses/[^.]+\Z',
    # Test data: paths without a dot, selected extensions, and other
    # specific file name shapes.
    r'tests/data_files/[^.]+\Z',
    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'tests/data_files/.*\.req\.[^/]+\Z',
    r'tests/data_files/.*malformed[^/]+\Z',
    r'tests/data_files/format_pkcs12\.fmt\Z',
    r'tests/data_files/.*\.bin\Z',
]
# A single expression that matches if any of the patterns above matches.
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))
118
class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement `issue_with_line`.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` exhibits the issue tracked by this class.

        ``line`` is passed as bytes, including its line terminator if any.
        ``line_number`` starts at 1.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Check one line and record its location if it has the issue."""
        flagged = self.issue_with_line(line, filepath, line_number)
        if flagged:
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check every line of the specified file.

        The file is read in binary mode, so lines are split on LF only.
        Subclasses must implement the ``issue_with_line`` method.
        """
        with open(filepath, "rb") as stream:
            for line_number, line in enumerate(stream, start=1):
                self.check_file_line(filepath, line, line_number)
148
149
def is_windows_file(filepath):
    """Whether ``filepath`` has one of the Windows-specific file extensions.

    The comparison is case-sensitive and only looks at the last extension.
    """
    windows_extensions = ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')
    extension = os.path.splitext(filepath)[1]
    return extension in windows_extensions
153
154
class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Expected file name extension (without the dot) for each interpreter.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    path_exemptions = re.compile(r'tests/scripts/quiet/.*')

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must have the expected syntax, name a known interpreter,
        and the file name must have the extension associated with that
        interpreter.
        """
        found = self._shebang_re.match(first_line)
        if found is None:
            return False
        # Exactly one of the two groups is set, depending on which
        # alternative of the regular expression matched.
        interpreter = found.group(1) or found.group(2)
        expected_extension = self._extensions.get(interpreter)
        if expected_extension is None:
            return False
        return filepath.endswith('.' + expected_extension)

    def check_file_for_issue(self, filepath):
        """Check the consistency between the shebang and the executable bit.

        A malformed shebang on an executable file is reported at line 1.
        A shebang on a non-executable file, or an executable file without
        a shebang, is reported without a line number.
        """
        executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as stream:
            first_line = stream.readline()
        has_shebang = first_line.startswith(b'#!')
        if has_shebang and executable:
            if not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif has_shebang or executable:
            # Either a shebang on a non-executable file, or an executable
            # without a shebang.
            self.files_with_issues[filepath] = None
202
203
class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files whose last line is incomplete, i.e. files that do not
    end with a newline character."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if its last byte is not a line feed.

        Empty files are accepted.
        """
        with open(filepath, "rb") as stream:
            try:
                # Position the stream on the last byte of the file.
                stream.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. Failing to seek
                # to 1 before the end means that this position would be
                # before the beginning of the file, i.e. the file is empty.
                return
            last_byte = stream.read(1)
            if last_byte != b"\n":
                self.files_with_issues[filepath] = None
223
224
class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if its content starts with a UTF-8 BOM.

        The issue is recorded for the file as a whole, without a line
        number.
        """
        bom = codecs.BOM_UTF8
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't load the whole file
            # into memory just to look at its beginning.
            if f.read(len(bom)) == bom:
                self.files_with_issues[filepath] = None
238
239
class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only valid UTF-8 is accepted, and within it only characters that are
    # explicitly allowed. Everything that isn't a simple non-blank,
    # non-zero-width glyph is deliberately excluded, apart from a very small
    # set (tab, ordinary space, line breaks, "basic" no-break space and soft
    # hyphen). In particular, non-ASCII control characters, combining
    # characters, and Unicode state changes (e.g. right-to-left text) are
    # forbidden.
    # Some characters with a risk of visual confusion are nonetheless
    # allowed, for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs
    # U+2010 HYPHEN, or U+0041 LATIN CAPITAL LETTER A vs
    # U+0391 GREEK CAPITAL LETTER ALPHA.
    GOOD_CHARACTERS = (
        '\t\n\r -~' # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF' # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E' # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C' # Superscripts and Subscripts
        '\u2190-\u21FF' # Arrows
        '\u2200-\u22FF' # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    )
    # Accept any of the characters and ranges above, as well as anything
    # classified as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether ``line`` is invalid UTF-8 or has a forbidden character.

        A BOM at the very beginning of the file is ignored here.
        """
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        bom = '\uFEFF'
        if line_number == 1 and text.startswith(bom):
            # Skip the BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the
            # beginning. Which files are allowed to have a BOM is handled
            # in Utf8BomIssueTracker.
            text = text[len(bom):]
        only_good_characters = self.GOOD_CHARACTERS_RE.match(text) is not None
        return not only_good_characters
279
class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check every non-exempt file except Windows-specific files."""
        return (super().should_check_file(filepath) and
                not is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains a carriage return anywhere."""
        return line.find(b"\r") != -1
292
293
class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only the non-exempt files that are Windows-specific."""
        return (super().should_check_file(filepath) and
                is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` fails to end with CRLF or has a stray CR."""
        if not line.endswith(b"\r\n"):
            return True
        # The line ends with CRLF: look for a CR before the terminator.
        body = line[:-2]
        return b"\r" in body
306
307
class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` has whitespace just before its line ending."""
        without_line_ending = line.rstrip(b"\r\n")
        without_trailing_space = line.rstrip()
        return without_line_ending != without_trailing_space
316
317
class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    # File name endings for which tabs are accepted.
    suffix_exemptions = frozenset({
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    })

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains at least one tab character."""
        return line.find(b"\t") != -1
334
335
class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` looks like a leftover git conflict marker."""
        # Markers that introduce one side of a conflict. The '|||||||' form
        # comes from merge.conflictStyle=diff3.
        side_markers = (b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')
        if line.startswith(side_markers):
            return True
        # A line consisting of '=======' is not treated as a conflict
        # separator in .md files.
        if _filepath.endswith('.md'):
            return False
        return line.rstrip(b'\r\n') == b'======='
352
353
def this_location():
    """Return the base name of this file and a line number inside this function.

    The line number is the one reported for this function's own frame at
    the time it is inspected.
    """
    current = inspect.currentframe()
    assert current is not None
    frame_info = inspect.getframeinfo(current)
    base_name = os.path.basename(frame_info.filename)
    return base_name, frame_info.lineno
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()
360
class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = [
        # Third-party code, other than whitelisted third-party modules,
        # may be under a different license.
        r'3rdparty/(?!(p256-m)/.*)',
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
    ]
    # Skip binary files as well as the license-specific exemptions above.
    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
                                          LICENSE_EXEMPTION_RE_LIST))

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    # Group 1 is the header key as spelled in the line (matched
    # case-insensitively); group 3 is the license identifier if the key
    # is followed by a colon.
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def issue_with_line(self, line, filepath, line_number):
        """Whether ``line`` has a wrong copyright or license statement.

        When an issue is found, a description of it is stored in
        ``self.problem``.
        """
        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        if filepath.endswith(THIS_FILE_BASE_NAME) and \
           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
            # Avoid false positives from the code in this class.
            # The rest of this file is skipped as well: it is highly
            # unlikely to contain any problematic statements since those
            # are placed near the top of files.
            return False

        problem = None

        copyright_match = self.COPYRIGHT_RE.match(line)
        spdx_match = None
        if copyright_match and \
           copyright_match.group(1) != self.COPYRIGHT_HOLDER:
            problem = 'Invalid copyright line'
        else:
            spdx_match = self.SPDX_RE.match(line)

        if spdx_match:
            found_key = spdx_match.group(1)
            found_identifier = spdx_match.group(3)
            if found_key != self.SPDX_HEADER_KEY:
                problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
            elif not found_identifier:
                problem = 'Improperly formatted SPDX license identifier'
            elif found_identifier != self.LICENSE_IDENTIFIER:
                problem = 'Wrong SPDX license identifier'

        # A line that passed the checks above may still mention a license
        # by name.
        if problem is None and self.LICENSE_MENTION_RE.match(line):
            problem = 'Suspicious license mention'

        if problem is None:
            return False
        self.problem = problem
        return True
447
448
class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        # One tracker instance per kind of issue. Each tracker accumulates
        # its own findings in its ``files_with_issues`` attribute.
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git.
        """
        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        # The output is NUL-terminated, so the last split element is empty.
        bytes_filepaths = bytes_output.split(b'\0')[:-1]
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]
        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        regular_filepaths = [fp for fp in ascii_filepaths
                             if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in regular_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # List the files only once: the list does not depend on the tracker,
        # and building it runs a Git subprocess and queries the file system
        # for each path.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code
522
523
def run_main():
    """Parse the command line, run all the checks and exit.

    The exit status is the value returned by
    ``IntegrityChecker.output_issues``.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "-l", "--log_file",
        type=str,
        help="path to optional output log",
    )
    options = arg_parser.parse_args()
    checker = IntegrityChecker(options.log_file)
    checker.check_files()
    sys.exit(checker.output_issues())


if __name__ == "__main__":
    run_main()
538