#!/usr/bin/env python3

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

"""
This script checks the current state of the source code for minor issues,
including incorrect file permissions, presence of tabs, non-Unix line endings,
trailing whitespace, and presence of UTF-8 BOM.
Note: requires python 3, must be run from Mbed TLS root.
"""

import argparse
import codecs
import inspect
import logging
import os
import re
import subprocess
import sys
try:
    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
except ImportError:
    pass

import scripts_path # pylint: disable=unused-import
from mbedtls_dev import build_tree


class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class and implement `check_file_for_issue` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
     will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue

    ``files_with_issues`` (instance attribute) maps each offending file path
    to a list of line numbers, or to ``None`` when the issue concerns the
    file as a whole.
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator character individually: str.split() with
        # a multi-character argument would look for that exact sequence
        # rather than for any one of its characters.
        for sep in (os.path.sep, os.path.altsep):
            if sep is not None and sep != '/':
                filepath = filepath.replace(sep, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        # endswith() with an empty tuple is always false.
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one entry per file in sorted order (with the line
        numbers if any were recorded), then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                # Whole-file issue: there is no line number to report.
                logger.info(filename)
        logger.info("")


# Regular expressions matching the (normalized) paths of files that are
# treated as binary and therefore exempt from text-oriented checks.
BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'tests/data_files/[^.]+\Z',
    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'tests/data_files/.*\.req\.[^/]+\Z',
    r'tests/data_files/.*malformed[^/]+\Z',
    r'tests/data_files/format_pkcs12\.fmt\Z',
    r'tests/data_files/.*\.bin\Z',
]
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))


class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement `issue_with_line`.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Check the specified line for the issue that this class is for.

        ``line`` is the raw content of the line as a bytes object, including
        the line terminator if there is one. ``line_number`` starts at 1.
        Return a true value if the line has the issue.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record an issue at the given location if the line has one."""
        if self.issue_with_line(line, filepath, line_number):
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        Subclasses must implement the ``issue_with_line`` method.
        """
        # Read in binary mode: checkers work on bytes so that they can
        # detect invalid encodings and see the exact line terminators.
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, 1):
                self.check_file_line(filepath, line, line_number)


def is_windows_file(filepath):
    """Whether the file name has an extension used for Windows-only files."""
    _root, ext = os.path.splitext(filepath)
    return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')


class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Map each accepted interpreter to the file extension it requires.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    path_exemptions = re.compile(r'tests/scripts/quiet/.*')

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must match ``_shebang_re``, name a known interpreter, and
        the file name must have the extension associated with it.
        """
        m = self._shebang_re.match(first_line)
        if not m:
            return False
        interpreter = m.group(1) or m.group(2)
        if interpreter not in self._extensions:
            return False
        return filepath.endswith('.' + self._extensions[interpreter])

    def check_file_for_issue(self, filepath):
        """Check that the file has a shebang if and only if it is executable,
        and that the shebang is valid if present."""
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        if first_line.startswith(b'#!'):
            if not is_executable:
                # Shebang on a non-executable file
                self.files_with_issues[filepath] = None
            elif not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif is_executable:
            # Executable without a shebang
            self.files_with_issues[filepath] = None


class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files that end with an incomplete line
    (no newline character at the end of the last line)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if its last byte is not a newline."""
        with open(filepath, "rb") as f:
            try:
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            if f.read(1) != b"\n":
                self.files_with_issues[filepath] = None


class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if it starts with the UTF-8 encoding of U+FEFF."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't read the whole file.
            if f.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8:
                self.files_with_issues[filepath] = None


class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example '-' (U+002D HYPHEN-MINUS) vs U+00AD SOFT HYPHEN vs
    # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether the line is not valid UTF-8 or contains a character
        outside the allowed set."""
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and text.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return not self.GOOD_CHARACTERS_RE.match(text)


class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check all non-exempt files except Windows-specific ones."""
        if not super().should_check_file(filepath):
            return False
        return not is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a CR anywhere."""
        return b"\r" in line


class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only non-exempt Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line lacks a final CRLF or has a CR before it."""
        return not line.endswith(b"\r\n") or b"\r" in line[:-2]


class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether stripping all trailing whitespace removes more than
        just the line terminator."""
        return line.rstrip(b"\r\n") != line.rstrip()


class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    suffix_exemptions = frozenset([
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a tab character."""
        return b"\t" in line


class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line looks like a leftover conflict marker."""
        # Detect leftover git conflict markers.
        # The third marker comes from merge.conflictStyle=diff3.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')):
            return True
        # A line of equal signs is not reported in .md files.
        if line.rstrip(b'\r\n') == b'=======' and \
           not _filepath.endswith('.md'):
            return True
        return False


def this_location():
    """Return the base name of this file and the current line number in it."""
    frame = inspect.currentframe()
    assert frame is not None
    info = inspect.getframeinfo(frame)
    return os.path.basename(info.filename), info.lineno
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()


class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = [
        # Third-party code, other than whitelisted third-party modules,
        # may be under a different license.
        r'3rdparty/(?!(p256-m)/.*)',
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
    ]
    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
                                          LICENSE_EXEMPTION_RE_LIST))

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    # Group 1 is the header key as spelled in the file (matched
    # case-insensitively). Group 3 is the identifier when the key is
    # followed by a colon; it is unset otherwise.
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def issue_with_line(self, line, filepath, line_number):
        """Whether the line has an unexpected copyright holder, a malformed
        or unexpected SPDX identifier, or a suspicious license mention.

        When returning true, a description of the problem is stored in
        ``self.problem``.
        """
        #pylint: disable=too-many-return-statements

        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        if filepath.endswith(THIS_FILE_BASE_NAME) and \
           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
            # Avoid false positives from the code in this class.
            # Also skip the rest of this file, which is highly unlikely to
            # contain any problematic statements since we put those near the
            # top of files.
            return False

        m = self.COPYRIGHT_RE.match(line)
        if m and m.group(1) != self.COPYRIGHT_HOLDER:
            self.problem = 'Invalid copyright line'
            return True

        m = self.SPDX_RE.match(line)
        if m:
            if m.group(1) != self.SPDX_HEADER_KEY:
                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
                return True
            if not m.group(3):
                self.problem = 'Improperly formatted SPDX license identifier'
                return True
            if m.group(3) != self.LICENSE_IDENTIFIER:
                self.problem = 'Wrong SPDX license identifier'
                return True

        m = self.LICENSE_MENTION_RE.match(line)
        if m:
            self.problem = 'Suspicious license mention'
            return True

        return False


class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git.
        """
        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        # With -z, every path is terminated by a null byte, so the last
        # element of the split is an empty string: drop it.
        bytes_filepaths = bytes_output.split(b'\0')[:-1]
        ascii_filepaths = (fp.decode('ascii') for fp in bytes_filepaths)
        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        regular_filepaths = [fp for fp in ascii_filepaths
                             if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in regular_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # List the files once rather than once per issue tracker: each
        # listing runs Git and queries the file system for every path.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code


def run_main():
    """Command line entry point: check all files and exit with status 1
    if any issue was found, 0 otherwise."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    check_args = parser.parse_args()
    integrity_check = IntegrityChecker(check_args.log_file)
    integrity_check.check_files()
    return_code = integrity_check.output_issues()
    sys.exit(return_code)


if __name__ == "__main__":
    run_main()