# Copyright (c) 2020, 2021 The Linux Foundation
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
import os
import re
from dataclasses import dataclass

from reuse.project import Project
from west import log

from zspdx.licenses import LICENSES
from zspdx.util import getHashes

# Tag that introduces a license expression inside a source file.
_SPDX_TAG = "SPDX-License-Identifier:"

# Grouping parens and the "or later" plus sign carry no identifier of
# their own, so they are dropped before an expression is split.
_GROUPING_RE = re.compile(r"[()+]")

# License expression operators, matched case-insensitively and delimited
# by any run of whitespace (not only single spaces).
_OPERATOR_RE = re.compile(r"\s+(?:AND|OR|WITH)\s+", re.IGNORECASE)


# ScannerConfig contains settings used to configure how the SPDX
# Document scanning should occur.
@dataclass(eq=True)
class ScannerConfig:
    # when assembling a Package's data, should we auto-conclude the
    # Package's license, based on the licenses of its Files?
    shouldConcludePackageLicense: bool = True

    # when assembling a Package's Files' data, should we auto-conclude
    # each File's license, based on its detected license(s)?
    shouldConcludeFileLicenses: bool = True

    # number of lines to scan for SPDX-License-Identifier (0 = all)
    # defaults to 20
    numLinesScanned: int = 20

    # should we calculate SHA256 hashes for each Package's Files?
    # note that SHA1 hashes are mandatory, per SPDX 2.3
    doSHA256: bool = True

    # should we calculate MD5 hashes for each Package's Files?
    doMD5: bool = False


def parseLineForExpression(line):
    """
    Return the SPDX expression following the tag in line, if any.

    Arguments:
        - line: a single line of text from a scanned file.
    Returns: the expression text with surrounding whitespace and any
             trailing "/" or "*" comment-closing characters removed;
             None if the tag is absent or nothing usable follows it.
    """
    _, _, remainder = line.partition(_SPDX_TAG)
    if remainder == "":
        return None
    # strip away trailing comment marks and whitespace, if any
    # (rstrip("/*") removes any trailing run of '/' and '*' characters)
    expression = remainder.strip().rstrip("/*").strip()
    # A bare tag with no expression is treated as "not found", so the
    # caller keeps scanning instead of stopping on an empty result.
    return expression or None


def getExpressionData(filePath, numLines):
    """
    Scans the specified file for the first SPDX-License-Identifier:
    tag in the file.

    Arguments:
        - filePath: path to file to scan.
        - numLines: number of lines to scan for an expression before
                    giving up. If 0, will scan the entire file.
    Returns: parsed expression if found; None if not found or if the
             file content cannot be decoded as UTF-8.
    """
    log.dbg(f"  - getting licenses for {filePath}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(filePath, encoding="utf-8") as f:
        try:
            for lineno, line in enumerate(f, start=1):
                # numLines == 0 means "no limit"
                if lineno > numLines > 0:
                    break
                expression = parseLineForExpression(line)
                if expression is not None:
                    return expression
        except UnicodeDecodeError:
            # invalid UTF-8 content
            return None

    # if we get here, we didn't find an expression
    return None


def splitExpression(expression):
    """
    Parse a license expression into its constituent identifiers.

    Arguments:
        - expression: SPDX license expression
    Returns: sorted array of split identifiers; never contains empty
             strings.
    """
    # remove parens and plus sign
    withoutGrouping = _GROUPING_RE.sub("", expression)

    # remove word operators, ignoring case, leaving a blank space
    withoutOperators = _OPERATOR_RE.sub(" ", withoutGrouping)

    # split on any whitespace run; unlike split(" ") this never yields
    # empty strings for repeated, leading or trailing whitespace
    return sorted(withoutOperators.split())


def calculateVerificationCode(pkg):
    """
    Calculate the SPDX Package Verification Code for all files in the package.

    The code is the SHA1 digest of the files' SHA1 values, sorted and
    concatenated without separators.

    Arguments:
        - pkg: Package
    Returns: verification code as string
    """
    filelist = "".join(sorted(f.sha1 for f in pkg.files.values()))

    # SHA1 is used here as a checksum, not for security purposes
    hSHA1 = hashlib.sha1(usedforsecurity=False)
    hSHA1.update(filelist.encode('utf-8'))
    return hSHA1.hexdigest()


def checkLicenseValid(lic, doc):
    """
    Check whether this license ID is a valid SPDX license ID, and add it
    to the custom license IDs set for this Document if it isn't.

    Arguments:
        - lic: detected license ID
        - doc: Document
    """
    if lic not in LICENSES:
        doc.customLicenseIDs.add(lic)


def getPackageLicenses(pkg):
    """
    Extract lists of all concluded and infoInFile licenses seen.

    Arguments:
        - pkg: Package
    Returns: sorted list of concluded license exprs,
             sorted list of infoInFile ID's
    """
    licsConcluded = set()
    licsFromFiles = set()
    for f in pkg.files.values():
        licsConcluded.add(f.concludedLicense)
        licsFromFiles.update(f.licenseInfoInFile)
    return sorted(licsConcluded), sorted(licsFromFiles)


def normalizeExpression(licsConcluded):
    """
    Combine array of license expressions into one AND'd expression,
    adding parens where needed.

    When more than one expression is given, "NONE" and "NOASSERTION"
    entries are left out of the combination.

    Arguments:
        - licsConcluded: array of license expressions
    Returns: string with single AND'd expression; "NOASSERTION" if
             there is nothing to combine.
    """
    # return appropriate for simple cases
    if len(licsConcluded) == 0:
        return "NOASSERTION"
    if len(licsConcluded) == 1:
        return licsConcluded[0]

    # more than one, so we'll need to combine them
    # if and only if an expression has spaces, it needs parens
    revised = [
        f"({lic})" if " " in lic else lic
        for lic in licsConcluded
        if lic not in ("NONE", "NOASSERTION")
    ]

    # every entry was NONE / NOASSERTION: an empty string is not a valid
    # license value, so fall back to NOASSERTION
    if not revised:
        return "NOASSERTION"
    return " AND ".join(revised)


def getCopyrightInfo(filePath):
    """
    Scans the specified file for copyright information using REUSE tools.

    Arguments:
        - filePath: path to file to scan

    Returns: list of copyright statements if found; empty list if not found
             or if the lookup raised an error (a warning is logged).
    """
    log.dbg(f"  - getting copyright info for {filePath}")

    # Deliberately best-effort: a failure to extract copyright info for
    # one file is logged and must not abort the whole scan.
    try:
        project = Project(os.path.dirname(filePath))
        infos = project.reuse_info_of(filePath)
        copyrights = []

        for info in infos:
            if info.copyright_lines:
                copyrights.extend(info.copyright_lines)

        return copyrights
    except Exception as e:
        log.wrn(f"Error getting copyright info for {filePath}: {e}")
        return []


def scanDocument(cfg, doc):
    """
    Scan for licenses and calculate hashes for all Files and Packages
    in this Document.

    For every File this sets relpath, the hashes enabled in cfg, the
    detected license data and the copyright text; for every Package it
    then sets the aggregated license data and the verification code.

    Arguments:
        - cfg: ScannerConfig
        - doc: Document
    """
    for pkg in doc.pkgs.values():
        log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")

        # first, gather File data for this package
        for f in pkg.files.values():
            # set relpath based on package's relativeBaseDir
            f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)

            # get hashes for file
            hashes = getHashes(f.abspath)
            if not hashes:
                log.wrn(f"unable to get hashes for file {f.abspath}; skipping")
                continue
            hSHA1, hSHA256, hMD5 = hashes
            # SHA1 is always recorded; the others only when enabled
            f.sha1 = hSHA1
            if cfg.doSHA256:
                f.sha256 = hSHA256
            if cfg.doMD5:
                f.md5 = hMD5

            # get licenses for file
            expression = getExpressionData(f.abspath, cfg.numLinesScanned)
            if expression:
                if cfg.shouldConcludeFileLicenses:
                    f.concludedLicense = expression
                f.licenseInfoInFile = splitExpression(expression)

            if copyrights := getCopyrightInfo(f.abspath):
                f.copyrightText = "<text>\n" + "\n".join(copyrights) + "\n</text>"

            # check if any custom license IDs should be flagged for document
            for lic in f.licenseInfoInFile:
                checkLicenseValid(lic, doc)

        # now, assemble the Package data
        licsConcluded, licsFromFiles = getPackageLicenses(pkg)
        if cfg.shouldConcludePackageLicense:
            pkg.concludedLicense = normalizeExpression(licsConcluded)
        pkg.licenseInfoFromFiles = licsFromFiles
        pkg.verificationCode = calculateVerificationCode(pkg)