1# Copyright (c) 2020, 2021 The Linux Foundation
2#
3# SPDX-License-Identifier: Apache-2.0
4
5import hashlib
6import os
7import re
8from dataclasses import dataclass
9
10from reuse.project import Project
11from west import log
12
13from zspdx.licenses import LICENSES
14from zspdx.util import getHashes
15
16
@dataclass
class ScannerConfig:
    """Settings that control how the SPDX Document scan is carried out."""

    # Auto-conclude a Package's license from the licenses of its Files
    # while the Package's data is being assembled?
    shouldConcludePackageLicense: bool = True

    # Auto-conclude each File's license from the license(s) detected in it
    # while the Package's Files' data is being assembled?
    shouldConcludeFileLicenses: bool = True

    # How many lines of each file to scan for SPDX-License-Identifier
    # (0 = all lines); defaults to 20.
    numLinesScanned: int = 20

    # Calculate SHA256 hashes for each Package's Files?
    # (SHA1 hashes are mandatory, per SPDX 2.3.)
    doSHA256: bool = True

    # Calculate MD5 hashes for each Package's Files?
    doMD5: bool = False
39
40
def parseLineForExpression(line):
    """
    Return the SPDX expression that follows the tag in line, or None.

    Arguments:
        - line: single line of text to inspect
    Returns: the text after "SPDX-License-Identifier:", with surrounding
             whitespace and any trailing "/" or "*" comment characters
             removed; None if the tag is absent or nothing usable
             follows it.
    """
    _, _, remainder = line.partition("SPDX-License-Identifier:")

    # rstrip("/*") removes any run of trailing '/' and '*' characters
    # (e.g. the closing "*/" of a block comment), not the literal "/*"
    expression = remainder.strip().rstrip("/*").strip()

    # an empty result (no tag, or a tag followed only by whitespace or
    # comment marks) is reported as None rather than as an empty expression
    return expression or None
51
52
def getExpressionData(filePath, numLines):
    """
    Scans the specified file for the first non-empty
    SPDX-License-Identifier: tag in the file.

    Arguments:
        - filePath: path to file to scan.
        - numLines: number of lines to scan for an expression before
                    giving up. If 0, will scan the entire file.
    Returns: parsed expression if found; None if not found, or if the
             file content cannot be decoded as UTF-8.
    """
    log.dbg(f"  - getting licenses for {filePath}")

    # decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding
    with open(filePath, encoding="utf-8") as f:
        try:
            for lineno, line in enumerate(f, start=1):
                # numLines == 0 disables the limit
                if lineno > numLines > 0:
                    break
                expression = parseLineForExpression(line)
                # skip lines without a tag, and tags with nothing after
                # them, so a later usable tag can still be found
                if expression:
                    return expression
        except UnicodeDecodeError:
            # invalid UTF-8 content
            return None

    # if we get here, we didn't find an expression
    return None
80
81
def splitExpression(expression):
    """
    Parse a license expression into its constituent identifiers.

    Arguments:
        - expression: SPDX license expression
    Returns: sorted array of the identifiers in the expression, with
             parens, plus signs and the AND / OR / WITH operators
             (matched ignoring case) removed
    """
    operators = {"AND", "OR", "WITH"}

    # remove parens and plus sign
    withoutMarks = re.sub(r"[()+]", "", expression)

    # split on any run of whitespace, so repeated spaces or tabs never
    # produce empty identifiers
    tokens = withoutMarks.split()

    # drop the word operators, ignoring case
    return sorted(tok for tok in tokens if tok.upper() not in operators)
100
101
def calculateVerificationCode(pkg):
    """
    Compute the SPDX Package Verification Code over the package's files.

    Arguments:
        - pkg: Package
    Returns: verification code as string (hex SHA1 digest of the files'
             SHA1 values, sorted and concatenated)
    """
    orderedSha1s = sorted(f.sha1 for f in pkg.files.values())
    concatenated = "".join(orderedSha1s).encode('utf-8')

    # SHA1 is used here as a checksum, not for security purposes
    return hashlib.sha1(concatenated, usedforsecurity=False).hexdigest()
119
120
def checkLicenseValid(lic, doc):
    """
    Record a license ID as custom for this Document when it is not one
    of the known SPDX license IDs.

    Arguments:
        - lic: detected license ID
        - doc: Document
    """
    # known SPDX IDs need no bookkeeping
    if lic in LICENSES:
        return
    doc.customLicenseIDs.add(lic)
132
133
def getPackageLicenses(pkg):
    """
    Collect every distinct concluded license and infoInFile license ID
    seen across the package's files.

    Arguments:
        - pkg: Package
    Returns: sorted list of concluded license exprs,
             sorted list of infoInFile ID's
    """
    concluded = set()
    fromFiles = set()
    for f in pkg.files.values():
        concluded.add(f.concludedLicense)
        fromFiles.update(f.licenseInfoInFile)
    return sorted(concluded), sorted(fromFiles)
150
151
def normalizeExpression(licsConcluded):
    """
    Combine array of license expressions into one AND'd expression,
    adding parens where needed.

    Arguments:
        - licsConcluded: array of license expressions
    Returns: string with single AND'd expression. "NOASSERTION" for an
             empty array. When several entries are given and all of them
             are "NONE" / "NOASSERTION", returns "NOASSERTION" if it is
             among them and "NONE" otherwise, never an empty string.
    """
    # return appropriate for simple cases
    if not licsConcluded:
        return "NOASSERTION"
    if len(licsConcluded) == 1:
        return licsConcluded[0]

    # more than one, so we'll need to combine them, leaving out the
    # NONE / NOASSERTION placeholders;
    # if and only if an expression has spaces, it needs parens
    placeholders = ("NONE", "NOASSERTION")
    revised = [
        f"({lic})" if " " in lic else lic
        for lic in licsConcluded
        if lic not in placeholders
    ]

    # every entry was a placeholder: joining nothing would yield "",
    # so fall back to a placeholder value instead
    if not revised:
        return "NOASSERTION" if "NOASSERTION" in licsConcluded else "NONE"

    return " AND ".join(revised)
178
179
def getCopyrightInfo(filePath):
    """
    Look up copyright statements for the specified file via REUSE tools.

    Arguments:
        - filePath: path to file to scan

    Returns: list of copyright statements if found; empty list if none
             are found or if the lookup raises an error
    """
    log.dbg(f"  - getting copyright info for {filePath}")

    try:
        # the REUSE project is rooted at the file's own directory
        reuseInfos = Project(os.path.dirname(filePath)).reuse_info_of(filePath)

        # flatten the copyright lines of every info entry that has any
        return [
            statement
            for info in reuseInfos
            if info.copyright_lines
            for statement in info.copyright_lines
        ]
    except Exception as e:
        # best effort: a failed lookup is logged, not propagated
        log.wrn(f"Error getting copyright info for {filePath}: {e}")
        return []
204
205
def scanDocument(cfg, doc):
    """
    Fill in hash, license and copyright data for every File of every
    Package in this Document, then the Package-level license summary
    and verification code.

    Arguments:
        - cfg: ScannerConfig
        - doc: Document
    """
    for pkg in doc.pkgs.values():
        log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")

        # step 1: per-File data
        for f in pkg.files.values():
            # relpath is expressed against the package's relativeBaseDir
            f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)

            # hashes: without them, the remaining per-File steps are skipped
            digests = getHashes(f.abspath)
            if not digests:
                log.wrn(f"unable to get hashes for file {f.abspath}; skipping")
                continue
            sha1Value, sha256Value, md5Value = digests
            f.sha1 = sha1Value
            if cfg.doSHA256:
                f.sha256 = sha256Value
            if cfg.doMD5:
                f.md5 = md5Value

            # licenses: only touched when an expression was detected
            if expression := getExpressionData(f.abspath, cfg.numLinesScanned):
                if cfg.shouldConcludeFileLicenses:
                    f.concludedLicense = expression
                f.licenseInfoInFile = splitExpression(expression)

            # copyright: wrap the statements found in <text> ... </text>
            copyrights = getCopyrightInfo(f.abspath)
            if copyrights:
                f.copyrightText = "<text>\n" + "\n".join(copyrights) + "\n</text>"

            # flag any custom license IDs for the document
            for lic in f.licenseInfoInFile:
                checkLicenseValid(lic, doc)

        # step 2: Package data derived from its Files
        concludedExprs, idsFromFiles = getPackageLicenses(pkg)
        if cfg.shouldConcludePackageLicense:
            pkg.concludedLicense = normalizeExpression(concludedExprs)
        pkg.licenseInfoFromFiles = idsFromFiles
        pkg.verificationCode = calculateVerificationCode(pkg)
255