scanner.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. # Copyright (c) 2020, 2021 The Linux Foundation
  2. #
  3. # SPDX-License-Identifier: Apache-2.0
  4. import hashlib
  5. import os
  6. import re
  7. from west import log
  8. from zspdx.licenses import LICENSES
  9. from zspdx.util import getHashes
  10. # ScannerConfig contains settings used to configure how the SPDX
  11. # Document scanning should occur.
  12. class ScannerConfig:
  13. def __init__(self):
  14. super(ScannerConfig, self).__init__()
  15. # when assembling a Package's data, should we auto-conclude the
  16. # Package's license, based on the licenses of its Files?
  17. self.shouldConcludePackageLicense = True
  18. # when assembling a Package's Files' data, should we auto-conclude
  19. # each File's license, based on its detected license(s)?
  20. self.shouldConcludeFileLicenses = True
  21. # number of lines to scan for SPDX-License-Identifier (0 = all)
  22. # defaults to 20
  23. self.numLinesScanned = 20
  24. # should we calculate SHA256 hashes for each Package's Files?
  25. # note that SHA1 hashes are mandatory, per SPDX 2.2
  26. self.doSHA256 = True
  27. # should we calculate MD5 hashes for each Package's Files?
  28. self.doMD5 = False
  29. def parseLineForExpression(line):
  30. """Return parsed SPDX expression if tag found in line, or None otherwise."""
  31. p = line.partition("SPDX-License-Identifier:")
  32. if p[2] == "":
  33. return None
  34. # strip away trailing comment marks and whitespace, if any
  35. expression = p[2].strip()
  36. expression = expression.rstrip("/*")
  37. expression = expression.strip()
  38. return expression
  39. def getExpressionData(filePath, numLines):
  40. """
  41. Scans the specified file for the first SPDX-License-Identifier:
  42. tag in the file.
  43. Arguments:
  44. - filePath: path to file to scan.
  45. - numLines: number of lines to scan for an expression before
  46. giving up. If 0, will scan the entire file.
  47. Returns: parsed expression if found; None if not found.
  48. """
  49. log.dbg(f" - getting licenses for {filePath}")
  50. with open(filePath, "r") as f:
  51. try:
  52. lineno = 0
  53. for line in f:
  54. lineno += 1
  55. if lineno > numLines > 0:
  56. break
  57. expression = parseLineForExpression(line)
  58. if expression is not None:
  59. return expression
  60. except UnicodeDecodeError:
  61. # invalid UTF-8 content
  62. return None
  63. # if we get here, we didn't find an expression
  64. return None
  65. def splitExpression(expression):
  66. """
  67. Parse a license expression into its constituent identifiers.
  68. Arguments:
  69. - expression: SPDX license expression
  70. Returns: array of split identifiers
  71. """
  72. # remove parens and plus sign
  73. e2 = re.sub(r'\(|\)|\+', "", expression, flags=re.IGNORECASE)
  74. # remove word operators, ignoring case, leaving a blank space
  75. e3 = re.sub(r' AND | OR | WITH ', " ", e2, flags=re.IGNORECASE)
  76. # and split on space
  77. e4 = e3.split(" ")
  78. return sorted(e4)
  79. def calculateVerificationCode(pkg):
  80. """
  81. Calculate the SPDX Package Verification Code for all files in the package.
  82. Arguments:
  83. - pkg: Package
  84. Returns: verification code as string
  85. """
  86. hashes = []
  87. for f in pkg.files.values():
  88. hashes.append(f.sha1)
  89. hashes.sort()
  90. filelist = "".join(hashes)
  91. hSHA1 = hashlib.sha1()
  92. hSHA1.update(filelist.encode('utf-8'))
  93. return hSHA1.hexdigest()
  94. def checkLicenseValid(lic, doc):
  95. """
  96. Check whether this license ID is a valid SPDX license ID, and add it
  97. to the custom license IDs set for this Document if it isn't.
  98. Arguments:
  99. - lic: detected license ID
  100. - doc: Document
  101. """
  102. if lic not in LICENSES:
  103. doc.customLicenseIDs.add(lic)
  104. def getPackageLicenses(pkg):
  105. """
  106. Extract lists of all concluded and infoInFile licenses seen.
  107. Arguments:
  108. - pkg: Package
  109. Returns: sorted list of concluded license exprs,
  110. sorted list of infoInFile ID's
  111. """
  112. licsConcluded = set()
  113. licsFromFiles = set()
  114. for f in pkg.files.values():
  115. licsConcluded.add(f.concludedLicense)
  116. for licInfo in f.licenseInfoInFile:
  117. licsFromFiles.add(licInfo)
  118. return sorted(list(licsConcluded)), sorted(list(licsFromFiles))
  119. def normalizeExpression(licsConcluded):
  120. """
  121. Combine array of license expressions into one AND'd expression,
  122. adding parens where needed.
  123. Arguments:
  124. - licsConcluded: array of license expressions
  125. Returns: string with single AND'd expression.
  126. """
  127. # return appropriate for simple cases
  128. if len(licsConcluded) == 0:
  129. return "NOASSERTION"
  130. if len(licsConcluded) == 1:
  131. return licsConcluded[0]
  132. # more than one, so we'll need to combine them
  133. # iff an expression has spaces, it needs parens
  134. revised = []
  135. for lic in licsConcluded:
  136. if lic in ["NONE", "NOASSERTION"]:
  137. continue
  138. if " " in lic:
  139. revised.append(f"({lic})")
  140. else:
  141. revised.append(lic)
  142. return " AND ".join(revised)
  143. def scanDocument(cfg, doc):
  144. """
  145. Scan for licenses and calculate hashes for all Files and Packages
  146. in this Document.
  147. Arguments:
  148. - cfg: ScannerConfig
  149. - doc: Document
  150. """
  151. for pkg in doc.pkgs.values():
  152. log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")
  153. # first, gather File data for this package
  154. for f in pkg.files.values():
  155. # set relpath based on package's relativeBaseDir
  156. f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)
  157. # get hashes for file
  158. hashes = getHashes(f.abspath)
  159. if not hashes:
  160. log.wrn("unable to get hashes for file {f.abspath}; skipping")
  161. continue
  162. hSHA1, hSHA256, hMD5 = hashes
  163. f.sha1 = hSHA1
  164. if cfg.doSHA256:
  165. f.sha256 = hSHA256
  166. if cfg.doMD5:
  167. f.md5 = hMD5
  168. # get licenses for file
  169. expression = getExpressionData(f.abspath, cfg.numLinesScanned)
  170. if expression:
  171. if cfg.shouldConcludeFileLicenses:
  172. f.concludedLicense = expression
  173. f.licenseInfoInFile = splitExpression(expression)
  174. # check if any custom license IDs should be flagged for document
  175. for lic in f.licenseInfoInFile:
  176. checkLicenseValid(lic, doc)
  177. # now, assemble the Package data
  178. licsConcluded, licsFromFiles = getPackageLicenses(pkg)
  179. if cfg.shouldConcludePackageLicense:
  180. pkg.concludedLicense = normalizeExpression(licsConcluded)
  181. pkg.licenseInfoFromFiles = licsFromFiles
  182. pkg.verificationCode = calculateVerificationCode(pkg)