license_scanner.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
#
# Copyright (c) Contributors to the Open 3D Engine Project. For complete copyright and license terms please see the LICENSE at the root of this distribution.
#
# SPDX-License-Identifier: Apache-2.0 OR MIT
#
#
  7. import argparse
  8. from collections import OrderedDict
  9. import fnmatch
  10. import json
  11. import os
  12. import pathlib
  13. import re
  14. import sys
  15. class LicenseScanner:
  16. """Class to contain license scanner.
  17. Scans source tree for license files using provided filename patterns and generates a file
  18. with the contents of all the licenses.
  19. :param config_file: Config file with license patterns and scanner settings
  20. """
  21. DEFAULT_CONFIG_FILE = 'scanner_config.json'
  22. DEFAULT_EXCLUDE_FILE = '.gitignore'
  23. DEFAULT_PACKAGE_INFO_FILE = 'PackageInfo.json'
  24. def __init__(self, config_file=None):
  25. self.config_file = config_file
  26. self.config_data = self._load_config()
  27. self.file_regex = self._load_file_regex(self.config_data['license_patterns'])
  28. self.package_info = self._load_file_regex(self.config_data['package_patterns'])
  29. self.excluded_directories = self._load_file_regex(self.config_data['excluded_directories'])
  30. def _load_config(self):
  31. """Load config from the provided file. Sets default file if one is not provided."""
  32. if not self.config_file:
  33. script_directory = os.path.dirname(os.path.abspath(__file__)) # Default file expected in same dir as script
  34. self.config_file = os.path.join(script_directory, self.DEFAULT_CONFIG_FILE)
  35. try:
  36. with open(self.config_file) as f:
  37. return json.load(f)
  38. except FileNotFoundError:
  39. print('Config file cannot be found')
  40. raise
  41. def _load_file_regex(self, patterns):
  42. """Returns regex object with case-insensitive matching from the list of filename patterns."""
  43. regex_patterns = []
  44. for pattern in patterns:
  45. regex_patterns.append(fnmatch.translate(pattern))
  46. if not regex_patterns:
  47. print(f'Warning: No patterns from {patterns} found')
  48. return None
  49. return re.compile('|'.join(regex_patterns), re.IGNORECASE)
  50. def scan(self, paths=os.curdir):
  51. """Scan directory tree for filenames matching file_regex, package info, and exclusion files.
  52. :param paths: Paths of the directory to run scanner
  53. :return: Package paths and their corresponding file contents
  54. :rtype: Ordered dict
  55. """
  56. files = 0
  57. matching_files = OrderedDict()
  58. excluded_directories = None
  59. if not self.package_info:
  60. self.package_info = self.DEFAULT_PACKAGE_INFO_FILE
  61. if not self.excluded_directories:
  62. print(f'No excluded directory in config, looking for {self.DEFAULT_EXCLUDE_FILE} instead')
  63. for path in paths:
  64. for dirpath, dirnames, filenames in os.walk(path, topdown=True):
  65. dirnames.sort(key=str.casefold) # Ensure that results are sorted
  66. for file in filenames:
  67. if self.file_regex.match(file) or self.package_info.match(file):
  68. file_path = os.path.join(dirpath, file)
  69. matching_file_content = self._get_file_contents(file_path)
  70. matching_files[file_path] = matching_file_content
  71. files += 1
  72. print(f'Matching file: {file_path}')
  73. if self.package_info.match(file):
  74. dirnames[:] = [] # Stop scanning subdirectories if package info file found
  75. if self.DEFAULT_EXCLUDE_FILE in file and not self.excluded_directories:
  76. ignore_list = self._get_file_contents(os.path.join(dirpath, file)).splitlines()
  77. ignore_list.append('.git') # .gitignore doesn't usually have .git in its exclusions
  78. excluded_directories = self._load_file_regex(ignore_list)
  79. # Remove directories that should not be scanned
  80. if self.excluded_directories:
  81. excluded_directories = self.excluded_directories
  82. for dir in dirnames:
  83. if excluded_directories.match(dir):
  84. dirnames.remove(dir)
  85. print(f'{files} files found.')
  86. return matching_files
  87. def _get_file_contents(self, filepath):
  88. try:
  89. with open(filepath, encoding='utf8') as f:
  90. return f.read()
  91. except UnicodeDecodeError:
  92. print(f'Unable to read file: {filepath}')
  93. pass
  94. def create_license_file(self, licenses, filepath='NOTICES.txt'):
  95. """Creates file with all the provided license file contents.
  96. :param licenses: Dict with package paths and their corresponding license file contents
  97. :param filepath: Path to write the file
  98. """
  99. license_separator = '------------------------------------'
  100. with open(filepath, 'w', encoding='utf8') as lf:
  101. for directory, license in licenses.items():
  102. if not self.package_info.match(os.path.basename(directory)):
  103. license_output = '\n\n'.join([
  104. f'{license_separator}',
  105. f'Package path: {os.path.relpath(directory)}',
  106. 'License:',
  107. f'{license}\n'
  108. ])
  109. lf.write(license_output)
  110. return None
  111. def create_package_file(self, packages, filepath='SPDX-Licenses.json', get_contents=False):
  112. """Creates file with all the provided SPDX package info summaries in json.
  113. Optional dirpath parameter will follow the license file path in the package info and return its contents in a dictionary
  114. :param licenses: Dict with package info paths and their corresponding file contents
  115. :param filepath: Path to write the file
  116. :param dirpath: Root path for packages
  117. :rtype: Ordered dict
  118. """
  119. licenses = OrderedDict()
  120. package_json = []
  121. with open(filepath, 'w', encoding='utf8') as pf:
  122. for directory, package in packages.items():
  123. if self.package_info.match(os.path.basename(directory)):
  124. package_obj = json.loads(package)
  125. package_json.append(package_obj)
  126. if get_contents:
  127. license_path = os.path.join(os.path.dirname(directory), pathlib.Path(package_obj['LicenseFile']))
  128. licenses[license_path] = self._get_file_contents(license_path)
  129. else:
  130. licenses[directory] = package
  131. pf.write(json.dumps(package_json, indent=4))
  132. return licenses
  133. def parse_args():
  134. parser = argparse.ArgumentParser(
  135. description='Script to run LicenseScanner and generate license file')
  136. parser.add_argument('--config-file', '-c', type=pathlib.Path, help='Config file for LicenseScanner')
  137. parser.add_argument('--license-file-path', '-l', type=pathlib.Path, help='Create license file in the provided path')
  138. parser.add_argument('--package-file-path', '-p', type=pathlib.Path, help='Create package summary file in the provided path')
  139. parser.add_argument('--scan-path', '-s', default=os.curdir, type=pathlib.Path, nargs='+', help='Path to scan, multiple space separated paths can be used')
  140. return parser.parse_args()
  141. def main():
  142. try:
  143. args = parse_args()
  144. ls = LicenseScanner(args.config_file)
  145. scanned_path_data = ls.scan(args.scan_path)
  146. if args.license_file_path:
  147. ls.create_license_file(scanned_path_data, args.license_file_path)
  148. if args.package_file_path:
  149. ls.create_package_file(scanned_path_data, args.package_file_path)
  150. if args.license_file_path and args.package_file_path:
  151. license_files = ls.create_package_file(scanned_path_data, args.package_file_path, True)
  152. ls.create_license_file(license_files, args.license_file_path)
  153. except FileNotFoundError as e:
  154. print(f'Type: {type(e).__name__}, Error: {e}')
  155. return 1
  156. if __name__ == '__main__':
  157. sys.exit(main())