pslint.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-#
  3. #
  4. # PSL linter written in python
  5. #
  6. # Copyright 2016 Tim Rühsen (tim dot ruehsen at gmx dot de). All rights reserved.
  7. #
  8. # Permission is hereby granted, free of charge, to any person obtaining a
  9. # copy of this software and associated documentation files (the "Software"),
  10. # to deal in the Software without restriction, including without limitation
  11. # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  12. # and/or sell copies of the Software, and to permit persons to whom the
  13. # Software is furnished to do so, subject to the following conditions:
  14. #
  15. # The above copyright notice and this permission notice shall be included in
  16. # all copies or substantial portions of the Software.
  17. #
  18. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23. # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24. # DEALINGS IN THE SOFTWARE.
  25. import sys
  26. import codecs
# Module-level linter state shared by the functions below.

# Number of the input line currently being linted; lint_psl() increments it
# before processing each line, and warning()/error() print it as a prefix.
nline = 0
# Not read by any function visible in this file (lint_psl() uses a local
# variable of the same name).
line = ""
# Text of the current input line as echoed in diagnostics; a falsy value
# (empty string or None) suppresses the echo.
orig_line = ""
# Number of warnings reported so far.
warnings = 0
# Number of errors reported so far; main() turns a non-zero count into a
# failing exit status.
errors = 0
# When True, check_order() skips its check once and resets the flag.
skip_order_check = False
  33. def warning(msg):
  34. global warnings, orig_line, nline
  35. print('%d: warning: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
  36. warnings += 1
  37. def error(msg):
  38. global errors, orig_line, nline
  39. print('%d: error: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
  40. errors += 1
  41. # skip_order_check = True
  42. def print_psl(list):
  43. for domain in list:
  44. print(".".join(str(label) for label in reversed(domain)))
  45. def psl_key(s):
  46. if s[0] == '*':
  47. return 0
  48. if s[0] == '!':
  49. return 1
  50. return 2
  51. def check_order(group):
  52. """Check the correct order of a domain group"""
  53. global skip_order_check
  54. try:
  55. if skip_order_check or len(group) < 2:
  56. skip_order_check = False
  57. return
  58. # check if the TLD is the identical within the group
  59. if any(group[0][0] != labels[0] for labels in group):
  60. warning('Domain group TLD is not consistent')
  61. # sort by # of labels, label-by-label (labels are in reversed order)
  62. sorted_group = sorted(group, key = lambda labels: (len(labels), psl_key(labels[-1][0]), labels))
  63. if group != sorted_group:
  64. warning('Incorrectly sorted group of domains')
  65. print(" " + str(group))
  66. print(" " + str(sorted_group))
  67. print("Correct sorting would be:")
  68. print_psl(sorted_group)
  69. finally:
  70. del group[:]
  71. def lint_psl(infile):
  72. """Parses PSL file and performs syntax checking"""
  73. global orig_line, nline
  74. PSL_FLAG_EXCEPTION = (1<<0)
  75. PSL_FLAG_WILDCARD = (1<<1)
  76. PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
  77. PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
  78. PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
  79. line2number = {}
  80. line2flag = {}
  81. group = []
  82. section = 0
  83. icann_sections = 0
  84. private_sections = 0
  85. lines = [line.strip('\n') for line in infile]
  86. for line in lines:
  87. nline += 1
  88. # check for leadind/trailing whitespace
  89. stripped = line.strip()
  90. if stripped != line:
  91. line = line.replace('\t','\\t')
  92. line = line.replace('\r','^M')
  93. orig_line = line
  94. warning('Leading/Trailing whitespace')
  95. orig_line = line
  96. line = stripped
  97. # empty line (end of sorted domain group)
  98. if not line:
  99. # check_order(group)
  100. continue
  101. # check for section begin/end
  102. if line[0:2] == "//":
  103. # check_order(group)
  104. if section == 0:
  105. if line == "// ===BEGIN ICANN DOMAINS===":
  106. section = PSL_FLAG_ICANN
  107. icann_sections += 1
  108. elif line == "// ===BEGIN PRIVATE DOMAINS===":
  109. section = PSL_FLAG_PRIVATE
  110. private_sections += 1
  111. elif line[3:11] == "===BEGIN":
  112. error('Unexpected begin of unknown section')
  113. elif line[3:9] == "===END":
  114. error('End of section without previous begin')
  115. elif section == PSL_FLAG_ICANN:
  116. if line == "// ===END ICANN DOMAINS===":
  117. section = 0
  118. elif line[3:11] == "===BEGIN":
  119. error('Unexpected begin of section: ')
  120. elif line[3:9] == "===END":
  121. error('Unexpected end of section')
  122. elif section == PSL_FLAG_PRIVATE:
  123. if line == "// ===END PRIVATE DOMAINS===":
  124. section = 0
  125. elif line[3:11] == "===BEGIN":
  126. error('Unexpected begin of section')
  127. elif line[3:9] == "===END":
  128. error('Unexpected end of section')
  129. continue # processing of comments ends here
  130. # No rule must be outside of a section
  131. if section == 0:
  132. error('Rule outside of section')
  133. group.append(list(reversed(line.split('.'))))
  134. # decode UTF-8 input into unicode, needed only for python 2.x
  135. try:
  136. if sys.version_info[0] < 3:
  137. line = line.decode('utf-8')
  138. else:
  139. line.encode('utf-8')
  140. except (UnicodeDecodeError, UnicodeEncodeError):
  141. orig_line = None
  142. error('Invalid UTF-8 character')
  143. continue
  144. # each rule must be lowercase (or more exactly: not uppercase and not titlecase)
  145. if line != line.lower():
  146. error('Rule must be lowercase')
  147. # strip leading wildcards
  148. flags = section
  149. # while line[0:2] == '*.':
  150. if line[0:2] == '*.':
  151. flags |= PSL_FLAG_WILDCARD
  152. line = line[2:]
  153. if line[0] == '!':
  154. flags |= PSL_FLAG_EXCEPTION
  155. line = line[1:]
  156. else:
  157. flags |= PSL_FLAG_PLAIN
  158. # wildcard and exception must not combine
  159. if flags & PSL_FLAG_WILDCARD and flags & PSL_FLAG_EXCEPTION:
  160. error('Combination of wildcard and exception')
  161. continue
  162. labels = line.split('.')
  163. if flags & PSL_FLAG_EXCEPTION and len(labels) > 1:
  164. domain = ".".join(str(label) for label in labels[1:])
  165. if not domain in line2flag:
  166. error('Exception without previous wildcard')
  167. elif not line2flag[domain] & PSL_FLAG_WILDCARD:
  168. error('Exception without previous wildcard')
  169. for label in labels:
  170. if not label:
  171. error('Leading/trailing or multiple dot')
  172. continue
  173. if label[0:4] == 'xn--':
  174. error('Punycode found')
  175. continue
  176. if '--' in label:
  177. error('Double minus found')
  178. continue
  179. # allowed are a-z,0-9,- and unicode >= 128 (maybe that can be finetuned a bit !?)
  180. for c in label:
  181. if not c.isalnum() and c != '-' and ord(c) < 128:
  182. error('Illegal character')
  183. break
  184. if line in line2flag:
  185. '''Found existing entry:
  186. Combination of exception and plain rule is contradictionary
  187. !foo.bar + foo.bar
  188. Doublette, since *.foo.bar implies foo.bar:
  189. foo.bar + *.foo.bar
  190. Allowed:
  191. !foo.bar + *.foo.bar
  192. '''
  193. error('Found doublette/ambiguity (previous line was %d)' % line2number[line])
  194. line2number[line] = nline
  195. line2flag[line] = flags
  196. orig_line = None
  197. if section == PSL_FLAG_ICANN:
  198. error('ICANN section not closed')
  199. elif section == PSL_FLAG_PRIVATE:
  200. error('PRIVATE section not closed')
  201. if icann_sections < 1:
  202. warning('No ICANN section found')
  203. elif icann_sections > 1:
  204. warning('%d ICANN sections found' % icann_sections)
  205. if private_sections < 1:
  206. warning('No PRIVATE section found')
  207. elif private_sections > 1:
  208. warning('%d PRIVATE sections found' % private_sections)
  209. def usage():
  210. """Prints the usage"""
  211. print('usage: %s PSLfile' % sys.argv[0])
  212. print('or %s - # To read PSL from STDIN' % sys.argv[0])
  213. exit(1)
  214. def main():
  215. """Check syntax of a PSL file"""
  216. if len(sys.argv) < 2:
  217. usage()
  218. with sys.stdin if sys.argv[-1] == '-' else open(sys.argv[-1], 'r', encoding='utf-8', errors="surrogateescape") as infile:
  219. lint_psl(infile)
  220. return errors != 0
# Run the linter only when executed as a script. main() returns a bool that
# sys.exit() uses as the process exit status (True -> 1, False -> 0).
if __name__ == '__main__':
    sys.exit(main())