123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-#
- #
- # PSL linter written in python
- #
- # Copyright 2016 Tim Rühsen (tim dot ruehsen at gmx dot de). All rights reserved.
- #
- # Permission is hereby granted, free of charge, to any person obtaining a
- # copy of this software and associated documentation files (the "Software"),
- # to deal in the Software without restriction, including without limitation
- # the rights to use, copy, modify, merge, publish, distribute, sublicense,
- # and/or sell copies of the Software, and to permit persons to whom the
- # Software is furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in
- # all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- # DEALINGS IN THE SOFTWARE.
- import sys
- import codecs
# Linter state shared by lint_psl() and the message helpers below.
# nline:    number of the input line currently being processed
# warnings: number of warnings reported so far
# errors:   number of errors reported so far
nline = warnings = errors = 0
# orig_line: text of the current line, quoted in messages when non-empty
# line:      module-level placeholder (lint_psl() works on a local 'line')
line = orig_line = ""
# When set, the next check_order() call is skipped and the flag is reset.
skip_order_check = False
def warning(msg):
    """Print a warning for the current line and count it.

    The message is prefixed with the global line number ``nline``; when the
    global ``orig_line`` is non-empty it is appended in single quotes.
    The global ``warnings`` counter is incremented by one.
    """
    global warnings

    # Only quote the offending line when there is one to show.
    if orig_line:
        quoted = ": \'" + orig_line + "\'"
    else:
        quoted = ""

    print('%d: warning: %s%s' % (nline, msg, quoted))
    warnings = warnings + 1
def error(msg):
    """Print an error for the current line and count it.

    The message is prefixed with the global line number ``nline``; when the
    global ``orig_line`` is non-empty it is appended in single quotes.
    The global ``errors`` counter is incremented by one.
    """
    global errors

    # Only quote the offending line when there is one to show.
    if orig_line:
        quoted = ": \'" + orig_line + "\'"
    else:
        quoted = ""

    print('%d: error: %s%s' % (nline, msg, quoted))
    errors = errors + 1
- # skip_order_check = True
def print_psl(list):
    """Print each domain of *list* on its own line in dotted notation.

    Every domain is a sequence of labels stored in reversed order (TLD
    first); the labels are reversed again and joined with dots for output.
    """
    for reversed_labels in list:
        dotted = ".".join(map(str, reversed(reversed_labels)))
        print(dotted)
def psl_key(s):
    """Return the sort rank of a label based on its first character.

    Args:
        s: a label (or just its first character).

    Returns:
        0 if *s* starts with '*' (wildcard), 1 if it starts with '!'
        (exception), 2 otherwise.  An empty *s* has no first character and
        is ranked like a plain label (2) instead of raising IndexError.
    """
    if not s:
        # Empty labels come from malformed rules such as '.com'; they must
        # not crash the sort key.
        return 2

    first = s[0]
    if first == '*':
        return 0
    if first == '!':
        return 1
    return 2
def check_order(group):
    """Verify that a group of domains is consistently and correctly sorted.

    *group* is a list of domains, each a list of labels in reversed order
    (TLD first).  A warning is issued when the first label differs within
    the group, and another one (followed by the expected ordering) when
    the group is not sorted.  The check is skipped when the global
    ``skip_order_check`` is set or the group has fewer than two entries;
    in that case the flag is reset.  *group* is always emptied in place
    before returning.
    """
    global skip_order_check

    def sort_key(entry):
        # order by number of labels, then wildcard/exception rank of the
        # leftmost label, then label-by-label (labels are reversed)
        return (len(entry), psl_key(entry[-1][0]), entry)

    try:
        if skip_order_check or len(group) < 2:
            skip_order_check = False
            return

        # every entry of the group has to share the same first label (TLD)
        tld = group[0][0]
        if not all(entry[0] == tld for entry in group):
            warning('Domain group TLD is not consistent')

        expected = sorted(group, key=sort_key)
        if expected == group:
            return

        warning('Incorrectly sorted group of domains')
        print(" " + str(group))
        print(" " + str(expected))
        print("Correct sorting would be:")
        print_psl(expected)
    finally:
        # the caller reuses the same list object for the next group
        del group[:]
def lint_psl(infile):
    """Parse a PSL file and perform syntax checking.

    Every finding is reported through warning() or error(), which print a
    message and increment the module-level counters; nothing is returned.
    The global ``nline`` is incremented for each input line and the global
    ``orig_line`` holds the text quoted in messages (it is reset to None
    once all lines have been processed).

    Args:
        infile: iterable yielding the lines of the list as text, for
            example an open text file.
    """
    global orig_line, nline

    PSL_FLAG_EXCEPTION = (1 << 0)
    PSL_FLAG_WILDCARD = (1 << 1)
    PSL_FLAG_ICANN = (1 << 2)    # entry of ICANN section
    PSL_FLAG_PRIVATE = (1 << 3)  # entry of PRIVATE section
    PSL_FLAG_PLAIN = (1 << 4)    # just used for PSL syntax checking

    line2number = {}  # rule (without '*.' / '!' prefix) -> line it was seen on
    line2flag = {}    # rule (without '*.' / '!' prefix) -> flags of that rule
    section = 0       # 0 outside of a section, else PSL_FLAG_ICANN/_PRIVATE
    icann_sections = 0
    private_sections = 0

    # NOTE: the per-group order check (check_order) is currently not invoked,
    # so rules are not collected into groups here.

    for line in infile:
        line = line.strip('\n')
        nline += 1

        # check for leading/trailing whitespace
        stripped = line.strip()
        if stripped != line:
            # make the offending whitespace visible in the message
            line = line.replace('\t', '\\t')
            line = line.replace('\r', '^M')
            orig_line = line
            warning('Leading/Trailing whitespace')
        orig_line = line
        line = stripped

        # empty line (end of sorted domain group)
        if not line:
            continue

        # check for section begin/end
        if line[0:2] == "//":
            is_begin = line[3:11] == "===BEGIN"
            is_end = line[3:9] == "===END"

            if section == 0:
                if line == "// ===BEGIN ICANN DOMAINS===":
                    section = PSL_FLAG_ICANN
                    icann_sections += 1
                elif line == "// ===BEGIN PRIVATE DOMAINS===":
                    section = PSL_FLAG_PRIVATE
                    private_sections += 1
                elif is_begin:
                    error('Unexpected begin of unknown section')
                elif is_end:
                    error('End of section without previous begin')
            elif section == PSL_FLAG_ICANN:
                if line == "// ===END ICANN DOMAINS===":
                    section = 0
                elif is_begin:
                    error('Unexpected begin of section')
                elif is_end:
                    error('Unexpected end of section')
            elif section == PSL_FLAG_PRIVATE:
                if line == "// ===END PRIVATE DOMAINS===":
                    section = 0
                elif is_begin:
                    error('Unexpected begin of section')
                elif is_end:
                    error('Unexpected end of section')

            continue  # processing of comments ends here

        # No rule must be outside of a section
        if section == 0:
            error('Rule outside of section')

        # decode UTF-8 input into unicode, needed only for python 2.x;
        # on python 3 encoding fails for text that is not valid UTF-8
        # (e.g. surrogate escapes produced while reading the file)
        try:
            if sys.version_info[0] < 3:
                line = line.decode('utf-8')
            else:
                line.encode('utf-8')
        except (UnicodeDecodeError, UnicodeEncodeError):
            # do not echo the undecodable line in the message
            orig_line = None
            error('Invalid UTF-8 character')
            continue

        # each rule must be lowercase (or more exactly: not uppercase and not titlecase)
        if line != line.lower():
            error('Rule must be lowercase')

        # strip leading wildcard (only a single one is accepted)
        flags = section
        if line[0:2] == '*.':
            flags |= PSL_FLAG_WILDCARD
            line = line[2:]

        # Slice instead of indexing: the rule may be empty here when the
        # input line was just '*.', which must be reported, not crash.
        if line[0:1] == '!':
            flags |= PSL_FLAG_EXCEPTION
            line = line[1:]
        else:
            flags |= PSL_FLAG_PLAIN

        # wildcard and exception must not combine
        if flags & PSL_FLAG_WILDCARD and flags & PSL_FLAG_EXCEPTION:
            error('Combination of wildcard and exception')
            continue

        labels = line.split('.')

        # an exception rule needs a wildcard rule for its parent domain
        if flags & PSL_FLAG_EXCEPTION and len(labels) > 1:
            domain = ".".join(str(label) for label in labels[1:])
            if not line2flag.get(domain, 0) & PSL_FLAG_WILDCARD:
                error('Exception without previous wildcard')

        for label in labels:
            if not label:
                error('Leading/trailing or multiple dot')
                continue

            if label[0:4] == 'xn--':
                error('Punycode found')
                continue

            if '--' in label:
                error('Double minus found')
                continue

            # allowed are a-z,0-9,- and unicode >= 128 (maybe that can be finetuned a bit !?)
            for c in label:
                if not c.isalnum() and c != '-' and ord(c) < 128:
                    error('Illegal character')
                    break

        if line in line2flag:
            # Found existing entry:
            #   Combination of exception and plain rule is contradictory
            #     !foo.bar + foo.bar
            #   Doublette, since *.foo.bar implies foo.bar:
            #      foo.bar + *.foo.bar
            #   Allowed:
            #     !foo.bar + *.foo.bar
            error('Found doublette/ambiguity (previous line was %d)' % line2number[line])

        line2number[line] = nline
        line2flag[line] = flags

    # the remaining messages are about the file as a whole, not one line
    orig_line = None

    if section == PSL_FLAG_ICANN:
        error('ICANN section not closed')
    elif section == PSL_FLAG_PRIVATE:
        error('PRIVATE section not closed')

    if icann_sections < 1:
        warning('No ICANN section found')
    elif icann_sections > 1:
        warning('%d ICANN sections found' % icann_sections)

    if private_sections < 1:
        warning('No PRIVATE section found')
    elif private_sections > 1:
        warning('%d PRIVATE sections found' % private_sections)
def usage():
    """Print the command-line usage and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print('usage: %s PSLfile' % sys.argv[0])
    print('or %s - # To read PSL from STDIN' % sys.argv[0])
    # sys.exit() instead of the bare exit(): the latter is only a helper
    # injected by the 'site' module and is missing e.g. under 'python -S'.
    sys.exit(1)
def main():
    """Check the syntax of the PSL file named on the command line.

    The last command-line argument names the file to lint; ``-`` reads the
    list from standard input.  Both sources are decoded as UTF-8 with the
    ``surrogateescape`` error handler, so undecodable bytes reach
    lint_psl() and are reported there instead of aborting the read.

    Returns:
        True if at least one error was reported, False otherwise.  The
        value is meant to be handed to sys.exit().
    """
    if len(sys.argv) < 2:
        usage()

    source = sys.argv[-1]
    if source != '-':
        infile = open(source, 'r', encoding='utf-8', errors="surrogateescape")
    else:
        try:
            # Re-open the STDIN descriptor so that it is decoded exactly like
            # a file; closefd=False keeps the process's stdin itself open.
            infile = open(sys.stdin.fileno(), 'r', encoding='utf-8',
                          errors="surrogateescape", closefd=False)
        except (AttributeError, OSError, ValueError):
            # sys.stdin was replaced by an object without a real file
            # descriptor: read from it as it is.
            infile = None

    if infile is None:
        lint_psl(sys.stdin)
    else:
        with infile:
            lint_psl(infile)

    return errors != 0
if __name__ == '__main__':
    # main() returns True when errors were found, which sys.exit() turns
    # into exit status 1 (and False into 0).
    exit_status = main()
    sys.exit(exit_status)
|