prepare_tlds.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # This Source Code Form is subject to the terms of the Mozilla Public
  2. # License, v. 2.0. If a copy of the MPL was not distributed with this
  3. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  4. import codecs
  5. import encodings.idna
  6. import re
  7. import sys
  8. """
  9. Processes a file containing effective TLD data. See the following URL for a
  10. description of effective TLDs and of the file format that this script
  11. processes (although for the latter you're better off just reading this file's
  12. short source code).
  13. https://wiki.mozilla.org/Gecko:Effective_TLD_Service
  14. """
  15. def getEffectiveTLDs(path):
  16. file = codecs.open(path, "r", "UTF-8")
  17. domains = set()
  18. while True:
  19. line = file.readline()
  20. # line always contains a line terminator unless the file is empty
  21. if len(line) == 0:
  22. raise StopIteration
  23. line = line.rstrip()
  24. # comment, empty, or superfluous line for explicitness purposes
  25. if line.startswith("//") or "." not in line:
  26. continue
  27. line = re.split(r"[ \t\n]", line, 1)[0]
  28. entry = EffectiveTLDEntry(line)
  29. domain = entry.domain()
  30. assert domain not in domains, \
  31. "repeating domain %s makes no sense" % domain
  32. domains.add(domain)
  33. yield entry
  34. def _normalizeHostname(domain):
  35. """
  36. Normalizes the given domain, component by component. ASCII components are
  37. lowercased, while non-ASCII components are processed using the ToASCII
  38. algorithm.
  39. """
  40. def convertLabel(label):
  41. if _isASCII(label):
  42. return label.lower()
  43. return encodings.idna.ToASCII(label)
  44. return ".".join(map(convertLabel, domain.split(".")))
  45. def _isASCII(s):
  46. "True if s consists entirely of ASCII characters, false otherwise."
  47. for c in s:
  48. if ord(c) > 127:
  49. return False
  50. return True
  51. class EffectiveTLDEntry:
  52. """
  53. Stores an entry in an effective-TLD name file.
  54. """
  55. _exception = False
  56. _wild = False
  57. def __init__(self, line):
  58. """
  59. Creates a TLD entry from a line of data, which must have been stripped of
  60. the line ending.
  61. """
  62. if line.startswith("!"):
  63. self._exception = True
  64. domain = line[1:]
  65. elif line.startswith("*."):
  66. self._wild = True
  67. domain = line[2:]
  68. else:
  69. domain = line
  70. self._domain = _normalizeHostname(domain)
  71. def domain(self):
  72. "The domain this represents."
  73. return self._domain
  74. def exception(self):
  75. "True if this entry's domain denotes does not denote an effective TLD."
  76. return self._exception
  77. def wild(self):
  78. "True if this entry represents a class of effective TLDs."
  79. return self._wild
  80. #################
  81. # DO EVERYTHING #
  82. #################
  83. def main(output, effective_tld_filename):
  84. """
  85. effective_tld_filename is the effective TLD file to parse.
  86. A C++ array of { domain, exception, wild } entries representing the
  87. eTLD file is then printed to output.
  88. """
  89. def boolStr(b):
  90. if b:
  91. return "true"
  92. return "false"
  93. for etld in getEffectiveTLDs(effective_tld_filename):
  94. exception = boolStr(etld.exception())
  95. wild = boolStr(etld.wild())
  96. output.write('ETLD_ENTRY("%s", %s, %s)\n' % (etld.domain(), exception, wild))
  97. if __name__ == '__main__':
  98. main(sys.stdout, sys.argv[1])