prepare_tlds.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # This Source Code Form is subject to the terms of the Mozilla Public
  2. # License, v. 2.0. If a copy of the MPL was not distributed with this
  3. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  4. import codecs
  5. import encodings.idna
  6. import re
  7. import sys
  8. """
  9. Processes a file containing effective TLD data. See the following URL for a
  10. description of effective TLDs and of the file format that this script
  11. processes (although for the latter you're better off just reading this file's
  12. short source code).
  13. http://wiki.mozilla.org/Gecko:Effective_TLD_Service
  14. """
  15. def getEffectiveTLDs(path):
  16. file = codecs.open(path, "r", "UTF-8")
  17. entries = []
  18. domains = set()
  19. for line in file:
  20. # line always contains a line terminator unless the file is empty
  21. if len(line) == 0:
  22. raise StopIteration
  23. line = line.rstrip()
  24. # comment, empty, or superfluous line for explicitness purposes
  25. if line.startswith("//") or "." not in line:
  26. continue
  27. line = re.split(r"[ \t\n]", line, 1)[0]
  28. entry = EffectiveTLDEntry(line)
  29. domain = entry.domain()
  30. assert domain not in domains, \
  31. "repeating domain %s makes no sense" % domain
  32. domains.add(domain)
  33. entries.append(entry)
  34. # Sort the entries so we can use binary search on them.
  35. entries.sort(key=EffectiveTLDEntry.domain)
  36. return entries
  37. def _normalizeHostname(domain):
  38. """
  39. Normalizes the given domain, component by component. ASCII components are
  40. lowercased, while non-ASCII components are processed using the ToASCII
  41. algorithm.
  42. """
  43. def convertLabel(label):
  44. if _isASCII(label):
  45. return label.lower()
  46. return encodings.idna.ToASCII(label)
  47. return ".".join(map(convertLabel, domain.split(".")))
  48. def _isASCII(s):
  49. "True if s consists entirely of ASCII characters, false otherwise."
  50. for c in s:
  51. if ord(c) > 127:
  52. return False
  53. return True
  54. class EffectiveTLDEntry:
  55. """
  56. Stores an entry in an effective-TLD name file.
  57. """
  58. _exception = False
  59. _wild = False
  60. def __init__(self, line):
  61. """
  62. Creates a TLD entry from a line of data, which must have been stripped of
  63. the line ending.
  64. """
  65. if line.startswith("!"):
  66. self._exception = True
  67. domain = line[1:]
  68. elif line.startswith("*."):
  69. self._wild = True
  70. domain = line[2:]
  71. else:
  72. domain = line
  73. self._domain = _normalizeHostname(domain)
  74. def domain(self):
  75. "The domain this represents."
  76. return self._domain
  77. def exception(self):
  78. "True if this entry's domain denotes does not denote an effective TLD."
  79. return self._exception
  80. def wild(self):
  81. "True if this entry represents a class of effective TLDs."
  82. return self._wild
  83. #################
  84. # DO EVERYTHING #
  85. #################
  86. def main(output, effective_tld_filename):
  87. """
  88. effective_tld_filename is the effective TLD file to parse.
  89. A C++ array of { domain, exception, wild } entries representing the
  90. eTLD file is then printed to output.
  91. """
  92. def boolStr(b):
  93. if b:
  94. return "true"
  95. return "false"
  96. for etld in getEffectiveTLDs(effective_tld_filename):
  97. exception = boolStr(etld.exception())
  98. wild = boolStr(etld.wild())
  99. output.write('ETLD_ENTRY("%s", %s, %s)\n' % (etld.domain(), exception, wild))
  100. if __name__ == '__main__':
  101. main(sys.stdout, sys.argv[1])