create_regex_tables 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. # Copyright (C) 2010, 2013 Apple Inc. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions
  5. # are met:
  6. # 1. Redistributions of source code must retain the above copyright
  7. # notice, this list of conditions and the following disclaimer.
  8. # 2. Redistributions in binary form must reproduce the above copyright
  9. # notice, this list of conditions and the following disclaimer in the
  10. # documentation and/or other materials provided with the distribution.
  11. #
  12. # THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  13. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  14. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  15. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
  16. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  17. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  18. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  19. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  20. # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  21. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  22. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. import sys
  24. types = {
  25. "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
  26. "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]},
  27. "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
  28. "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
  29. "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]},
  30. "digits": { "UseTable" : False, "data": [('0', '9')]},
  31. "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] }
  32. }
  33. entriesPerLine = 50
  34. arrays = "";
  35. functions = "";
  36. emitTables = (len(sys.argv) < 2 or sys.argv[1] != "--no-tables")
  37. for name, classes in types.items():
  38. ranges = [];
  39. size = 0;
  40. for _class in classes["data"]:
  41. if type(_class) == str:
  42. ranges.append((ord(_class), ord(_class)))
  43. elif type(_class) == int:
  44. ranges.append((_class, _class))
  45. else:
  46. (min, max) = _class;
  47. if type(min) == str:
  48. min = ord(min)
  49. if type(max) == str:
  50. max = ord(max)
  51. if max > 0x7f and min <= 0x7f:
  52. ranges.append((min, 0x7f))
  53. min = 0x80
  54. ranges.append((min,max))
  55. ranges.sort();
  56. if emitTables and classes["UseTable"] and (not "Inverse" in classes):
  57. array = ("static const char _%sData[65536] = {\n" % name);
  58. i = 0
  59. for (min,max) in ranges:
  60. while i < min:
  61. i = i + 1
  62. array += ('0,')
  63. if (i % entriesPerLine == 0) and (i != 0):
  64. array += ('\n')
  65. while i <= max:
  66. i = i + 1
  67. if (i == 65536):
  68. array += ("1")
  69. else:
  70. array += ('1,')
  71. if (i % entriesPerLine == 0) and (i != 0):
  72. array += ('\n')
  73. while i < 0xffff:
  74. array += ("0,")
  75. i = i + 1;
  76. if (i % entriesPerLine == 0) and (i != 0):
  77. array += ('\n')
  78. if i == 0xffff:
  79. array += ("0")
  80. array += ("\n};\n");
  81. array += ("DEFINE_REMOTE_VAR(const char *, regExp_%sData, &(_%sData[0]));\n\n" % (name,name));
  82. arrays += array
  83. # Generate createFunction:
  84. function = "";
  85. function += ("CharacterClass* %sCreate()\n" % name)
  86. function += ("{\n")
  87. if emitTables and classes["UseTable"]:
  88. if "Inverse" in classes:
  89. function += (" CharacterClass* characterClass = new CharacterClass(REMOTE_VAR_VALUE(regExp_%sData), true);\n" % (classes["Inverse"]))
  90. else:
  91. function += (" CharacterClass* characterClass = new CharacterClass(REMOTE_VAR_VALUE(regExp_%sData), false);\n" % (name))
  92. else:
  93. function += (" CharacterClass* characterClass = new CharacterClass;\n")
  94. for (min, max) in ranges:
  95. if (min == max):
  96. if (min > 127):
  97. function += (" characterClass->m_matchesUnicode.append(0x%04x);\n" % min)
  98. else:
  99. function += (" characterClass->m_matches.append(0x%02x);\n" % min)
  100. continue
  101. if (min > 127) or (max > 127):
  102. function += (" characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (min, max))
  103. else:
  104. function += (" characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (min, max))
  105. function += (" return characterClass;\n")
  106. function += ("}\n\n")
  107. functions += function
  108. if (len(sys.argv) > 1):
  109. f = open(sys.argv[-1], "w")
  110. f.write(arrays)
  111. f.write(functions)
  112. f.close()
  113. else:
  114. print(arrays)
  115. print(functions)