ucaps_fetch.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python3
  2. # Script used to dump case mappings from
  3. # the Unicode Character Database to the `ucaps.h` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. import os
  7. import sys
  8. from typing import Final, List, Tuple
  9. from urllib.request import urlopen
  10. if __name__ == "__main__":
  11. sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
  12. from methods import generate_copyright_header
  13. URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt"
  14. lower_to_upper: List[Tuple[str, str]] = []
  15. upper_to_lower: List[Tuple[str, str]] = []
  16. def parse_unicode_data() -> None:
  17. lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
  18. for line in lines:
  19. split_line: List[str] = line.split(";")
  20. code_value: str = split_line[0].strip()
  21. uppercase_mapping: str = split_line[12].strip()
  22. lowercase_mapping: str = split_line[13].strip()
  23. if uppercase_mapping:
  24. lower_to_upper.append((f"0x{code_value}", f"0x{uppercase_mapping}"))
  25. if lowercase_mapping:
  26. upper_to_lower.append((f"0x{code_value}", f"0x{lowercase_mapping}"))
  27. def make_cap_table(table_name: str, len_name: str, table: List[Tuple[str, str]]) -> str:
  28. result: str = f"static const int {table_name}[{len_name}][2] = {{\n"
  29. for first, second in table:
  30. result += f"\t{{ {first}, {second} }},\n"
  31. result += "};\n\n"
  32. return result
  33. def generate_ucaps_fetch() -> None:
  34. parse_unicode_data()
  35. source: str = generate_copyright_header("ucaps.h")
  36. source += f"""
  37. #ifndef UCAPS_H
  38. #define UCAPS_H
  39. // This file was generated using the `misc/scripts/ucaps_fetch.py` script.
  40. #define LTU_LEN {len(lower_to_upper)}
  41. #define UTL_LEN {len(upper_to_lower)}\n\n"""
  42. source += make_cap_table("caps_table", "LTU_LEN", lower_to_upper)
  43. source += make_cap_table("reverse_caps_table", "UTL_LEN", upper_to_lower)
  44. source += """static int _find_upper(int ch) {
  45. \tint low = 0;
  46. \tint high = LTU_LEN - 1;
  47. \tint middle;
  48. \twhile (low <= high) {
  49. \t\tmiddle = (low + high) / 2;
  50. \t\tif (ch < caps_table[middle][0]) {
  51. \t\t\thigh = middle - 1; // Search low end of array.
  52. \t\t} else if (caps_table[middle][0] < ch) {
  53. \t\t\tlow = middle + 1; // Search high end of array.
  54. \t\t} else {
  55. \t\t\treturn caps_table[middle][1];
  56. \t\t}
  57. \t}
  58. \treturn ch;
  59. }
  60. static int _find_lower(int ch) {
  61. \tint low = 0;
  62. \tint high = UTL_LEN - 1;
  63. \tint middle;
  64. \twhile (low <= high) {
  65. \t\tmiddle = (low + high) / 2;
  66. \t\tif (ch < reverse_caps_table[middle][0]) {
  67. \t\t\thigh = middle - 1; // Search low end of array.
  68. \t\t} else if (reverse_caps_table[middle][0] < ch) {
  69. \t\t\tlow = middle + 1; // Search high end of array.
  70. \t\t} else {
  71. \t\t\treturn reverse_caps_table[middle][1];
  72. \t\t}
  73. \t}
  74. \treturn ch;
  75. }
  76. #endif // UCAPS_H
  77. """
  78. ucaps_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/ucaps.h")
  79. with open(ucaps_path, "w", newline="\n") as f:
  80. f.write(source)
  81. print("`ucaps.h` generated successfully.")
# Regenerate the header only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    generate_ucaps_fetch()