unidata_to_charset.awk 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. # unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
  2. #
  3. # Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
  4. #
  5. # This library is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU Lesser General Public
  7. # License as published by the Free Software Foundation; either
  8. # version 3 of the License, or (at your option) any later version.
  9. #
  10. # This library is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. # Lesser General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU Lesser General Public
  16. # License along with this library; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18. # Utilities
  19. ###########
  # die(status, message)
  #
  # Write MESSAGE, prefixed with the script name, to standard error and
  # terminate with exit code STATUS.  STATUS is also recorded in the
  # global 'exit_status' so that the END rule can tell a failure occurred.
  function die(status, message) {
      exit_status = status;
      # The message reaches stderr through a cat(1) pipe.
      print "unidata_to_charset.awk:", message | "cat 1>&2";
      exit status;
  }
  # hex(s)
  #
  # Return the numeric value of S, a non-empty string of hexadecimal
  # digits (0-9 and uppercase A-F).  An empty string, or any character
  # outside that set, is a fatal error reported through die().
  #
  # R, C, B, I and N are local variables that need not be set by callers;
  # listing them as extra parameters is how Awk declares locals.  I is
  # deliberately local so that calling hex() never disturbs a loop index
  # named 'i' in the caller.  Most Awk implementations have a 'strtonum'
  # function that we could use, but it is not part of POSIX.
  function hex(s, r, c, b, i, n) {
      n = length(s);
      if (n == 0) {
          die(1, "Cannot parse empty string as hexadecimal.");
      }
      r = 0;
      for (i = 1; i <= n; i++) {
          c = substr(s, i, 1);
          # A digit's value is its position in this string, minus one;
          # index() yields 0 when C is not one of the sixteen digits.
          b = index("0123456789ABCDEF", c) - 1;
          if (b < 0) {
              die(1, "Invalid hexadecimal character: " c);
          }
          r = r * 16 + b;
      }
      return r;
  }
  60. # Program initialization
  61. ########################
  BEGIN {
      # Fields of each input record are separated by semicolons.
      FS = ";";

      # Set to a non-zero value by die(); checked again in the END rule.
      exit_status = 0;

      # Names of all charsets, in the order they are emitted.  split()
      # numbers its results from 1; all_charsets is indexed from 0.
      all_charsets_count = split("lower_case upper_case title_case letter " \
                                 "digit hex_digit letter_plus_digit graphic " \
                                 "whitespace printing iso_control punctuation " \
                                 "symbol blank ascii empty designated",
                                 charset_names, " ");

      for (i = 1; i <= all_charsets_count; i++) {
          cs = charset_names[i];
          all_charsets[i - 1] = cs;

          # Per-charset state: no range open yet, no ranges recorded.
          state[cs, "start"] = -1;
          state[cs, "end"] = -1;
          state[cs, "count"] = 0;
      }
  }
  94. # Comments
  95. ##########
  # Records beginning with '#' are comments; ignoring them lets the data
  # file carry a copyright notice.
  $0 ~ /^#/ { next }
  100. # Record initialization
  101. #######################
  # Runs for every data record: give the fields we use readable names and
  # reset the per-record bookkeeping used by the rules that follow.
  {
      # No charset has claimed this record yet.
      charset_count = 0;

      name = $2;
      category = $3;
      uppercase = $13;
      lowercase = $14;

      # A record describes a single code point unless a later rule widens
      # codepoint_end.
      codepoint = hex($1);
      codepoint_end = codepoint;
  }
  # Some pairs of consecutive records in UnicodeData.txt delimit a range
  # of characters: the first record's name ends in "First>" and the
  # second carries the same name ending in "Last>".  On seeing the first
  # record, consume the second one and extend codepoint_end to its code
  # point.  NAME, CATEGORY and the case mappings keep the values taken
  # from the "First>" record.
  name ~ /First>$/ {
      # The name the partner record must carry.
      last_name = name;
      sub(/First>$/, "Last>", last_name);

      # getline loads the next record into $0 and the fields; it yields 0
      # at end of input and -1 on a read error.  die() does not return, so
      # nothing further is needed after it.
      if ((getline) <= 0 || last_name != $2) {
          die(1, "Invalid range in Unicode data.");
      }
      codepoint_end = hex($1);
  }
  126. # Character set predicates
  127. ##########################
  128. ## The lower_case character set
  129. ###############################
  # lower_case, following Java's specification.  A character is lowercase
  # when all of the following hold:
  #   * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
  #   * the Unicode attribute table gives no lowercase mapping for it;
  #   * the table gives an uppercase mapping for it, or its name contains
  #     the words "SMALL LETTER" or "SMALL LIGATURE".
  !(codepoint >= 8192 && codepoint <= 12287) &&
  lowercase == "" &&
  (name ~ /(SMALL LETTER|SMALL LIGATURE)/ || uppercase != "") {
      charsets[charset_count] = "lower_case";
      charset_count++;
  }
  145. ## The upper_case character set
  146. ###############################
  # upper_case, following Java's specification.  A character is uppercase
  # when all of the following hold:
  #   * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
  #   * the Unicode attribute table gives no uppercase mapping for it
  #     (this excludes titlecase characters);
  #   * the table gives a lowercase mapping for it, or its name contains
  #     the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
  !(codepoint >= 8192 && codepoint <= 12287) &&
  uppercase == "" &&
  (name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/ || lowercase != "") {
      charsets[charset_count] = "upper_case";
      charset_count++;
  }
  162. ## The title_case character set
  163. ###############################
  # title_case: exactly the characters whose category in the character
  # attribute database is Lt.
  {
      if (category == "Lt") {
          charsets[charset_count++] = "title_case";
      }
  }
  169. ## The letter character set
  170. ###########################
  # letter: any character with one of the letter categories (Lu, Ll, Lt,
  # Lm, Lo) in the Unicode character database.  Every letter is recorded
  # as a member of letter_plus_digit as well.
  {
      if (category == "Lu" || category == "Ll" || category == "Lt" ||
          category == "Lm" || category == "Lo") {
          charsets[charset_count] = "letter";
          charsets[charset_count + 1] = "letter_plus_digit";
          charset_count += 2;
      }
  }
  181. ## The digit character set
  182. ##########################
  # digit: characters with category Nd in the character attribute
  # database.  In Latin-1 and ASCII these are just 0123456789; Unicode
  # has further digits in other code blocks, such as Gujarati and Tibetan
  # digits.  Every digit is recorded as a member of letter_plus_digit too.
  "Nd" == category {
      charsets[charset_count] = "digit";
      charsets[charset_count + 1] = "letter_plus_digit";
      charset_count += 2;
  }
  191. ## The hex_digit character set
  192. ##############################
  # hex_digit: only 0123456789abcdefABCDEF, i.e. code points 48..57,
  # 65..70 and 97..102.
  {
      if ((codepoint >= 48 && codepoint <= 57) ||
          (codepoint >= 65 && codepoint <= 70) ||
          (codepoint >= 97 && codepoint <= 102)) {
          charsets[charset_count] = "hex_digit";
          charset_count++;
      }
  }
  199. ## The graphic character set
  200. ############################
  # graphic: characters that would 'use ink' when printed -- those whose
  # category contains L, M, N, P or S.  Graphic characters are recorded as
  # members of printing as well.
  {
      if (category ~ /L|M|N|P|S/) {
          charsets[charset_count++] = "graphic";
          charsets[charset_count++] = "printing";
      }
  }
  206. ## The whitespace character set
  207. ###############################
  # whitespace: a character is whitespace when it is either
  #   * in one of the space, line, or paragraph separator categories
  #     (Zs, Zl or Zp) of the Unicode character database, or
  #   * one of the control characters U+0009..U+000D (9..13):
  #       U+0009 horizontal tabulation (\t control-I)
  #       U+000A line feed             (\n control-J)
  #       U+000B vertical tabulation   (\v control-K)
  #       U+000C form feed             (\f control-L)
  #       U+000D carriage return       (\r control-M)
  # Whitespace characters are recorded as members of printing as well.
  (codepoint >= 9 && codepoint <= 13) ||
  category ~ /Zs|Zl|Zp/ {
      charsets[charset_count] = "whitespace";
      charsets[charset_count + 1] = "printing";
      charset_count += 2;
  }
  221. ## The iso_control character set
  222. ################################
  # iso_control: the Unicode/Latin-1 characters in the ranges
  # [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
  {
      if ((codepoint >= 0 && codepoint <= 31) ||
          (codepoint >= 127 && codepoint <= 159)) {
          charsets[charset_count++] = "iso_control";
      }
  }
  229. ## The punctuation character set
  230. ################################
  # punctuation: any character that has one of the punctuation categories
  # in the Unicode character database (Pc, Pd, Ps, Pe, Pi, Pf, or Po).
  #
  # Note that SRFI-14 gives conflicting requirements: it claims that only
  # the Unicode punctuation is necessary, yet explicitly calls out the
  # soft hyphen character (U+00AD) as punctuation.  Current versions of
  # Unicode consider U+00AD to be a formatting character, not
  # punctuation.
  {
      if (category ~ /P/) {
          charsets[charset_count] = "punctuation";
          charset_count++;
      }
  }
  242. ## The symbol character set
  243. ###########################
  # symbol: any character that has one of the symbol categories in the
  # Unicode character database (Sm, Sc, Sk, or So).
  {
      if (category ~ /S/) {
          charsets[charset_count] = "symbol";
          charset_count++;
      }
  }
  249. ## The blank character set
  250. ##########################
  # blank: horizontal whitespace.  A character is blank when it is either
  #   * in the space separator category (Zs) of the Unicode character
  #     database, or
  #   * U+0009 (9), horizontal tabulation (\t control-I).
  codepoint == 9 || category ~ /Zs/ {
      charsets[charset_count] = "blank";
      charset_count++;
  }
  258. ## The ascii character set
  259. ##########################
  # ascii: every code point up to and including 127.
  {
      if (codepoint <= 127) {
          charsets[charset_count] = "ascii";
          charset_count++;
      }
  }
  263. ## The designated character set
  264. ###############################
  # designated: all characters except for the surrogates (category Cs).
  !(category ~ /Cs/) {
      charsets[charset_count] = "designated";
      charset_count++;
  }
  269. ## Other character sets
  270. #######################
  271. # Note that the "letter_plus_digit" and "printing" character sets, which
  272. # are unions of other character sets, are included in the patterns
  273. # matching their constituent parts (i.e., the "letter_plus_digit"
  274. # character set is included as part of the "letter" and "digit"
  275. # patterns).
  276. #
  277. # Also, the "empty" character set is computed by doing precisely nothing!
  278. # Keeping track of state
  279. ########################
  # Fold the current record into the state of every charset it belongs
  # to.  Each charset has one open range [start,end]; finished ranges are
  # stored under state[cs, "ranges", n, 0] and state[cs, "ranges", n, 1].
  {
      for (i = 0; i < charset_count; i++) {
          cs = charsets[i];

          if (state[cs, "start"] != -1) {
              # The record continues the open range: just extend it.
              if (state[cs, "end"] + 1 == codepoint) {
                  state[cs, "end"] = codepoint_end;
                  continue;
              }

              # There is a gap: store the open range as finished.
              count = state[cs, "count"]++;
              state[cs, "ranges", count, 0] = state[cs, "start"];
              state[cs, "ranges", count, 1] = state[cs, "end"];
          }

          # Open a new range at this record.
          state[cs, "start"] = codepoint;
          state[cs, "end"] = codepoint_end;
      }
  }
  299. # Printing and error handling
  300. #############################
  END {
      # An exit statement normally runs the END rule before actually
      # exiting.  When a failure has been recorded in 'exit_status',
      # reissue the exit so that nothing below is printed.
      if (exit_status != 0) {
          exit exit_status;
      }

      # Header of the generated file.
      print "/* srfi-14.i.c -- standard SRFI-14 character set data */";
      print "";
      print "/* This file is #include'd by srfi-14.c. */";
      print "";
      print "/* This file was generated from";
      print " https://unicode.org/Public/UNIDATA/UnicodeData.txt";
      print " with the unidata_to_charset.awk script. */";
      print "";

      # One range table and one length constant per charset.
      for (i = 0; i < all_charsets_count; i++) {
          cs = all_charsets[i];

          # The range that was still open when input ended has not been
          # stored yet; add it so that it is included in the table.
          if (state[cs, "start"] != -1) {
              count = state[cs, "count"]++;
              state[cs, "ranges", count, 0] = state[cs, "start"];
              state[cs, "ranges", count, 1] = state[cs, "end"];
          }
          count = state[cs, "count"];

          print "static const scm_t_char_range cs_" cs "_ranges[] = {";
          for (j = 0; j < count; j++) {
              rstart = state[cs, "ranges", j, 0];
              rend = state[cs, "ranges", j, 1];
              # The last entry is written without a trailing comma.
              if (j + 1 >= count) {
                  printf(" {0x%04x, 0x%04x}\n", rstart, rend);
              } else {
                  printf(" {0x%04x, 0x%04x},\n", rstart, rend);
              }
          }
          print "};";
          print "";
          printf("static const size_t cs_%s_len = %d;\n", cs, count);

          # Blank line between charsets, but not after the last one.
          if (i + 1 < all_charsets_count) {
              print "";
          }
      }
  }
  347. # And we're done.