123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418 |
- # unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
- #
- # Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 3 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- # Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this library; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
# Utilities
###########

# Report MESSAGE on standard error, prefixed with the script name, and
# terminate the program with exit code STATUS.  STATUS is also stored
# in the global 'exit_status', which the END rule inspects.
function die(status, message)
{
    exit_status = status;
    print "unidata_to_charset.awk:", message | "cat 1>&2";
    exit status;
}
# Parse the string S as a hexadecimal number and return its value.
#
# Upper- and lowercase digits are both accepted.  The program dies with
# status 1 if S is empty or contains a character that is not a
# hexadecimal digit.
#
# R, C, B, I and N are local variables that need not be set by callers.
# (I in particular must be local: the rules of this script use a global
# 'i' as a loop counter, and this function must not disturb it.)
#
# Most Awk implementations have an 'strtonum' function that we could
# use, but it is not part of POSIX.
function hex(s,    r, c, b, i, n) {
    n = length(s);
    if (n == 0) {
        die(1, "Cannot parse empty string as hexadecimal.");
    }
    r = 0;
    for (i = 1; i <= n; i++) {
        c = substr(s, i, 1);
        # index() gives the 1-based position of the digit in the
        # string below, or 0 if C is not a hexadecimal digit at all.
        b = index("0123456789ABCDEF", toupper(c));
        if (b == 0) {
            die(1, "Invalid hexadecimal character: " c);
        }
        r = r * 16 + (b - 1);
    }
    return r;
}
# Program initialization
########################

BEGIN {
    # Input columns are delimited by semicolons.
    FS = ";";

    # Stays zero unless die() is called; the END rule checks it.
    exit_status = 0;

    # Names of every charset we produce, in output order.
    charset_names = "lower_case upper_case title_case letter digit";
    charset_names = charset_names " hex_digit letter_plus_digit graphic";
    charset_names = charset_names " whitespace printing iso_control";
    charset_names = charset_names " punctuation symbol blank ascii empty";
    charset_names = charset_names " designated";

    # split() numbers its pieces from 1, whereas 'all_charsets' is
    # indexed from 0, so shift every name down by one while copying.
    # The same pass gives each charset an empty state: no range open
    # (start and end are -1) and no finished ranges recorded.
    all_charsets_count = split(charset_names, name_list, " ");
    for (n = 1; n <= all_charsets_count; n++) {
        cs = name_list[n];
        all_charsets[n - 1] = cs;
        state[cs, "start"] = -1;
        state[cs, "end"] = -1;
        state[cs, "count"] = 0;
    }
}
# Comments
##########

# Records that start with '#' are comments.  Ignoring them makes it
# possible to keep a copyright notice inside the data file.
$0 ~ /^#/ {
    next
}
# Record initialization
#######################

# Runs for every data record: give the interesting fields readable
# names and reset the per-record bookkeeping.
{
    # A record describes a single codepoint unless the range rule
    # below extends 'codepoint_end'.
    codepoint_end = codepoint = hex($1);

    name      = $2;
    category  = $3;
    uppercase = $13;
    lowercase = $14;

    # No charset has claimed this record yet.
    charset_count = 0;
}
# Some pairs of lines in UnicodeData.txt delimit ranges of characters:
# the name on the first line ends in "First>" and the name on the
# second line is the same except that it ends in "Last>".  When we see
# the first line, consume the second one here and take its codepoint
# as the end of the range.  The other variables set during record
# initialization (name, category, ...) keep the first line's values.
name ~ /First>$/ {
    # getline returns 1 on success, 0 at end of input and -1 on error;
    # without this check a truncated file would leave $0 untouched.
    if ((getline) <= 0) {
        die(1, "Unterminated range in Unicode data.");
    }
    last_name = name;
    sub(/First>$/, "Last>", last_name);
    if (last_name != $2) {
        # die() records the exit status and exits by itself.
        die(1, "Invalid range in Unicode data.");
    }
    codepoint_end = hex($1);
}
# Character set predicates
##########################

## The lower_case character set
###############################

# We follow Java's specification for Unicode.  A character is lowercase
# when all of the following hold:
#   * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
#   * the Unicode attribute table gives no lowercase mapping for it;
#   * the table gives an uppercase mapping for it, or its name in the
#     table contains "SMALL LETTER" or "SMALL LIGATURE".
!(codepoint >= 8192 && codepoint <= 12287) &&
    lowercase == "" &&
    (name ~ /SMALL (LETTER|LIGATURE)/ || uppercase != "") {
    charsets[charset_count] = "lower_case";
    charset_count++;
}
## The upper_case character set
###############################

# We follow Java's specification for Unicode.  A character is uppercase
# when all of the following hold:
#   * it lies outside the range [U+2000,U+2FFF] ([8192,12287]);
#   * the Unicode attribute table gives no uppercase mapping for it
#     (which rules out titlecase characters);
#   * the table gives a lowercase mapping for it, or its name in the
#     table contains "CAPITAL LETTER" or "CAPITAL LIGATURE".
!(codepoint >= 8192 && codepoint <= 12287) &&
    uppercase == "" &&
    (name ~ /CAPITAL (LETTER|LIGATURE)/ || lowercase != "") {
    charsets[charset_count] = "upper_case";
    charset_count++;
}
## The title_case character set
###############################

# Titlecase characters are exactly those whose category in the
# character attribute database is Lt.
{
    if (category == "Lt") {
        charsets[charset_count++] = "title_case";
    }
}
## The letter character set
###########################

# Letters are the characters whose category in the Unicode character
# database is one of the letter categories: Lu, Ll, Lt, Lm or Lo.
# Every letter also belongs to the "letter_plus_digit" union.
category ~ /^L[ultmo]$/ {
    charsets[charset_count] = "letter";
    charsets[charset_count + 1] = "letter_plus_digit";
    charset_count += 2;
}
## The digit character set
##########################

# Digits are the characters whose category in the character attribute
# database is Nd.  Latin-1 and ASCII only have 0123456789, but Unicode
# has further digits in other code blocks, for instance the Gujarati
# and Tibetan digits.  Every digit also belongs to the
# "letter_plus_digit" union.
{
    if (category == "Nd") {
        charsets[charset_count++] = "digit";
        charsets[charset_count++] = "letter_plus_digit";
    }
}
## The hex_digit character set
##############################

# The hex digits are precisely 0123456789abcdefABCDEF, i.e. the
# codepoints 48-57, 65-70 and 97-102.
{
    is_hex_digit = (codepoint >= 48 && codepoint <= 57) ||
                   (codepoint >= 65 && codepoint <= 70) ||
                   (codepoint >= 97 && codepoint <= 102);
    if (is_hex_digit) {
        charsets[charset_count++] = "hex_digit";
    }
}
## The graphic character set
############################

# Graphic characters are the ones that would 'use ink' when printed:
# any category containing one of the capital letters L, M, N, P or S.
# Every graphic character also belongs to the "printing" union.
category ~ /[LMNPS]/ {
    charsets[charset_count] = "graphic";
    charsets[charset_count + 1] = "printing";
    charset_count += 2;
}
## The whitespace character set
###############################

# Whitespace consists of
#   * the characters in the space, line or paragraph separator
#     categories (Zs, Zl or Zp) of the Unicode character database, and
#   * the codepoints 9 through 13:
#       U+0009 (09) Horizontal tabulation (\t control-I)
#       U+000A (10) Line feed             (\n control-J)
#       U+000B (11) Vertical tabulation   (\v control-K)
#       U+000C (12) Form feed             (\f control-L)
#       U+000D (13) Carriage return       (\r control-M)
# Every whitespace character also belongs to the "printing" union.
(codepoint >= 9 && codepoint <= 13) || category ~ /Z[slp]/ {
    charsets[charset_count] = "whitespace";
    charsets[charset_count + 1] = "printing";
    charset_count += 2;
}
## The iso_control character set
################################

# ISO control characters are the Unicode/Latin-1 characters in the two
# ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
{
    in_c0 = (codepoint >= 0 && codepoint <= 31);
    in_c1 = (codepoint >= 127 && codepoint <= 159);
    if (in_c0 || in_c1) {
        charsets[charset_count++] = "iso_control";
    }
}
## The punctuation character set
################################

# Punctuation characters are those with one of the punctuation
# categories of the Unicode character database (Pc, Pd, Ps, Pe, Pi, Pf
# or Po); the test below looks for a capital P in the category.
#
# Note that srfi-14 gives conflicting requirements!!  It claims that
# only the Unicode punctuation is necessary, yet it explicitly calls
# out the soft hyphen character (U+00AD) as punctuation.  Current
# versions of Unicode consider U+00AD to be a formatting character,
# not punctuation.
index(category, "P") > 0 {
    charsets[charset_count] = "punctuation";
    charset_count++;
}
## The symbol character set
###########################

# Symbols are the characters with one of the symbol categories of the
# Unicode character database (Sm, Sc, Sk or So); the test below looks
# for a capital S in the category.
index(category, "S") > 0 {
    charsets[charset_count] = "symbol";
    charset_count++;
}
## The blank character set
##########################

# Blanks are horizontal whitespace, namely
#   * U+0009 (9) Horizontal tabulation (\t control-I), and
#   * the characters in the space separator category (Zs) of the
#     Unicode character database.
codepoint == 9 || category ~ /Zs/ {
    charsets[charset_count] = "blank";
    charset_count++;
}
## The ascii character set
##########################

# Codepoints up to and including 127.
{
    if (codepoint <= 127) {
        charsets[charset_count++] = "ascii";
    }
}
## The designated character set
###############################

# Designated covers every character except the surrogates, i.e. every
# record whose category does not contain "Cs".
index(category, "Cs") == 0 {
    charsets[charset_count] = "designated";
    charset_count++;
}
- ## Other character sets
- #######################
- # Note that the "letter_plus_digit" and "printing" character sets, which
- # are unions of other character sets, are included in the patterns
- # matching their constituent parts (i.e., the "letter_plus_digit"
- # character set is included as part of the "letter" and "digit"
- # patterns).
- #
- # Also, the "empty" character set is computed by doing precisely nothing!
# Keeping track of state
########################

# For every charset that claimed the current record, fold the record's
# codepoints [codepoint, codepoint_end] into that charset's state.
# Each charset has at most one "open" range (state[cs, "start"] and
# state[cs, "end"]; start is -1 when none is open) plus a list of
# finished ranges in state[cs, "ranges", N, 0..1].
{
    for (k = 0; k < charset_count; k++) {
        cs = charsets[k];

        # An open range that does not end right before this codepoint
        # cannot be extended: move it to the finished list and close it.
        if (state[cs, "start"] != -1 && state[cs, "end"] + 1 != codepoint) {
            slot = state[cs, "count"]++;
            state[cs, "ranges", slot, 0] = state[cs, "start"];
            state[cs, "ranges", slot, 1] = state[cs, "end"];
            state[cs, "start"] = -1;
        }

        # With no range open, this record begins a new one.
        if (state[cs, "start"] == -1) {
            state[cs, "start"] = codepoint;
        }

        # Whether new or extended, the open range now ends here.
        state[cs, "end"] = codepoint_end;
    }
}
# Printing and error handling
#############################

END {
    # An 'exit' statement normally still runs the END rules before the
    # program really stops.  If die() was called, 'exit_status' is
    # non-zero, and we exit again right away so that nothing below is
    # printed.
    if (exit_status != 0) {
        exit exit_status;
    }

    # File header.
    print "/* srfi-14.i.c -- standard SRFI-14 character set data */";
    print "";
    print "/* This file is #include'd by srfi-14.c. */";
    print "";
    print "/* This file was generated from";
    print " https://unicode.org/Public/UNIDATA/UnicodeData.txt";
    print " with the unidata_to_charset.awk script. */";
    print "";

    # One range table and one length constant per charset.
    for (n = 0; n < all_charsets_count; n++) {
        cs = all_charsets[n];

        # A range that is still open has not been recorded yet; add it
        # to the finished list so that it is printed as well.
        if (state[cs, "start"] != -1) {
            slot = state[cs, "count"]++;
            state[cs, "ranges", slot, 0] = state[cs, "start"];
            state[cs, "ranges", slot, 1] = state[cs, "end"];
        }
        total = state[cs, "count"];

        print "static const scm_t_char_range cs_" cs "_ranges[] = {";
        # Every entry except the last is followed by a comma.
        for (r = 0; r + 1 < total; r++) {
            printf(" {0x%04x, 0x%04x},\n",
                   state[cs, "ranges", r, 0], state[cs, "ranges", r, 1]);
        }
        if (total > 0) {
            printf(" {0x%04x, 0x%04x}\n",
                   state[cs, "ranges", total - 1, 0],
                   state[cs, "ranges", total - 1, 1]);
        }
        print "};";
        print "";
        printf("static const size_t cs_%s_len = %d;\n", cs, total);

        # Blank line between charsets, but not after the last one.
        if (n + 1 < all_charsets_count) {
            print "";
        }
    }
}
- # And we're done.
|