123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454 |
- #include <stdio.h>
- #include <string.h>
- #include "misc.h"
- typedef uint32_t uchar;
- typedef int cclass_t;
- /* A local uchar-oriented analogue of strbuf */
- typedef struct ucharbuf {
- uchar *buf;
- size_t len, size;
- } ucharbuf;
- static ucharbuf *ucharbuf_new(void)
- {
- ucharbuf *ub = snew(ucharbuf);
- ub->buf = NULL;
- ub->len = ub->size = 0;
- return ub;
- }
- static void ucharbuf_append(ucharbuf *ub, uchar c)
- {
- /* Use the _nm variant because this is used for passphrases */
- sgrowarray_nm(ub->buf, ub->size, ub->len);
- ub->buf[ub->len++] = c;
- }
- static void ucharbuf_free(ucharbuf *ub)
- {
- if (ub->buf) {
- memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
- sfree(ub->buf);
- }
- sfree(ub);
- }
- /*
- * Constants relating to the arithmetic decomposition mapping of
- * Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
- * constant names match those in the spec.
- */
- enum {
- SBase = 0xAC00, /* base index for precomposed Hangul */
- LBase = 0x1100, /* base index for L (leading consonant) jamo */
- VBase = 0x1161, /* base index for V (vowel) jamo */
- TBase = 0x11A7, /* base index for T (trailing consonant) jamo */
- LCount = 19, /* number of L jamo */
- VCount = 21, /* number of V jamo */
- TCount = 28, /* number of T jamo, including not having one at all */
- NCount = VCount * TCount, /* number of Hangul for each L jamo */
- SCount = LCount * NCount, /* number of Hangul in total */
- };
- static cclass_t combining_class(uchar c)
- {
- struct range {
- uchar start, end;
- cclass_t cclass;
- };
- static const struct range ranges[] = {
- #include "unicode/combining_classes.h"
- };
- const struct range *start = ranges, *end = start + lenof(ranges);
- while (end > start) {
- const struct range *curr = start + (end-start) / 2;
- if (c < curr->start)
- end = curr;
- else if (c > curr->end)
- start = curr + 1;
- else
- return curr->cclass;
- }
- return 0;
- };
- static unsigned decompose_char(uchar c, uchar *out)
- {
- struct decomp {
- uchar composed, dec0, dec1;
- };
- static const struct decomp decomps[] = {
- #include "unicode/canonical_decomp.h"
- };
- if (c - SBase < SCount) {
- /* Arithmetically decompose a Hangul character into jamo */
- uchar SIndex = c - SBase;
- uchar LIndex = SIndex / NCount;
- uchar VIndex = SIndex % NCount / TCount;
- uchar TIndex = SIndex % TCount;
- unsigned n = 0;
- out[n++] = LBase + LIndex;
- out[n++] = VBase + VIndex;
- if (TIndex)
- out[n++] = TBase + TIndex;
- return n;
- }
- const struct decomp *start = decomps, *end = start + lenof(decomps);
- while (end > start) {
- const struct decomp *curr = start + (end-start) / 2;
- if (c < curr->composed)
- end = curr;
- else if (c > curr->composed)
- start = curr + 1;
- else {
- out[0] = curr->dec0;
- if (curr->dec1) {
- out[1] = curr->dec1;
- return 2;
- } else {
- return 1;
- }
- }
- }
- return 0;
- };
- static uchar compose_chars(uchar S, uchar C)
- {
- struct comp {
- uchar dec0, dec1, composed;
- };
- static const struct comp comps[] = {
- #include "unicode/canonical_comp.h"
- };
- if (S - LBase < LCount && C - VBase < VCount) {
- /* Arithmetically compose an L and V jamo into a Hangul LV
- * character */
- return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
- }
- if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
- C - TBase < TCount) {
- /* Arithmetically compose an LV Hangul character and a T jamo
- * into a Hangul LVT character */
- return S + C - TBase;
- }
- const struct comp *start = comps, *end = start + lenof(comps);
- while (end > start) {
- const struct comp *curr = start + (end-start) / 2;
- if (S < curr->dec0)
- end = curr;
- else if (S > curr->dec0)
- start = curr + 1;
- else if (C < curr->dec1)
- end = curr;
- else if (C > curr->dec1)
- start = curr + 1;
- else
- return curr->composed;
- }
- return 0;
- };
- /*
- * Recursively decompose a sequence of Unicode characters. The output
- * is written to 'out', as a sequence of native-byte-order uchar.
- */
- static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
- {
- uchar decomposed[3];
- while (len-- > 0) {
- uchar c = *str++;
- unsigned n = decompose_char(c, decomposed);
- if (n == 0) {
- /* This character is indecomposable */
- ucharbuf_append(out, c);
- } else {
- /* This character has been decomposed into up to 3
- * characters, so we must now recursively decompose those */
- recursively_decompose(decomposed, n, out);
- }
- }
- }
- /*
- * Reorder combining marks according to the Canonical Ordering
- * Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
- *
- * The algorithm is phrased mechanistically, but the essence is: among
- * any contiguous sequence of combining marks (that is, characters
- * with cclass > 0), sort them by their cclass - but _stably_, i.e.
- * breaking ties in cclass by preserving the original order of the
- * characters in question.
- */
- static void canonical_ordering(uchar *str, size_t len)
- {
- for (size_t i = 1; i < len; i++) {
- cclass_t cclass = combining_class(str[i]);
- if (cclass == 0)
- continue;
- size_t j = i;
- while (j > 0 && combining_class(str[j-1]) > cclass) {
- uchar tmp = str[j-1];
- str[j-1] = str[j];
- str[j] = tmp;
- j--;
- }
- }
- }
- /*
- * Canonically recompose characters according to the Canonical
- * Composition Algorithm (definition D117 in Unicode 15.0.0 section
- * 3.11).
- */
- static size_t canonical_composition(uchar *str, size_t len)
- {
- const uchar *in = str;
- uchar *out = str;
- uchar *last_starter = NULL;
- cclass_t highest_cclass_between = -1;
- while (len > 0) {
- len--;
- uchar c = *in++;
- cclass_t cclass = combining_class(c);
- if (last_starter && highest_cclass_between < cclass) {
- uchar composed = compose_chars(*last_starter, c);
- if (composed) {
- *last_starter = composed;
- continue;
- }
- }
- if (cclass == 0) {
- last_starter = out;
- highest_cclass_between = -1;
- } else if (cclass > highest_cclass_between) {
- highest_cclass_between = cclass;
- }
- *out++ = c;
- }
- return out - str;
- }
- /*
- * Render a string into NFD.
- */
- static ucharbuf *nfd(ucharbuf *input)
- {
- ucharbuf *output = ucharbuf_new();
- /*
- * Definition D118 in Unicode 15.0.0 section 3.11, referring to
- * D68 in section 3.7: recursively decompose characters, then
- * reorder combining marks.
- */
- recursively_decompose(input->buf, input->len, output);
- canonical_ordering(output->buf, output->len);
- return output;
- }
- /*
- * Render a string into NFC.
- */
- static ucharbuf *nfc(ucharbuf *input)
- {
- /*
- * Definition D120 in Unicode 15.0.0 section 3.11: render the
- * string into NFD, then apply the canonical composition algorithm.
- */
- ucharbuf *output = nfd(input);
- output->len = canonical_composition(output->buf, output->len);
- return output;
- }
- /*
- * Convert a UTF-8 string into NFC, returning it as UTF-8 again.
- */
- strbuf *utf8_to_nfc(ptrlen input)
- {
- BinarySource src[1];
- BinarySource_BARE_INIT_PL(src, input);
- ucharbuf *inbuf = ucharbuf_new();
- while (get_avail(src))
- ucharbuf_append(inbuf, decode_utf8(src, NULL));
- ucharbuf *outbuf = nfc(inbuf);
- strbuf *output = strbuf_new_nm();
- for (size_t i = 0; i < outbuf->len; i++)
- put_utf8_char(output, outbuf->buf[i]);
- ucharbuf_free(inbuf);
- ucharbuf_free(outbuf);
- return output;
- }
- #ifdef TEST
- void out_of_memory(void)
- {
- fprintf(stderr, "out of memory!\n");
- exit(2);
- }
- static int pass, fail;
- static void subtest(const char *filename, int lineno, const char *subdesc,
- char nftype, ucharbuf *input, ucharbuf *expected)
- {
- /*
- * Convert input into either NFC or NFD, and check it's equal to
- * expected
- */
- ucharbuf *nf;
- switch (nftype) {
- case 'C':
- nf = nfc(input);
- break;
- case 'D':
- nf = nfd(input);
- break;
- default:
- unreachable("bad nftype");
- }
- if (nf->len == expected->len && !memcmp(nf->buf, expected->buf, nf->len)) {
- pass++;
- } else {
- printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
- for (size_t pos = 0; pos < input->len; pos += sizeof(uchar))
- printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
- printf("]) -> [");
- for (size_t pos = 0; pos < nf->len; pos += sizeof(uchar))
- printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
- printf("] != [");
- for (size_t pos = 0; pos < expected->len; pos += sizeof(uchar))
- printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
- printf("]\n");
- fail++;
- }
- ucharbuf_free(nf);
- }
- static void run_tests(const char *filename, FILE *fp)
- {
- for (int lineno = 1;; lineno++) {
- char *line = chomp(fgetline(fp));
- if (!line)
- break;
- /* Strip section dividers which begin with @ */
- if (*line == '@') {
- sfree(line);
- continue;
- }
- /* Strip comments, if any */
- ptrlen pl = ptrlen_from_asciz(line);
- {
- const char *p = memchr(pl.ptr, '#', pl.len);
- if (p)
- pl.len = p - (const char *)pl.ptr;
- }
- /* Strip trailing space */
- while (pl.len > 0 &&
- (((char *)pl.ptr)[pl.len-1] == ' ' ||
- ((char *)pl.ptr)[pl.len-1] == '\t'))
- pl.len--;
- /* Skip empty lines */
- if (!pl.len) {
- sfree(line);
- continue;
- }
- /* Break up at semicolons, expecting five fields, each of
- * which we decode into hex code points */
- ucharbuf *fields[5];
- for (size_t i = 0; i < lenof(fields); i++) {
- ptrlen field = ptrlen_get_word(&pl, ";");
- fields[i] = ucharbuf_new();
- ptrlen chr;
- while ((chr = ptrlen_get_word(&field, " ")).len) {
- char *chrstr = mkstr(chr);
- uchar c = strtoul(chrstr, NULL, 16);
- sfree(chrstr);
- ucharbuf_append(fields[i], c);
- }
- }
- subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
- subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
- subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
- subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
- subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
- subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
- subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
- subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
- subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
- subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);
- for (size_t i = 0; i < lenof(fields); i++)
- ucharbuf_free(fields[i]);
- sfree(line);
- }
- }
- int main(int argc, char **argv)
- {
- if (argc != 2) {
- fprintf(stderr, "test_unicode_norm: give an input file "
- "of tests or '-'\n");
- return 1;
- }
- const char *filename = argv[1];
- if (!strcmp(filename, "-")) {
- run_tests("<standard input>", stdin);
- } else {
- FILE *fp = fopen(filename, "r");
- if (!fp) {
- fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
- filename);
- return 1;
- }
- run_tests(filename, fp);
- fclose(fp);
- }
- printf("pass %d fail %d total %d\n", pass, fail, pass + fail);
- return fail != 0;
- }
- #endif
|