unicode-norm.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. #include <stdio.h>
  2. #include <string.h>
  3. #include "misc.h"
  4. typedef uint32_t uchar;
  5. typedef int cclass_t;
  6. /* A local uchar-oriented analogue of strbuf */
  7. typedef struct ucharbuf {
  8. uchar *buf;
  9. size_t len, size;
  10. } ucharbuf;
  11. static ucharbuf *ucharbuf_new(void)
  12. {
  13. ucharbuf *ub = snew(ucharbuf);
  14. ub->buf = NULL;
  15. ub->len = ub->size = 0;
  16. return ub;
  17. }
  18. static void ucharbuf_append(ucharbuf *ub, uchar c)
  19. {
  20. /* Use the _nm variant because this is used for passphrases */
  21. sgrowarray_nm(ub->buf, ub->size, ub->len);
  22. ub->buf[ub->len++] = c;
  23. }
  24. static void ucharbuf_free(ucharbuf *ub)
  25. {
  26. if (ub->buf) {
  27. memset(ub->buf, 0, ub->size * sizeof(*ub->buf));
  28. sfree(ub->buf);
  29. }
  30. sfree(ub);
  31. }
  32. /*
  33. * Constants relating to the arithmetic decomposition mapping of
  34. * Hangul to jamo, from section 3.12 of Unicode 15.0.0. The following
  35. * constant names match those in the spec.
  36. */
  37. enum {
  38. SBase = 0xAC00, /* base index for precomposed Hangul */
  39. LBase = 0x1100, /* base index for L (leading consonant) jamo */
  40. VBase = 0x1161, /* base index for V (vowel) jamo */
  41. TBase = 0x11A7, /* base index for T (trailing consonant) jamo */
  42. LCount = 19, /* number of L jamo */
  43. VCount = 21, /* number of V jamo */
  44. TCount = 28, /* number of T jamo, including not having one at all */
  45. NCount = VCount * TCount, /* number of Hangul for each L jamo */
  46. SCount = LCount * NCount, /* number of Hangul in total */
  47. };
  48. static cclass_t combining_class(uchar c)
  49. {
  50. struct range {
  51. uchar start, end;
  52. cclass_t cclass;
  53. };
  54. static const struct range ranges[] = {
  55. #include "unicode/combining_classes.h"
  56. };
  57. const struct range *start = ranges, *end = start + lenof(ranges);
  58. while (end > start) {
  59. const struct range *curr = start + (end-start) / 2;
  60. if (c < curr->start)
  61. end = curr;
  62. else if (c > curr->end)
  63. start = curr + 1;
  64. else
  65. return curr->cclass;
  66. }
  67. return 0;
  68. };
  69. static unsigned decompose_char(uchar c, uchar *out)
  70. {
  71. struct decomp {
  72. uchar composed, dec0, dec1;
  73. };
  74. static const struct decomp decomps[] = {
  75. #include "unicode/canonical_decomp.h"
  76. };
  77. if (c - SBase < SCount) {
  78. /* Arithmetically decompose a Hangul character into jamo */
  79. uchar SIndex = c - SBase;
  80. uchar LIndex = SIndex / NCount;
  81. uchar VIndex = SIndex % NCount / TCount;
  82. uchar TIndex = SIndex % TCount;
  83. unsigned n = 0;
  84. out[n++] = LBase + LIndex;
  85. out[n++] = VBase + VIndex;
  86. if (TIndex)
  87. out[n++] = TBase + TIndex;
  88. return n;
  89. }
  90. const struct decomp *start = decomps, *end = start + lenof(decomps);
  91. while (end > start) {
  92. const struct decomp *curr = start + (end-start) / 2;
  93. if (c < curr->composed)
  94. end = curr;
  95. else if (c > curr->composed)
  96. start = curr + 1;
  97. else {
  98. out[0] = curr->dec0;
  99. if (curr->dec1) {
  100. out[1] = curr->dec1;
  101. return 2;
  102. } else {
  103. return 1;
  104. }
  105. }
  106. }
  107. return 0;
  108. };
  109. static uchar compose_chars(uchar S, uchar C)
  110. {
  111. struct comp {
  112. uchar dec0, dec1, composed;
  113. };
  114. static const struct comp comps[] = {
  115. #include "unicode/canonical_comp.h"
  116. };
  117. if (S - LBase < LCount && C - VBase < VCount) {
  118. /* Arithmetically compose an L and V jamo into a Hangul LV
  119. * character */
  120. return SBase + (S - LBase) * NCount + (C - VBase) * TCount;
  121. }
  122. if (S - SBase < SCount && (S - SBase) % TCount == 0 &&
  123. C - TBase < TCount) {
  124. /* Arithmetically compose an LV Hangul character and a T jamo
  125. * into a Hangul LVT character */
  126. return S + C - TBase;
  127. }
  128. const struct comp *start = comps, *end = start + lenof(comps);
  129. while (end > start) {
  130. const struct comp *curr = start + (end-start) / 2;
  131. if (S < curr->dec0)
  132. end = curr;
  133. else if (S > curr->dec0)
  134. start = curr + 1;
  135. else if (C < curr->dec1)
  136. end = curr;
  137. else if (C > curr->dec1)
  138. start = curr + 1;
  139. else
  140. return curr->composed;
  141. }
  142. return 0;
  143. };
  144. /*
  145. * Recursively decompose a sequence of Unicode characters. The output
  146. * is written to 'out', as a sequence of native-byte-order uchar.
  147. */
  148. static void recursively_decompose(const uchar *str, size_t len, ucharbuf *out)
  149. {
  150. uchar decomposed[3];
  151. while (len-- > 0) {
  152. uchar c = *str++;
  153. unsigned n = decompose_char(c, decomposed);
  154. if (n == 0) {
  155. /* This character is indecomposable */
  156. ucharbuf_append(out, c);
  157. } else {
  158. /* This character has been decomposed into up to 3
  159. * characters, so we must now recursively decompose those */
  160. recursively_decompose(decomposed, n, out);
  161. }
  162. }
  163. }
  164. /*
  165. * Reorder combining marks according to the Canonical Ordering
  166. * Algorithm (definition D109 in Unicode 15.0.0 section 3.11).
  167. *
  168. * The algorithm is phrased mechanistically, but the essence is: among
  169. * any contiguous sequence of combining marks (that is, characters
  170. * with cclass > 0), sort them by their cclass - but _stably_, i.e.
  171. * breaking ties in cclass by preserving the original order of the
  172. * characters in question.
  173. */
  174. static void canonical_ordering(uchar *str, size_t len)
  175. {
  176. for (size_t i = 1; i < len; i++) {
  177. cclass_t cclass = combining_class(str[i]);
  178. if (cclass == 0)
  179. continue;
  180. size_t j = i;
  181. while (j > 0 && combining_class(str[j-1]) > cclass) {
  182. uchar tmp = str[j-1];
  183. str[j-1] = str[j];
  184. str[j] = tmp;
  185. j--;
  186. }
  187. }
  188. }
  189. /*
  190. * Canonically recompose characters according to the Canonical
  191. * Composition Algorithm (definition D117 in Unicode 15.0.0 section
  192. * 3.11).
  193. */
  194. static size_t canonical_composition(uchar *str, size_t len)
  195. {
  196. const uchar *in = str;
  197. uchar *out = str;
  198. uchar *last_starter = NULL;
  199. cclass_t highest_cclass_between = -1;
  200. while (len > 0) {
  201. len--;
  202. uchar c = *in++;
  203. cclass_t cclass = combining_class(c);
  204. if (last_starter && highest_cclass_between < cclass) {
  205. uchar composed = compose_chars(*last_starter, c);
  206. if (composed) {
  207. *last_starter = composed;
  208. continue;
  209. }
  210. }
  211. if (cclass == 0) {
  212. last_starter = out;
  213. highest_cclass_between = -1;
  214. } else if (cclass > highest_cclass_between) {
  215. highest_cclass_between = cclass;
  216. }
  217. *out++ = c;
  218. }
  219. return out - str;
  220. }
  221. /*
  222. * Render a string into NFD.
  223. */
  224. static ucharbuf *nfd(ucharbuf *input)
  225. {
  226. ucharbuf *output = ucharbuf_new();
  227. /*
  228. * Definition D118 in Unicode 15.0.0 section 3.11, referring to
  229. * D68 in section 3.7: recursively decompose characters, then
  230. * reorder combining marks.
  231. */
  232. recursively_decompose(input->buf, input->len, output);
  233. canonical_ordering(output->buf, output->len);
  234. return output;
  235. }
  236. /*
  237. * Render a string into NFC.
  238. */
  239. static ucharbuf *nfc(ucharbuf *input)
  240. {
  241. /*
  242. * Definition D120 in Unicode 15.0.0 section 3.11: render the
  243. * string into NFD, then apply the canonical composition algorithm.
  244. */
  245. ucharbuf *output = nfd(input);
  246. output->len = canonical_composition(output->buf, output->len);
  247. return output;
  248. }
  249. /*
  250. * Convert a UTF-8 string into NFC, returning it as UTF-8 again.
  251. */
  252. strbuf *utf8_to_nfc(ptrlen input)
  253. {
  254. BinarySource src[1];
  255. BinarySource_BARE_INIT_PL(src, input);
  256. ucharbuf *inbuf = ucharbuf_new();
  257. while (get_avail(src))
  258. ucharbuf_append(inbuf, decode_utf8(src, NULL));
  259. ucharbuf *outbuf = nfc(inbuf);
  260. strbuf *output = strbuf_new_nm();
  261. for (size_t i = 0; i < outbuf->len; i++)
  262. put_utf8_char(output, outbuf->buf[i]);
  263. ucharbuf_free(inbuf);
  264. ucharbuf_free(outbuf);
  265. return output;
  266. }
  267. #ifdef TEST
  268. void out_of_memory(void)
  269. {
  270. fprintf(stderr, "out of memory!\n");
  271. exit(2);
  272. }
  273. static int pass, fail;
  274. static void subtest(const char *filename, int lineno, const char *subdesc,
  275. char nftype, ucharbuf *input, ucharbuf *expected)
  276. {
  277. /*
  278. * Convert input into either NFC or NFD, and check it's equal to
  279. * expected
  280. */
  281. ucharbuf *nf;
  282. switch (nftype) {
  283. case 'C':
  284. nf = nfc(input);
  285. break;
  286. case 'D':
  287. nf = nfd(input);
  288. break;
  289. default:
  290. unreachable("bad nftype");
  291. }
  292. if (nf->len == expected->len && !memcmp(nf->buf, expected->buf, nf->len)) {
  293. pass++;
  294. } else {
  295. printf("%s:%d: failed %s: NF%c([", filename, lineno, subdesc, nftype);
  296. for (size_t pos = 0; pos < input->len; pos += sizeof(uchar))
  297. printf("%s%04X", pos ? " " : "", (unsigned)input->buf[pos]);
  298. printf("]) -> [");
  299. for (size_t pos = 0; pos < nf->len; pos += sizeof(uchar))
  300. printf("%s%04X", pos ? " " : "", (unsigned)nf->buf[pos]);
  301. printf("] != [");
  302. for (size_t pos = 0; pos < expected->len; pos += sizeof(uchar))
  303. printf("%s%04X", pos ? " " : "", (unsigned)expected->buf[pos]);
  304. printf("]\n");
  305. fail++;
  306. }
  307. ucharbuf_free(nf);
  308. }
  309. static void run_tests(const char *filename, FILE *fp)
  310. {
  311. for (int lineno = 1;; lineno++) {
  312. char *line = chomp(fgetline(fp));
  313. if (!line)
  314. break;
  315. /* Strip section dividers which begin with @ */
  316. if (*line == '@') {
  317. sfree(line);
  318. continue;
  319. }
  320. /* Strip comments, if any */
  321. ptrlen pl = ptrlen_from_asciz(line);
  322. {
  323. const char *p = memchr(pl.ptr, '#', pl.len);
  324. if (p)
  325. pl.len = p - (const char *)pl.ptr;
  326. }
  327. /* Strip trailing space */
  328. while (pl.len > 0 &&
  329. (((char *)pl.ptr)[pl.len-1] == ' ' ||
  330. ((char *)pl.ptr)[pl.len-1] == '\t'))
  331. pl.len--;
  332. /* Skip empty lines */
  333. if (!pl.len) {
  334. sfree(line);
  335. continue;
  336. }
  337. /* Break up at semicolons, expecting five fields, each of
  338. * which we decode into hex code points */
  339. ucharbuf *fields[5];
  340. for (size_t i = 0; i < lenof(fields); i++) {
  341. ptrlen field = ptrlen_get_word(&pl, ";");
  342. fields[i] = ucharbuf_new();
  343. ptrlen chr;
  344. while ((chr = ptrlen_get_word(&field, " ")).len) {
  345. char *chrstr = mkstr(chr);
  346. uchar c = strtoul(chrstr, NULL, 16);
  347. sfree(chrstr);
  348. ucharbuf_append(fields[i], c);
  349. }
  350. }
  351. subtest(filename, lineno, "NFC(c1) = c2", 'C', fields[0], fields[1]);
  352. subtest(filename, lineno, "NFC(c2) = c2", 'C', fields[1], fields[1]);
  353. subtest(filename, lineno, "NFC(c3) = c2", 'C', fields[2], fields[1]);
  354. subtest(filename, lineno, "NFC(c4) = c4", 'C', fields[3], fields[3]);
  355. subtest(filename, lineno, "NFC(c5) = c4", 'C', fields[4], fields[3]);
  356. subtest(filename, lineno, "NFD(c1) = c3", 'D', fields[0], fields[2]);
  357. subtest(filename, lineno, "NFD(c2) = c3", 'D', fields[1], fields[2]);
  358. subtest(filename, lineno, "NFD(c3) = c3", 'D', fields[2], fields[2]);
  359. subtest(filename, lineno, "NFD(c4) = c5", 'D', fields[3], fields[4]);
  360. subtest(filename, lineno, "NFD(c5) = c5", 'D', fields[4], fields[4]);
  361. for (size_t i = 0; i < lenof(fields); i++)
  362. ucharbuf_free(fields[i]);
  363. sfree(line);
  364. }
  365. }
  366. int main(int argc, char **argv)
  367. {
  368. if (argc != 2) {
  369. fprintf(stderr, "test_unicode_norm: give an input file "
  370. "of tests or '-'\n");
  371. return 1;
  372. }
  373. const char *filename = argv[1];
  374. if (!strcmp(filename, "-")) {
  375. run_tests("<standard input>", stdin);
  376. } else {
  377. FILE *fp = fopen(filename, "r");
  378. if (!fp) {
  379. fprintf(stderr, "test_unicode_norm: unable to open '%s'\n",
  380. filename);
  381. return 1;
  382. }
  383. run_tests(filename, fp);
  384. fclose(fp);
  385. }
  386. printf("pass %d fail %d total %d\n", pass, fail, pass + fail);
  387. return fail != 0;
  388. }
  389. #endif