123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
/*
 * Check a UTF-8 string to ensure every character in it is part of the
 * version of Unicode that we understand.
 *
 * (If it isn't, then we don't know what combining properties it has,
 * so we can't safely normalise it to NFC and rely on the result not
 * changing when we later update our Unicode version.)
 */
- #include "misc.h"
- #include "unicode/version.h"
- static bool known(unsigned c)
- {
- struct range {
- unsigned start, end;
- };
- static const struct range ranges[] = {
- #include "unicode/known_chars.h"
- };
- const struct range *start = ranges, *end = start + lenof(ranges);
- while (end > start) {
- const struct range *curr = start + (end-start) / 2;
- if (c < curr->start)
- end = curr;
- else if (c > curr->end)
- start = curr + 1;
- else
- return true;
- }
- return false;
- };
- char *utf8_unknown_char(ptrlen input)
- {
- BinarySource src[1];
- BinarySource_BARE_INIT_PL(src, input);
- for (size_t nchars = 0; get_avail(src); nchars++) {
- DecodeUTF8Failure err;
- unsigned c = decode_utf8(src, &err);
- if (err != DUTF8_SUCCESS)
- return dupprintf(
- "cannot normalise this string: UTF-8 decoding error "
- "at character position %"SIZEu", byte position %"SIZEu": %s",
- nchars, src->pos, decode_utf8_error_strings[err]);
- if (!known(c))
- return dupprintf(
- "cannot stably normalise this string: code point %04X "
- "(at character position %"SIZEu", byte position %"SIZEu") "
- "is not in Unicode %s", c, nchars, src->pos,
- UNICODE_VERSION_SHORT);
- }
- return NULL;
- }
|