unicode-known.c 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. /*
  2. * Check a UTF-8 string to ensure every character in it is part of the
  3. * version of Unicode that we understand.
  4. *
  5. * (If it isn't, then we don't know what combining properties it has,
  6. * so we can't safely NFC it and rely on the result not changing when
  7. * we later update our Unicode version.)
  8. */
  9. #include "misc.h"
  10. #include "unicode/version.h"
  11. static bool known(unsigned c)
  12. {
  13. struct range {
  14. unsigned start, end;
  15. };
  16. static const struct range ranges[] = {
  17. #include "unicode/known_chars.h"
  18. };
  19. const struct range *start = ranges, *end = start + lenof(ranges);
  20. while (end > start) {
  21. const struct range *curr = start + (end-start) / 2;
  22. if (c < curr->start)
  23. end = curr;
  24. else if (c > curr->end)
  25. start = curr + 1;
  26. else
  27. return true;
  28. }
  29. return false;
  30. };
  31. char *utf8_unknown_char(ptrlen input)
  32. {
  33. BinarySource src[1];
  34. BinarySource_BARE_INIT_PL(src, input);
  35. for (size_t nchars = 0; get_avail(src); nchars++) {
  36. DecodeUTF8Failure err;
  37. unsigned c = decode_utf8(src, &err);
  38. if (err != DUTF8_SUCCESS)
  39. return dupprintf(
  40. "cannot normalise this string: UTF-8 decoding error "
  41. "at character position %"SIZEu", byte position %"SIZEu": %s",
  42. nchars, src->pos, decode_utf8_error_strings[err]);
  43. if (!known(c))
  44. return dupprintf(
  45. "cannot stably normalise this string: code point %04X "
  46. "(at character position %"SIZEu", byte position %"SIZEu") "
  47. "is not in Unicode %s", c, nchars, src->pos,
  48. UNICODE_VERSION_SHORT);
  49. }
  50. return NULL;
  51. }