decode_utf8.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. /*
  2. * Decode a single UTF-8 character.
  3. */
  4. #include "putty.h"
  5. #include "misc.h"
  6. unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err)
  7. {
  8. /* Permit user to pass NULL as the err pointer */
  9. DecodeUTF8Failure dummy;
  10. if (!err) err = &dummy;
  11. /* If the source has no byte available, this will return 0, which
  12. * we'll return immediately and is a reasonable error return anyway */
  13. unsigned char c = get_byte(src);
  14. /* One-byte cases. */
  15. if (c < 0x80) {
  16. *err = DUTF8_SUCCESS;
  17. return c;
  18. } else if (c < 0xC0) {
  19. *err = DUTF8_SPURIOUS_CONTINUATION;
  20. return 0xFFFD;
  21. }
  22. unsigned long wc, min;
  23. size_t ncont;
  24. if (c < 0xE0) {
  25. wc = c & 0x1F; ncont = 1; min = 0x80;
  26. } else if (c < 0xF0) {
  27. wc = c & 0x0F; ncont = 2; min = 0x800;
  28. } else if (c < 0xF8) {
  29. wc = c & 0x07; ncont = 3; min = 0x10000;
  30. } else if (c < 0xFC) {
  31. wc = c & 0x03; ncont = 4; min = 0x200000;
  32. } else if (c < 0xFE) {
  33. wc = c & 0x01; ncont = 5; min = 0x4000000;
  34. } else {
  35. *err = DUTF8_ILLEGAL_BYTE; /* FE or FF */
  36. return 0xFFFD;
  37. }
  38. while (ncont-- > 0) {
  39. if (!get_avail(src)) {
  40. *err = DUTF8_E_OUT_OF_DATA;
  41. return 0xFFFD;
  42. }
  43. unsigned char cont = get_byte(src);
  44. if (!(0x80 <= cont && cont < 0xC0)) {
  45. BinarySource_REWIND_TO(src, src->pos - 1);
  46. *err = DUTF8_TRUNCATED_SEQUENCE;
  47. return 0xFFFD;
  48. }
  49. wc = (wc << 6) | (cont & 0x3F);
  50. }
  51. if (wc < min) {
  52. *err = DUTF8_OVERLONG_ENCODING;
  53. return 0xFFFD;
  54. }
  55. if (0xD800 <= wc && wc < 0xE000) {
  56. *err = DUTF8_ENCODED_SURROGATE;
  57. return 0xFFFD;
  58. }
  59. if (wc > 0x10FFFF) {
  60. *err = DUTF8_CODE_POINT_TOO_BIG;
  61. return 0xFFFD; /* outside Unicode range */
  62. }
  63. *err = DUTF8_SUCCESS;
  64. return wc;
  65. }
  66. const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES] = {
  67. #define MSG_ENTRY(sym, string) string,
  68. DECODE_UTF8_FAILURE_LIST(MSG_ENTRY)
  69. #undef MSG_ENTRY
  70. };
  71. #ifdef TEST
  72. #include <stdio.h>
  73. void out_of_memory(void)
  74. {
  75. fprintf(stderr, "out of memory!\n");
  76. exit(2);
  77. }
  78. static const char *const decode_utf8_error_syms[DUTF8_N_FAILURE_CODES] = {
  79. #define SYM_ENTRY(sym, string) #sym,
  80. DECODE_UTF8_FAILURE_LIST(SYM_ENTRY)
  81. #undef SYM_ENTRY
  82. };
  83. bool dotest(const char *file, int line, const char *input, size_t ninput,
  84. const unsigned long *chars, size_t nchars)
  85. {
  86. BinarySource src[1];
  87. BinarySource_BARE_INIT(src, input, ninput);
  88. size_t noutput = 0;
  89. printf("%s:%d: test start\n", file, line);
  90. while (get_avail(src)) {
  91. size_t before = src->pos;
  92. DecodeUTF8Failure err;
  93. unsigned long wc = decode_utf8(src, &err);
  94. printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput);
  95. while (before < src->pos)
  96. printf(" %02x", (unsigned)(unsigned char)(input[before++]));
  97. printf(" -> U-%08lx %s\n", wc, decode_utf8_error_syms[err]);
  98. if (noutput >= nchars) {
  99. printf("%s:%d: FAIL: expected no further output\n", file, line);
  100. return false;
  101. }
  102. if (chars[noutput] != wc) {
  103. printf("%s:%d: FAIL: expected U-%08lx\n",
  104. file, line, chars[noutput]);
  105. return false;
  106. }
  107. noutput++;
  108. DecodeUTF8Failure expected_err;
  109. if (wc == 0xFFFD) {
  110. /* In the 'chars' array, any occurrence of 0xFFFD is followed
  111. * by the expected error code */
  112. assert(noutput < nchars && "bad test data");
  113. expected_err = chars[noutput++];
  114. } else {
  115. /* Expect success status to go with any non-FFFD character */
  116. expected_err = DUTF8_SUCCESS;
  117. }
  118. if (err != expected_err) {
  119. printf("%s:%d: FAIL: expected %s\n", file, line,
  120. decode_utf8_error_syms[expected_err]);
  121. return false;
  122. }
  123. }
  124. if (noutput < nchars) {
  125. printf("%s:%d: FAIL: expected further output\n", file, line);
  126. return false;
  127. }
  128. printf("%s:%d: pass\n", file, line);
  129. return true;
  130. }
  131. #define DOTEST(input, ...) do { \
  132. static const unsigned long chars[] = { __VA_ARGS__ }; \
  133. ntest++; \
  134. if (dotest(__FILE__, __LINE__, input, sizeof(input)-1, \
  135. chars, lenof(chars))) \
  136. npass++; \
  137. } while (0)
  138. int main(void)
  139. {
  140. int ntest = 0, npass = 0;
  141. DOTEST("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5",
  142. 0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5);
  143. /* First sequence of each length */
  144. DOTEST("\x00", 0x0000);
  145. DOTEST("\xC2\x80", 0x0080);
  146. DOTEST("\xE0\xA0\x80", 0x0800);
  147. DOTEST("\xF0\x90\x80\x80", 0x00010000);
  148. DOTEST("\xF8\x88\x80\x80\x80",
  149. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00200000 */
  150. DOTEST("\xFC\x84\x80\x80\x80\x80",
  151. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x04000000 */
  152. /* Last sequence of each length */
  153. DOTEST("\x7F", 0x007F);
  154. DOTEST("\xDF\xBF", 0x07FF);
  155. DOTEST("\xEF\xBF\xBF", 0xFFFF);
  156. DOTEST("\xF7\xBF\xBF\xBF",
  157. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x001FFFFF */
  158. DOTEST("\xFB\xBF\xBF\xBF\xBF",
  159. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x03FFFFFF */
  160. DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF",
  161. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x7FFFFFFF */
  162. /* Endpoints of the surrogate range */
  163. DOTEST("\xED\x9F\xBF", 0xD7FF);
  164. DOTEST("\xED\xA0\x80", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xD800 */
  165. DOTEST("\xED\xBF\xBF", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xDFFF */
  166. DOTEST("\xEE\x80\x80", 0xE000);
  167. /* REPLACEMENT CHARACTER itself */
  168. DOTEST("\xEF\xBF\xBD", 0xFFFD, DUTF8_SUCCESS); /* FFFD but no error! */
  169. /* Endpoints of the legal Unicode range */
  170. DOTEST("\xF4\x8F\xBF\xBF", 0x0010FFFF);
  171. DOTEST("\xF4\x90\x80\x80", 0xFFFD,
  172. DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00110000 */
  173. /* Spurious continuation bytes, each shown as a separate failure */
  174. DOTEST("\x80 \x81\x82 \xBD\xBE\xBF",
  175. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  176. 0x0020,
  177. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  178. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  179. 0x0020,
  180. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  181. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  182. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION);
  183. /* Truncated sequences, each shown as just one failure. The last
  184. * one gets a different error code because the sequence is
  185. * interrupted by the end of the string instead of another
  186. * character, so that if the string were a prefix of a longer
  187. * chunk of data then that would not _necessarily_ indicate an
  188. * error */
  189. DOTEST("\xC2\xE0\xA0\xF0\x90\x80\xF8\x88\x80\x80\xFC\x84\x80\x80\x80",
  190. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  191. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  192. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  193. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  194. 0xFFFD, DUTF8_E_OUT_OF_DATA);
  195. DOTEST("\xC2 \xE0\xA0 \xF0\x90\x80 \xF8\x88\x80\x80 \xFC\x84\x80\x80\x80",
  196. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  197. 0x0020,
  198. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  199. 0x0020,
  200. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  201. 0x0020,
  202. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  203. 0x0020,
  204. 0xFFFD, DUTF8_E_OUT_OF_DATA);
  205. /* Illegal bytes */
  206. DOTEST("\xFE\xFF", 0xFFFD, DUTF8_ILLEGAL_BYTE, 0xFFFD, DUTF8_ILLEGAL_BYTE);
  207. /* Overlong sequences */
  208. DOTEST("\xC1\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  209. DOTEST("\xE0\x9F\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  210. DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  211. DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  212. DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  213. DOTEST("\xC0\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  214. DOTEST("\xE0\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  215. DOTEST("\xF0\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  216. DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  217. DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  218. printf("%d tests %d passed", ntest, npass);
  219. if (npass < ntest) {
  220. printf(" %d FAILED\n", ntest-npass);
  221. return 1;
  222. } else {
  223. printf("\n");
  224. return 0;
  225. }
  226. }
  227. #endif