utf8.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878
  1. /*
  2. * utf8.c - routines to handle UTF-8.
  3. */
  4. #ifndef ENUM_CHARSETS
  5. #include "charset.h"
  6. #include "internal.h"
  7. /*
  8. * UTF-8 has no associated data, so `charset' may be ignored.
  9. */
  10. static void read_utf8(charset_spec const *charset, long int input_chr,
  11. charset_state *state,
  12. void (*emit)(void *ctx, long int output), void *emitctx)
  13. {
  14. UNUSEDARG(charset);
  15. /*
  16. * For reading UTF-8, the `state' word contains:
  17. *
  18. * - in bits 29-31, the number of bytes expected to be in the
  19. * current multibyte character (which we can tell instantly
  20. * from the first byte, of course).
  21. *
  22. * - in bits 26-28, the number of bytes _seen so far_ in the
  23. * current multibyte character.
  24. *
  25. * - in the remainder of the word, the current value of the
  26. * character, which is shifted upwards by 6 bits to
  27. * accommodate each new byte.
  28. *
  29. * As required, the state is zero when we are not in the middle
  30. * of a multibyte character at all.
  31. *
  32. * For example, when reading E9 8D 8B, starting at state=0:
  33. *
  34. * - after E9, the state is 0x64000009
  35. * - after 8D, the state is 0x6800024d
  36. * - after 8B, the state conceptually becomes 0x6c00934b, at
  37. * which point we notice we've got as many characters as we
  38. * were expecting, output U+934B, and reset the state to
  39. * zero.
  40. *
  41. * Note that the maximum number of bits we might need to store
  42. * in the character value field is 25 (U+7FFFFFFF contains 31
  43. * bits, but we will never actually store its full value
  44. * because when we receive the last 6 bits in the final
  45. * continuation byte we will output it and revert the state to
  46. * zero). Hence the character value field never collides with
  47. * the byte counts.
  48. */
  49. if (input_chr < 0x80) {
  50. /*
  51. * Single-byte character. If the state is nonzero before
  52. * coming here, output an error for an incomplete sequence.
  53. * Then output the character.
  54. */
  55. if (state->s0 != 0) {
  56. emit(emitctx, ERROR);
  57. state->s0 = 0;
  58. }
  59. emit(emitctx, input_chr);
  60. } else if (input_chr == 0xFE || input_chr == 0xFF) {
  61. /*
  62. * FE and FF bytes should _never_ occur in UTF-8. They are
  63. * automatic errors; if the state was nonzero to start
  64. * with, output a further error for an incomplete sequence.
  65. */
  66. if (state->s0 != 0) {
  67. emit(emitctx, ERROR);
  68. state->s0 = 0;
  69. }
  70. emit(emitctx, ERROR);
  71. } else if (input_chr >= 0x80 && input_chr < 0xC0) {
  72. /*
  73. * Continuation byte. Output an error for an unexpected
  74. * continuation byte, if the state is zero.
  75. */
  76. if (state->s0 == 0) {
  77. emit(emitctx, ERROR);
  78. } else {
  79. unsigned long charval;
  80. unsigned long topstuff;
  81. int bytes;
  82. /*
  83. * Otherwise, accumulate more of the character value.
  84. */
  85. charval = state->s0 & 0x03ffffffL;
  86. charval = (charval << 6) | (input_chr & 0x3F);
  87. /*
  88. * Check the byte counts; if we have not reached the
  89. * end of the character, update the state and return.
  90. */
  91. topstuff = state->s0 & 0xfc000000L;
  92. topstuff += 0x04000000L; /* add one to the byte count */
  93. if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
  94. state->s0 = topstuff | charval;
  95. return;
  96. }
  97. /*
  98. * Now we know we've reached the end of the character.
  99. * `charval' is the Unicode value. We should check for
  100. * various invalid things, and then either output
  101. * charval or an error. In all cases we reset the state
  102. * to zero.
  103. */
  104. bytes = topstuff >> 29;
  105. state->s0 = 0;
  106. if (charval >= 0xD800 && charval < 0xE000) {
  107. /*
  108. * Surrogates (0xD800-0xDFFF) may never be encoded
  109. * in UTF-8. A surrogate pair in Unicode should
  110. * have been encoded as a single UTF-8 character
  111. * occupying more than three bytes.
  112. */
  113. emit(emitctx, ERROR);
  114. } else if (charval == 0xFFFE || charval == 0xFFFF) {
  115. /*
  116. * U+FFFE and U+FFFF are invalid Unicode characters
  117. * and may never be encoded in UTF-8. (This is one
  118. * reason why U+FFFF is our way of signalling an
  119. * error to our `emit' function :-)
  120. */
  121. emit(emitctx, ERROR);
  122. } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
  123. (charval <= 0x7FFL && bytes > 2) ||
  124. (charval <= 0xFFFFL && bytes > 3) ||
  125. (charval <= 0x1FFFFFL && bytes > 4) ||
  126. (charval <= 0x3FFFFFFL && bytes > 5)) {
  127. /*
  128. * Overlong sequences are not to be tolerated,
  129. * under any circumstances.
  130. */
  131. emit(emitctx, ERROR);
  132. } else {
  133. /*
  134. * Oh, all right. We'll let this one off.
  135. */
  136. emit(emitctx, charval);
  137. }
  138. }
  139. } else {
  140. /*
  141. * Lead byte. First output an error for an incomplete
  142. * sequence, if the state is nonzero.
  143. */
  144. if (state->s0 != 0)
  145. emit(emitctx, ERROR);
  146. /*
  147. * Now deal with the lead byte: work out the number of
  148. * bytes we expect to see in this character, and extract
  149. * the initial bits of it too.
  150. */
  151. if (input_chr >= 0xC0 && input_chr < 0xE0) {
  152. state->s0 = 0x44000000L | (input_chr & 0x1F);
  153. } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
  154. state->s0 = 0x64000000L | (input_chr & 0x0F);
  155. } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
  156. state->s0 = 0x84000000L | (input_chr & 0x07);
  157. } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
  158. state->s0 = 0xa4000000L | (input_chr & 0x03);
  159. } else if (input_chr >= 0xFC && input_chr < 0xFE) {
  160. state->s0 = 0xc4000000L | (input_chr & 0x01);
  161. }
  162. }
  163. }
  164. /*
  165. * UTF-8 is a stateless multi-byte encoding (in the sense that just
  166. * after any character has been completed, the state is always the
  167. * same); hence when writing it, there is no need to use the
  168. * charset_state.
  169. */
  170. static void write_utf8(charset_spec const *charset, long int input_chr,
  171. charset_state *state,
  172. void (*emit)(void *ctx, long int output), void *emitctx)
  173. {
  174. UNUSEDARG(charset);
  175. UNUSEDARG(state);
  176. /*
  177. * Refuse to output any illegal code points.
  178. */
  179. if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
  180. (input_chr >= 0xD800 && input_chr < 0xE000)) {
  181. emit(emitctx, ERROR);
  182. } else if (input_chr < 0x80) { /* one-byte character */
  183. emit(emitctx, input_chr);
  184. } else if (input_chr < 0x800) { /* two-byte character */
  185. emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
  186. emit(emitctx, 0x80 | (0x3F & (input_chr )));
  187. } else if (input_chr < 0x10000) { /* three-byte character */
  188. emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
  189. emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
  190. emit(emitctx, 0x80 | (0x3F & (input_chr )));
  191. } else if (input_chr < 0x200000) { /* four-byte character */
  192. emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
  193. emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
  194. emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
  195. emit(emitctx, 0x80 | (0x3F & (input_chr )));
  196. } else if (input_chr < 0x4000000) {/* five-byte character */
  197. emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
  198. emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
  199. emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
  200. emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
  201. emit(emitctx, 0x80 | (0x3F & (input_chr )));
  202. } else { /* six-byte character */
  203. emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
  204. emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
  205. emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
  206. emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
  207. emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
  208. emit(emitctx, 0x80 | (0x3F & (input_chr )));
  209. }
  210. }
  211. #ifdef TESTMODE
  212. #include <stdio.h>
  213. #include <stdarg.h>
  /*
   * Number of failed expectations so far.  Incremented by the test
   * helpers below; main() prints it and uses it for the exit status.
   */
  int total_errs = 0;
  /*
   * utf8_emit - `emit' callback used by the self-tests.
   *
   * `ctx' points at a wchar_t cursor.  The output value is stored where
   * the cursor points and the cursor is then advanced by one element.
   * No bounds checking is done; the caller must supply enough room.
   */
  void utf8_emit(void *ctx, long output)
  {
      wchar_t **cursor = (wchar_t **)ctx;
      wchar_t *slot = *cursor;

      *slot = output;
      *cursor = slot + 1;
  }
  220. void utf8_read_test(int line, char *input, int inlen, ...)
  221. {
  222. va_list ap;
  223. wchar_t *p, str[512];
  224. int i;
  225. charset_state state;
  226. unsigned long l;
  227. state.s0 = 0;
  228. p = str;
  229. for (i = 0; i < inlen; i++)
  230. read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
  231. va_start(ap, inlen);
  232. l = 0;
  233. for (i = 0; i < p - str; i++) {
  234. l = va_arg(ap, long int);
  235. if (l == -1) {
  236. printf("%d: correct string shorter than output\n", line);
  237. total_errs++;
  238. break;
  239. }
  240. if (l != str[i]) {
  241. printf("%d: char %d came out as %08x, should be %08x\n",
  242. line, i, str[i], (unsigned)l);
  243. total_errs++;
  244. }
  245. }
  246. if (l != -1) {
  247. l = va_arg(ap, long int);
  248. if (l != -1) {
  249. printf("%d: correct string longer than output\n", line);
  250. total_errs++;
  251. }
  252. }
  253. va_end(ap);
  254. }
  255. void utf8_write_test(int line, const long *input, int inlen, ...)
  256. {
  257. va_list ap;
  258. wchar_t *p, str[512];
  259. int i;
  260. charset_state state;
  261. unsigned long l;
  262. state.s0 = 0;
  263. p = str;
  264. for (i = 0; i < inlen; i++)
  265. write_utf8(NULL, input[i], &state, utf8_emit, &p);
  266. va_start(ap, inlen);
  267. l = 0;
  268. for (i = 0; i < p - str; i++) {
  269. l = va_arg(ap, long int);
  270. if (l == -1) {
  271. printf("%d: correct string shorter than output\n", line);
  272. total_errs++;
  273. break;
  274. }
  275. if (l != str[i]) {
  276. printf("%d: char %d came out as %08x, should be %08x\n",
  277. line, i, str[i], (unsigned)l);
  278. total_errs++;
  279. }
  280. }
  281. if (l != -1) {
  282. l = va_arg(ap, long int);
  283. if (l != -1) {
  284. printf("%d: correct string longer than output\n", line);
  285. total_errs++;
  286. }
  287. }
  288. va_end(ap);
  289. }
  /*
   * Macro to concoct the first three parameters of utf8_read_test and
   * utf8_write_test: the source line of the test, the input array, and
   * lenof() of that array.
   *
   * NOTE(review): lenof is defined elsewhere; presumably it yields the
   * array's element count, so a string literal's terminating NUL (and
   * the trailing 0 of the write-test arrays) is part of the input.  That
   * would explain the extra 0 at the end of every expected list.
   */
  #define TESTSTR(x) __LINE__, (x), lenof(x)
  292. int main(void)
  293. {
  294. printf("read tests beginning\n");
  295. utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
  296. 0x000003BA, /* GREEK SMALL LETTER KAPPA */
  297. 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
  298. 0x000003C3, /* GREEK SMALL LETTER SIGMA */
  299. 0x000003BC, /* GREEK SMALL LETTER MU */
  300. 0x000003B5, /* GREEK SMALL LETTER EPSILON */
  301. 0, -1);
  302. utf8_read_test(TESTSTR("\x00"),
  303. 0x00000000, /* <control> */
  304. 0, -1);
  305. utf8_read_test(TESTSTR("\xC2\x80"),
  306. 0x00000080, /* <control> */
  307. 0, -1);
  308. utf8_read_test(TESTSTR("\xE0\xA0\x80"),
  309. 0x00000800, /* <no name available> */
  310. 0, -1);
  311. utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
  312. 0x00010000, /* <no name available> */
  313. 0, -1);
  314. utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
  315. 0x00200000, /* <no name available> */
  316. 0, -1);
  317. utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
  318. 0x04000000, /* <no name available> */
  319. 0, -1);
  320. utf8_read_test(TESTSTR("\x7F"),
  321. 0x0000007F, /* <control> */
  322. 0, -1);
  323. utf8_read_test(TESTSTR("\xDF\xBF"),
  324. 0x000007FF, /* <no name available> */
  325. 0, -1);
  326. utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
  327. 0x0000FFFD, /* REPLACEMENT CHARACTER */
  328. 0, -1);
  329. utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
  330. ERROR, /* <no name available> (invalid char) */
  331. 0, -1);
  332. utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
  333. 0x001FFFFF, /* <no name available> */
  334. 0, -1);
  335. utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
  336. 0x03FFFFFF, /* <no name available> */
  337. 0, -1);
  338. utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
  339. 0x7FFFFFFF, /* <no name available> */
  340. 0, -1);
  341. utf8_read_test(TESTSTR("\xED\x9F\xBF"),
  342. 0x0000D7FF, /* <no name available> */
  343. 0, -1);
  344. utf8_read_test(TESTSTR("\xEE\x80\x80"),
  345. 0x0000E000, /* <Private Use, First> */
  346. 0, -1);
  347. utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
  348. 0x0000FFFD, /* REPLACEMENT CHARACTER */
  349. 0, -1);
  350. utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
  351. 0x0010FFFF, /* <no name available> */
  352. 0, -1);
  353. utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
  354. 0x00110000, /* <no name available> */
  355. 0, -1);
  356. utf8_read_test(TESTSTR("\x80"),
  357. ERROR, /* (unexpected continuation byte) */
  358. 0, -1);
  359. utf8_read_test(TESTSTR("\xBF"),
  360. ERROR, /* (unexpected continuation byte) */
  361. 0, -1);
  362. utf8_read_test(TESTSTR("\x80\xBF"),
  363. ERROR, /* (unexpected continuation byte) */
  364. ERROR, /* (unexpected continuation byte) */
  365. 0, -1);
  366. utf8_read_test(TESTSTR("\x80\xBF\x80"),
  367. ERROR, /* (unexpected continuation byte) */
  368. ERROR, /* (unexpected continuation byte) */
  369. ERROR, /* (unexpected continuation byte) */
  370. 0, -1);
  371. utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
  372. ERROR, /* (unexpected continuation byte) */
  373. ERROR, /* (unexpected continuation byte) */
  374. ERROR, /* (unexpected continuation byte) */
  375. ERROR, /* (unexpected continuation byte) */
  376. 0, -1);
  377. utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
  378. ERROR, /* (unexpected continuation byte) */
  379. ERROR, /* (unexpected continuation byte) */
  380. ERROR, /* (unexpected continuation byte) */
  381. ERROR, /* (unexpected continuation byte) */
  382. ERROR, /* (unexpected continuation byte) */
  383. 0, -1);
  384. utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
  385. ERROR, /* (unexpected continuation byte) */
  386. ERROR, /* (unexpected continuation byte) */
  387. ERROR, /* (unexpected continuation byte) */
  388. ERROR, /* (unexpected continuation byte) */
  389. ERROR, /* (unexpected continuation byte) */
  390. ERROR, /* (unexpected continuation byte) */
  391. 0, -1);
  392. utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
  393. ERROR, /* (unexpected continuation byte) */
  394. ERROR, /* (unexpected continuation byte) */
  395. ERROR, /* (unexpected continuation byte) */
  396. ERROR, /* (unexpected continuation byte) */
  397. ERROR, /* (unexpected continuation byte) */
  398. ERROR, /* (unexpected continuation byte) */
  399. ERROR, /* (unexpected continuation byte) */
  400. 0, -1);
  401. utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
  402. ERROR, /* (unexpected continuation byte) */
  403. ERROR, /* (unexpected continuation byte) */
  404. ERROR, /* (unexpected continuation byte) */
  405. ERROR, /* (unexpected continuation byte) */
  406. ERROR, /* (unexpected continuation byte) */
  407. ERROR, /* (unexpected continuation byte) */
  408. ERROR, /* (unexpected continuation byte) */
  409. ERROR, /* (unexpected continuation byte) */
  410. ERROR, /* (unexpected continuation byte) */
  411. ERROR, /* (unexpected continuation byte) */
  412. ERROR, /* (unexpected continuation byte) */
  413. ERROR, /* (unexpected continuation byte) */
  414. ERROR, /* (unexpected continuation byte) */
  415. ERROR, /* (unexpected continuation byte) */
  416. ERROR, /* (unexpected continuation byte) */
  417. ERROR, /* (unexpected continuation byte) */
  418. ERROR, /* (unexpected continuation byte) */
  419. ERROR, /* (unexpected continuation byte) */
  420. ERROR, /* (unexpected continuation byte) */
  421. ERROR, /* (unexpected continuation byte) */
  422. ERROR, /* (unexpected continuation byte) */
  423. ERROR, /* (unexpected continuation byte) */
  424. ERROR, /* (unexpected continuation byte) */
  425. ERROR, /* (unexpected continuation byte) */
  426. ERROR, /* (unexpected continuation byte) */
  427. ERROR, /* (unexpected continuation byte) */
  428. ERROR, /* (unexpected continuation byte) */
  429. ERROR, /* (unexpected continuation byte) */
  430. ERROR, /* (unexpected continuation byte) */
  431. ERROR, /* (unexpected continuation byte) */
  432. ERROR, /* (unexpected continuation byte) */
  433. ERROR, /* (unexpected continuation byte) */
  434. ERROR, /* (unexpected continuation byte) */
  435. ERROR, /* (unexpected continuation byte) */
  436. ERROR, /* (unexpected continuation byte) */
  437. ERROR, /* (unexpected continuation byte) */
  438. ERROR, /* (unexpected continuation byte) */
  439. ERROR, /* (unexpected continuation byte) */
  440. ERROR, /* (unexpected continuation byte) */
  441. ERROR, /* (unexpected continuation byte) */
  442. ERROR, /* (unexpected continuation byte) */
  443. ERROR, /* (unexpected continuation byte) */
  444. ERROR, /* (unexpected continuation byte) */
  445. ERROR, /* (unexpected continuation byte) */
  446. ERROR, /* (unexpected continuation byte) */
  447. ERROR, /* (unexpected continuation byte) */
  448. ERROR, /* (unexpected continuation byte) */
  449. ERROR, /* (unexpected continuation byte) */
  450. ERROR, /* (unexpected continuation byte) */
  451. ERROR, /* (unexpected continuation byte) */
  452. ERROR, /* (unexpected continuation byte) */
  453. ERROR, /* (unexpected continuation byte) */
  454. ERROR, /* (unexpected continuation byte) */
  455. ERROR, /* (unexpected continuation byte) */
  456. ERROR, /* (unexpected continuation byte) */
  457. ERROR, /* (unexpected continuation byte) */
  458. ERROR, /* (unexpected continuation byte) */
  459. ERROR, /* (unexpected continuation byte) */
  460. ERROR, /* (unexpected continuation byte) */
  461. ERROR, /* (unexpected continuation byte) */
  462. ERROR, /* (unexpected continuation byte) */
  463. ERROR, /* (unexpected continuation byte) */
  464. ERROR, /* (unexpected continuation byte) */
  465. ERROR, /* (unexpected continuation byte) */
  466. 0, -1);
  467. utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
  468. ERROR, /* (incomplete sequence) */
  469. 0x00000020, /* SPACE */
  470. ERROR, /* (incomplete sequence) */
  471. 0x00000020, /* SPACE */
  472. ERROR, /* (incomplete sequence) */
  473. 0x00000020, /* SPACE */
  474. ERROR, /* (incomplete sequence) */
  475. 0x00000020, /* SPACE */
  476. ERROR, /* (incomplete sequence) */
  477. 0x00000020, /* SPACE */
  478. ERROR, /* (incomplete sequence) */
  479. 0x00000020, /* SPACE */
  480. ERROR, /* (incomplete sequence) */
  481. 0x00000020, /* SPACE */
  482. ERROR, /* (incomplete sequence) */
  483. 0x00000020, /* SPACE */
  484. 0, -1);
  485. utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
  486. ERROR, /* (incomplete sequence) */
  487. 0x00000020, /* SPACE */
  488. ERROR, /* (incomplete sequence) */
  489. 0x00000020, /* SPACE */
  490. ERROR, /* (incomplete sequence) */
  491. 0x00000020, /* SPACE */
  492. ERROR, /* (incomplete sequence) */
  493. 0x00000020, /* SPACE */
  494. ERROR, /* (incomplete sequence) */
  495. 0x00000020, /* SPACE */
  496. ERROR, /* (incomplete sequence) */
  497. 0x00000020, /* SPACE */
  498. ERROR, /* (incomplete sequence) */
  499. 0x00000020, /* SPACE */
  500. ERROR, /* (incomplete sequence) */
  501. 0x00000020, /* SPACE */
  502. ERROR, /* (incomplete sequence) */
  503. 0x00000020, /* SPACE */
  504. ERROR, /* (incomplete sequence) */
  505. 0x00000020, /* SPACE */
  506. ERROR, /* (incomplete sequence) */
  507. 0x00000020, /* SPACE */
  508. ERROR, /* (incomplete sequence) */
  509. 0x00000020, /* SPACE */
  510. ERROR, /* (incomplete sequence) */
  511. 0x00000020, /* SPACE */
  512. ERROR, /* (incomplete sequence) */
  513. 0x00000020, /* SPACE */
  514. ERROR, /* (incomplete sequence) */
  515. 0x00000020, /* SPACE */
  516. ERROR, /* (incomplete sequence) */
  517. 0x00000020, /* SPACE */
  518. 0, -1);
  519. utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
  520. ERROR, /* (incomplete sequence) */
  521. 0x00000020, /* SPACE */
  522. ERROR, /* (incomplete sequence) */
  523. 0x00000020, /* SPACE */
  524. ERROR, /* (incomplete sequence) */
  525. 0x00000020, /* SPACE */
  526. ERROR, /* (incomplete sequence) */
  527. 0x00000020, /* SPACE */
  528. ERROR, /* (incomplete sequence) */
  529. 0x00000020, /* SPACE */
  530. ERROR, /* (incomplete sequence) */
  531. 0x00000020, /* SPACE */
  532. ERROR, /* (incomplete sequence) */
  533. 0x00000020, /* SPACE */
  534. ERROR, /* (incomplete sequence) */
  535. 0x00000020, /* SPACE */
  536. 0, -1);
  537. utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
  538. ERROR, /* (incomplete sequence) */
  539. 0x00000020, /* SPACE */
  540. ERROR, /* (incomplete sequence) */
  541. 0x00000020, /* SPACE */
  542. ERROR, /* (incomplete sequence) */
  543. 0x00000020, /* SPACE */
  544. ERROR, /* (incomplete sequence) */
  545. 0x00000020, /* SPACE */
  546. 0, -1);
  547. utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
  548. ERROR, /* (incomplete sequence) */
  549. 0x00000020, /* SPACE */
  550. ERROR, /* (incomplete sequence) */
  551. 0x00000020, /* SPACE */
  552. 0, -1);
  553. utf8_read_test(TESTSTR("\xC0"),
  554. ERROR, /* (incomplete sequence) */
  555. 0, -1);
  556. utf8_read_test(TESTSTR("\xE0\x80"),
  557. ERROR, /* (incomplete sequence) */
  558. 0, -1);
  559. utf8_read_test(TESTSTR("\xF0\x80\x80"),
  560. ERROR, /* (incomplete sequence) */
  561. 0, -1);
  562. utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
  563. ERROR, /* (incomplete sequence) */
  564. 0, -1);
  565. utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
  566. ERROR, /* (incomplete sequence) */
  567. 0, -1);
  568. utf8_read_test(TESTSTR("\xDF"),
  569. ERROR, /* (incomplete sequence) */
  570. 0, -1);
  571. utf8_read_test(TESTSTR("\xEF\xBF"),
  572. ERROR, /* (incomplete sequence) */
  573. 0, -1);
  574. utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
  575. ERROR, /* (incomplete sequence) */
  576. 0, -1);
  577. utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
  578. ERROR, /* (incomplete sequence) */
  579. 0, -1);
  580. utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
  581. ERROR, /* (incomplete sequence) */
  582. 0, -1);
  583. utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
  584. ERROR, /* (incomplete sequence) */
  585. ERROR, /* (incomplete sequence) */
  586. ERROR, /* (incomplete sequence) */
  587. ERROR, /* (incomplete sequence) */
  588. ERROR, /* (incomplete sequence) */
  589. ERROR, /* (incomplete sequence) */
  590. ERROR, /* (incomplete sequence) */
  591. ERROR, /* (incomplete sequence) */
  592. ERROR, /* (incomplete sequence) */
  593. ERROR, /* (incomplete sequence) */
  594. 0, -1);
  595. utf8_read_test(TESTSTR("\xFE"),
  596. ERROR, /* (invalid UTF-8 byte) */
  597. 0, -1);
  598. utf8_read_test(TESTSTR("\xFF"),
  599. ERROR, /* (invalid UTF-8 byte) */
  600. 0, -1);
  601. utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
  602. ERROR, /* (invalid UTF-8 byte) */
  603. ERROR, /* (invalid UTF-8 byte) */
  604. ERROR, /* (invalid UTF-8 byte) */
  605. ERROR, /* (invalid UTF-8 byte) */
  606. 0, -1);
  607. utf8_read_test(TESTSTR("\xC0\xAF"),
  608. ERROR, /* SOLIDUS (overlong form of 2F) */
  609. 0, -1);
  610. utf8_read_test(TESTSTR("\xE0\x80\xAF"),
  611. ERROR, /* SOLIDUS (overlong form of 2F) */
  612. 0, -1);
  613. utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
  614. ERROR, /* SOLIDUS (overlong form of 2F) */
  615. 0, -1);
  616. utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
  617. ERROR, /* SOLIDUS (overlong form of 2F) */
  618. 0, -1);
  619. utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
  620. ERROR, /* SOLIDUS (overlong form of 2F) */
  621. 0, -1);
  622. utf8_read_test(TESTSTR("\xC1\xBF"),
  623. ERROR, /* <control> (overlong form of 7F) */
  624. 0, -1);
  625. utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
  626. ERROR, /* <no name available> (overlong form of DF BF) */
  627. 0, -1);
  628. utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
  629. ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
  630. 0, -1);
  631. utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
  632. ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
  633. 0, -1);
  634. utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
  635. ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
  636. 0, -1);
  637. utf8_read_test(TESTSTR("\xC0\x80"),
  638. ERROR, /* <control> (overlong form of 00) */
  639. 0, -1);
  640. utf8_read_test(TESTSTR("\xE0\x80\x80"),
  641. ERROR, /* <control> (overlong form of 00) */
  642. 0, -1);
  643. utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
  644. ERROR, /* <control> (overlong form of 00) */
  645. 0, -1);
  646. utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
  647. ERROR, /* <control> (overlong form of 00) */
  648. 0, -1);
  649. utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
  650. ERROR, /* <control> (overlong form of 00) */
  651. 0, -1);
  652. utf8_read_test(TESTSTR("\xED\xA0\x80"),
  653. ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
  654. 0, -1);
  655. utf8_read_test(TESTSTR("\xED\xAD\xBF"),
  656. ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
  657. 0, -1);
  658. utf8_read_test(TESTSTR("\xED\xAE\x80"),
  659. ERROR, /* <Private Use High Surrogate, First> (surrogate) */
  660. 0, -1);
  661. utf8_read_test(TESTSTR("\xED\xAF\xBF"),
  662. ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
  663. 0, -1);
  664. utf8_read_test(TESTSTR("\xED\xB0\x80"),
  665. ERROR, /* <Low Surrogate, First> (surrogate) */
  666. 0, -1);
  667. utf8_read_test(TESTSTR("\xED\xBE\x80"),
  668. ERROR, /* <no name available> (surrogate) */
  669. 0, -1);
  670. utf8_read_test(TESTSTR("\xED\xBF\xBF"),
  671. ERROR, /* <Low Surrogate, Last> (surrogate) */
  672. 0, -1);
  673. utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
  674. ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
  675. ERROR, /* <Low Surrogate, First> (surrogate) */
  676. 0, -1);
  677. utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
  678. ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
  679. ERROR, /* <Low Surrogate, Last> (surrogate) */
  680. 0, -1);
  681. utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
  682. ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
  683. ERROR, /* <Low Surrogate, First> (surrogate) */
  684. 0, -1);
  685. utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
  686. ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
  687. ERROR, /* <Low Surrogate, Last> (surrogate) */
  688. 0, -1);
  689. utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
  690. ERROR, /* <Private Use High Surrogate, First> (surrogate) */
  691. ERROR, /* <Low Surrogate, First> (surrogate) */
  692. 0, -1);
  693. utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
  694. ERROR, /* <Private Use High Surrogate, First> (surrogate) */
  695. ERROR, /* <Low Surrogate, Last> (surrogate) */
  696. 0, -1);
  697. utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
  698. ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
  699. ERROR, /* <Low Surrogate, First> (surrogate) */
  700. 0, -1);
  701. utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
  702. ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
  703. ERROR, /* <Low Surrogate, Last> (surrogate) */
  704. 0, -1);
  705. utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
  706. ERROR, /* <no name available> (invalid char) */
  707. 0, -1);
  708. utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
  709. ERROR, /* <no name available> (invalid char) */
  710. 0, -1);
  711. printf("read tests completed\n");
  712. printf("write tests beginning\n");
  713. {
  714. const static long str[] =
  715. {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
  716. utf8_write_test(TESTSTR(str),
  717. 0xCE, 0xBA,
  718. 0xE1, 0xBD, 0xB9,
  719. 0xCF, 0x83,
  720. 0xCE, 0xBC,
  721. 0xCE, 0xB5,
  722. 0, -1);
  723. }
  724. {
  725. const static long str[] = {0x0000L, 0};
  726. utf8_write_test(TESTSTR(str),
  727. 0x00,
  728. 0, -1);
  729. }
  730. {
  731. const static long str[] = {0x0080L, 0};
  732. utf8_write_test(TESTSTR(str),
  733. 0xC2, 0x80,
  734. 0, -1);
  735. }
  736. {
  737. const static long str[] = {0x0800L, 0};
  738. utf8_write_test(TESTSTR(str),
  739. 0xE0, 0xA0, 0x80,
  740. 0, -1);
  741. }
  742. {
  743. const static long str[] = {0x00010000L, 0};
  744. utf8_write_test(TESTSTR(str),
  745. 0xF0, 0x90, 0x80, 0x80,
  746. 0, -1);
  747. }
  748. {
  749. const static long str[] = {0x00200000L, 0};
  750. utf8_write_test(TESTSTR(str),
  751. 0xF8, 0x88, 0x80, 0x80, 0x80,
  752. 0, -1);
  753. }
  754. {
  755. const static long str[] = {0x04000000L, 0};
  756. utf8_write_test(TESTSTR(str),
  757. 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
  758. 0, -1);
  759. }
  760. {
  761. const static long str[] = {0x007FL, 0};
  762. utf8_write_test(TESTSTR(str),
  763. 0x7F,
  764. 0, -1);
  765. }
  766. {
  767. const static long str[] = {0x07FFL, 0};
  768. utf8_write_test(TESTSTR(str),
  769. 0xDF, 0xBF,
  770. 0, -1);
  771. }
  772. {
  773. const static long str[] = {0xFFFDL, 0};
  774. utf8_write_test(TESTSTR(str),
  775. 0xEF, 0xBF, 0xBD,
  776. 0, -1);
  777. }
  778. {
  779. const static long str[] = {0xFFFFL, 0};
  780. utf8_write_test(TESTSTR(str),
  781. ERROR,
  782. 0, -1);
  783. }
  784. {
  785. const static long str[] = {0x001FFFFFL, 0};
  786. utf8_write_test(TESTSTR(str),
  787. 0xF7, 0xBF, 0xBF, 0xBF,
  788. 0, -1);
  789. }
  790. {
  791. const static long str[] = {0x03FFFFFFL, 0};
  792. utf8_write_test(TESTSTR(str),
  793. 0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
  794. 0, -1);
  795. }
  796. {
  797. const static long str[] = {0x7FFFFFFFL, 0};
  798. utf8_write_test(TESTSTR(str),
  799. 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
  800. 0, -1);
  801. }
  802. {
  803. const static long str[] = {0xD7FFL, 0};
  804. utf8_write_test(TESTSTR(str),
  805. 0xED, 0x9F, 0xBF,
  806. 0, -1);
  807. }
  808. {
  809. const static long str[] = {0xD800L, 0};
  810. utf8_write_test(TESTSTR(str),
  811. ERROR,
  812. 0, -1);
  813. }
  814. {
  815. const static long str[] = {0xD800L, 0xDC00L, 0};
  816. utf8_write_test(TESTSTR(str),
  817. ERROR,
  818. ERROR,
  819. 0, -1);
  820. }
  821. {
  822. const static long str[] = {0xDFFFL, 0};
  823. utf8_write_test(TESTSTR(str),
  824. ERROR,
  825. 0, -1);
  826. }
  827. {
  828. const static long str[] = {0xE000L, 0};
  829. utf8_write_test(TESTSTR(str),
  830. 0xEE, 0x80, 0x80,
  831. 0, -1);
  832. }
  833. printf("write tests completed\n");
  834. printf("total: %d errors\n", total_errs);
  835. return (total_errs != 0);
  836. }
  837. #endif /* TESTMODE */
  /*
   * Charset descriptor for UTF-8: the CS_UTF8 identifier followed by
   * the read and write handlers defined in this file.
   * NOTE(review): the final NULL is presumably the per-charset data
   * pointer, unused because UTF-8 has no associated data; confirm
   * against the charset_spec declaration.
   */
  const charset_spec charset_CS_UTF8 = {
      CS_UTF8, read_utf8, write_utf8, NULL
  };
  841. #else /* ENUM_CHARSETS */
  /*
   * When ENUM_CHARSETS is defined, this file contributes only this one
   * entry; what ENUM_CHARSET expands to is decided by the includer.
   */
  ENUM_CHARSET(CS_UTF8)
  843. #endif /* ENUM_CHARSETS */