utf8.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. /* This Source Code Form is subject to the terms of the Mozilla Public
  2. * License, v. 2.0. If a copy of the MPL was not distributed with this
  3. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  4. #include "seccomon.h"
  5. #include "secport.h"
  6. /*
  7. * From RFC 2044:
  8. *
  9. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  10. * 0000 0000-0000 007F 0xxxxxxx
  11. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  12. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  13. * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  14. * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  15. * 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
  16. */
  17. /*
  18. * From http://www.imc.org/draft-hoffman-utf16
  19. *
  20. * For U on [0x00010000,0x0010FFFF]: Let U' = U - 0x00010000
  21. *
  22. * U' = yyyyyyyyyyxxxxxxxxxx
  23. * W1 = 110110yyyyyyyyyy
  24. * W2 = 110111xxxxxxxxxx
  25. */
  26. /*
  27. * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit
  28. * character values. If you wish to use this code for working with
  29. * host byte order values, define the following:
  30. *
  31. * #if IS_BIG_ENDIAN
  32. * #define L_0 0
  33. * #define L_1 1
  34. * #define L_2 2
  35. * #define L_3 3
  36. * #define H_0 0
  37. * #define H_1 1
  38. * #else / * not everyone has elif * /
  39. * #if IS_LITTLE_ENDIAN
  40. * #define L_0 3
  41. * #define L_1 2
  42. * #define L_2 1
  43. * #define L_3 0
  44. * #define H_0 1
  45. * #define H_1 0
  46. * #else
  47. * #error "PDP and NUXI support deferred"
  48. * #endif / * IS_LITTLE_ENDIAN * /
  49. * #endif / * IS_BIG_ENDIAN * /
  50. */
  /*
   * Byte offsets inside one character cell.  L_0..L_3 index the four bytes
   * of a 32-bit (UCS-4) cell and H_0..H_1 the two bytes of a 16-bit (UCS-2)
   * cell.  With the values below, offset 0 is the most significant byte,
   * i.e. network byte order.
   */
  51. #define L_0 0
  52. #define L_1 1
  53. #define L_2 2
  54. #define L_3 3
  55. #define H_0 0
  56. #define H_1 1
  /*
   * Sentinel returned by sec_port_read_utf8 for malformed input.  It cannot
   * be confused with a decoded character because that function never returns
   * a value above 0x10FFFF on success.
   */
  57. #define BAD_UTF8 ((PRUint32)-1)
  58. /*
  59. * Parse a single UTF-8 character per the spec. in section 3.9 (D36)
  60. * of Unicode 4.0.0.
  61. *
  62. * Parameters:
  63. * index - Points to the byte offset in inBuf of character to read. On success,
  64. * updated to the offset of the following character.
  65. * inBuf - Input buffer, UTF-8 encoded
  66. * inbufLen - Length of input buffer, in bytes.
  67. *
  68. * Returns:
  69. * Success - The UCS4 encoded character
  70. * Failure - BAD_UTF8
  71. */
  72. static PRUint32
  73. sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen)
  74. {
  75. PRUint32 result;
  76. unsigned int i = *index;
  77. int bytes_left;
  78. PRUint32 min_value;
  79. PORT_Assert(i < inBufLen);
  80. if ((inBuf[i] & 0x80) == 0x00) {
  81. result = inBuf[i++];
  82. bytes_left = 0;
  83. min_value = 0;
  84. } else if ((inBuf[i] & 0xE0) == 0xC0) {
  85. result = inBuf[i++] & 0x1F;
  86. bytes_left = 1;
  87. min_value = 0x80;
  88. } else if ((inBuf[i] & 0xF0) == 0xE0) {
  89. result = inBuf[i++] & 0x0F;
  90. bytes_left = 2;
  91. min_value = 0x800;
  92. } else if ((inBuf[i] & 0xF8) == 0xF0) {
  93. result = inBuf[i++] & 0x07;
  94. bytes_left = 3;
  95. min_value = 0x10000;
  96. } else {
  97. return BAD_UTF8;
  98. }
  99. while (bytes_left--) {
  100. if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80)
  101. return BAD_UTF8;
  102. result = (result << 6) | (inBuf[i++] & 0x3F);
  103. }
  104. /* Check for overlong sequences, surrogates, and outside unicode range */
  105. if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) {
  106. return BAD_UTF8;
  107. }
  108. *index = i;
  109. return result;
  110. }
  111. PRBool
  112. sec_port_ucs4_utf8_conversion_function(
  113. PRBool toUnicode,
  114. unsigned char *inBuf,
  115. unsigned int inBufLen,
  116. unsigned char *outBuf,
  117. unsigned int maxOutBufLen,
  118. unsigned int *outBufLen)
  119. {
  120. PORT_Assert((unsigned int *)NULL != outBufLen);
  121. if (toUnicode) {
  122. unsigned int i, len = 0;
  123. for (i = 0; i < inBufLen;) {
  124. if ((inBuf[i] & 0x80) == 0x00)
  125. i += 1;
  126. else if ((inBuf[i] & 0xE0) == 0xC0)
  127. i += 2;
  128. else if ((inBuf[i] & 0xF0) == 0xE0)
  129. i += 3;
  130. else if ((inBuf[i] & 0xF8) == 0xF0)
  131. i += 4;
  132. else
  133. return PR_FALSE;
  134. len += 4;
  135. }
  136. if (len > maxOutBufLen) {
  137. *outBufLen = len;
  138. return PR_FALSE;
  139. }
  140. len = 0;
  141. for (i = 0; i < inBufLen;) {
  142. PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
  143. if (ucs4 == BAD_UTF8)
  144. return PR_FALSE;
  145. outBuf[len + L_0] = 0x00;
  146. outBuf[len + L_1] = (unsigned char)(ucs4 >> 16);
  147. outBuf[len + L_2] = (unsigned char)(ucs4 >> 8);
  148. outBuf[len + L_3] = (unsigned char)ucs4;
  149. len += 4;
  150. }
  151. *outBufLen = len;
  152. return PR_TRUE;
  153. } else {
  154. unsigned int i, len = 0;
  155. PORT_Assert((inBufLen % 4) == 0);
  156. if ((inBufLen % 4) != 0) {
  157. *outBufLen = 0;
  158. return PR_FALSE;
  159. }
  160. for (i = 0; i < inBufLen; i += 4) {
  161. if ((inBuf[i + L_0] > 0x00) || (inBuf[i + L_1] > 0x10)) {
  162. *outBufLen = 0;
  163. return PR_FALSE;
  164. } else if (inBuf[i + L_1] >= 0x01)
  165. len += 4;
  166. else if (inBuf[i + L_2] >= 0x08)
  167. len += 3;
  168. else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80))
  169. len += 2;
  170. else
  171. len += 1;
  172. }
  173. if (len > maxOutBufLen) {
  174. *outBufLen = len;
  175. return PR_FALSE;
  176. }
  177. len = 0;
  178. for (i = 0; i < inBufLen; i += 4) {
  179. if (inBuf[i + L_1] >= 0x01) {
  180. /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  181. /* 00000000 000abcde fghijklm nopqrstu ->
  182. 11110abc 10defghi 10jklmno 10pqrstu */
  183. outBuf[len + 0] = 0xF0 | ((inBuf[i + L_1] & 0x1C) >> 2);
  184. outBuf[len + 1] = 0x80 | ((inBuf[i + L_1] & 0x03) << 4) | ((inBuf[i + L_2] & 0xF0) >> 4);
  185. outBuf[len + 2] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
  186. outBuf[len + 3] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
  187. len += 4;
  188. } else if (inBuf[i + L_2] >= 0x08) {
  189. /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
  190. /* 00000000 00000000 abcdefgh ijklmnop ->
  191. 1110abcd 10efghij 10klmnop */
  192. outBuf[len + 0] = 0xE0 | ((inBuf[i + L_2] & 0xF0) >> 4);
  193. outBuf[len + 1] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
  194. outBuf[len + 2] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
  195. len += 3;
  196. } else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) {
  197. /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
  198. /* 00000000 00000000 00000abc defghijk ->
  199. 110abcde 10fghijk */
  200. outBuf[len + 0] = 0xC0 | ((inBuf[i + L_2] & 0x07) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
  201. outBuf[len + 1] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
  202. len += 2;
  203. } else {
  204. /* 0000 0000-0000 007F -> 0xxxxxx */
  205. /* 00000000 00000000 00000000 0abcdefg ->
  206. 0abcdefg */
  207. outBuf[len + 0] = (inBuf[i + L_3] & 0x7F);
  208. len += 1;
  209. }
  210. }
  211. *outBufLen = len;
  212. return PR_TRUE;
  213. }
  214. }
  215. PRBool
  216. sec_port_ucs2_utf8_conversion_function(
  217. PRBool toUnicode,
  218. unsigned char *inBuf,
  219. unsigned int inBufLen,
  220. unsigned char *outBuf,
  221. unsigned int maxOutBufLen,
  222. unsigned int *outBufLen)
  223. {
  224. PORT_Assert((unsigned int *)NULL != outBufLen);
  225. if (toUnicode) {
  226. unsigned int i, len = 0;
  227. for (i = 0; i < inBufLen;) {
  228. if ((inBuf[i] & 0x80) == 0x00) {
  229. i += 1;
  230. len += 2;
  231. } else if ((inBuf[i] & 0xE0) == 0xC0) {
  232. i += 2;
  233. len += 2;
  234. } else if ((inBuf[i] & 0xF0) == 0xE0) {
  235. i += 3;
  236. len += 2;
  237. } else if ((inBuf[i] & 0xF8) == 0xF0) {
  238. i += 4;
  239. len += 4;
  240. } else
  241. return PR_FALSE;
  242. }
  243. if (len > maxOutBufLen) {
  244. *outBufLen = len;
  245. return PR_FALSE;
  246. }
  247. len = 0;
  248. for (i = 0; i < inBufLen;) {
  249. PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
  250. if (ucs4 == BAD_UTF8)
  251. return PR_FALSE;
  252. if (ucs4 < 0x10000) {
  253. outBuf[len + H_0] = (unsigned char)(ucs4 >> 8);
  254. outBuf[len + H_1] = (unsigned char)ucs4;
  255. len += 2;
  256. } else {
  257. ucs4 -= 0x10000;
  258. outBuf[len + 0 + H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3));
  259. outBuf[len + 0 + H_1] = (unsigned char)(ucs4 >> 10);
  260. outBuf[len + 2 + H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3));
  261. outBuf[len + 2 + H_1] = (unsigned char)ucs4;
  262. len += 4;
  263. }
  264. }
  265. *outBufLen = len;
  266. return PR_TRUE;
  267. } else {
  268. unsigned int i, len = 0;
  269. PORT_Assert((inBufLen % 2) == 0);
  270. if ((inBufLen % 2) != 0) {
  271. *outBufLen = 0;
  272. return PR_FALSE;
  273. }
  274. for (i = 0; i < inBufLen; i += 2) {
  275. if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00))
  276. len += 1;
  277. else if (inBuf[i + H_0] < 0x08)
  278. len += 2;
  279. else if (((inBuf[i + H_0] & 0xFC) == 0xD8)) {
  280. if (((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)) {
  281. i += 2;
  282. len += 4;
  283. } else {
  284. return PR_FALSE;
  285. }
  286. } else if ((inBuf[i + H_0] & 0xFC) == 0xDC) {
  287. return PR_FALSE;
  288. } else {
  289. len += 3;
  290. }
  291. }
  292. if (len > maxOutBufLen) {
  293. *outBufLen = len;
  294. return PR_FALSE;
  295. }
  296. len = 0;
  297. for (i = 0; i < inBufLen; i += 2) {
  298. if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) {
  299. /* 0000-007F -> 0xxxxxx */
  300. /* 00000000 0abcdefg -> 0abcdefg */
  301. outBuf[len] = inBuf[i + H_1] & 0x7F;
  302. len += 1;
  303. } else if (inBuf[i + H_0] < 0x08) {
  304. /* 0080-07FF -> 110xxxxx 10xxxxxx */
  305. /* 00000abc defghijk -> 110abcde 10fghijk */
  306. outBuf[len + 0] = 0xC0 | ((inBuf[i + H_0] & 0x07) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
  307. outBuf[len + 1] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
  308. len += 2;
  309. } else if ((inBuf[i + H_0] & 0xFC) == 0xD8) {
  310. int abcde, BCDE;
  311. PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC));
  312. /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  313. /* 110110BC DEfghijk 110111lm nopqrstu ->
  314. { Let abcde = BCDE + 1 }
  315. 11110abc 10defghi 10jklmno 10pqrstu */
  316. BCDE = ((inBuf[i + H_0] & 0x03) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
  317. abcde = BCDE + 1;
  318. outBuf[len + 0] = 0xF0 | ((abcde & 0x1C) >> 2);
  319. outBuf[len + 1] = 0x80 | ((abcde & 0x03) << 4) | ((inBuf[i + 0 + H_1] & 0x3C) >> 2);
  320. outBuf[len + 2] = 0x80 | ((inBuf[i + 0 + H_1] & 0x03) << 4) | ((inBuf[i + 2 + H_0] & 0x03) << 2) | ((inBuf[i + 2 + H_1] & 0xC0) >> 6);
  321. outBuf[len + 3] = 0x80 | ((inBuf[i + 2 + H_1] & 0x3F) >> 0);
  322. i += 2;
  323. len += 4;
  324. } else {
  325. /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
  326. /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */
  327. outBuf[len + 0] = 0xE0 | ((inBuf[i + H_0] & 0xF0) >> 4);
  328. outBuf[len + 1] = 0x80 | ((inBuf[i + H_0] & 0x0F) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
  329. outBuf[len + 2] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
  330. len += 3;
  331. }
  332. }
  333. *outBufLen = len;
  334. return PR_TRUE;
  335. }
  336. }
  337. PRBool
  338. sec_port_iso88591_utf8_conversion_function(
  339. const unsigned char *inBuf,
  340. unsigned int inBufLen,
  341. unsigned char *outBuf,
  342. unsigned int maxOutBufLen,
  343. unsigned int *outBufLen)
  344. {
  345. unsigned int i, len = 0;
  346. PORT_Assert((unsigned int *)NULL != outBufLen);
  347. for (i = 0; i < inBufLen; i++) {
  348. if ((inBuf[i] & 0x80) == 0x00)
  349. len += 1;
  350. else
  351. len += 2;
  352. }
  353. if (len > maxOutBufLen) {
  354. *outBufLen = len;
  355. return PR_FALSE;
  356. }
  357. len = 0;
  358. for (i = 0; i < inBufLen; i++) {
  359. if ((inBuf[i] & 0x80) == 0x00) {
  360. /* 00-7F -> 0xxxxxxx */
  361. /* 0abcdefg -> 0abcdefg */
  362. outBuf[len] = inBuf[i];
  363. len += 1;
  364. } else {
  365. /* 80-FF <- 110xxxxx 10xxxxxx */
  366. /* 00000000 abcdefgh -> 110000ab 10cdefgh */
  367. outBuf[len + 0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6);
  368. outBuf[len + 1] = 0x80 | ((inBuf[i] & 0x3F) >> 0);
  369. len += 2;
  370. }
  371. }
  372. *outBufLen = len;
  373. return PR_TRUE;
  374. }