u8-mbtouc-unsafe-aux.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /* Conversion UTF-8 to UCS-4.
  2. Copyright (C) 2001-2002, 2006-2007, 2009-2013 Free Software Foundation, Inc.
  3. Written by Bruno Haible <bruno@clisp.org>, 2001.
  4. This program is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Lesser General Public License as published
  6. by the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. #if defined IN_LIBUNISTRING || HAVE_INLINE
  18. int
  19. u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n)
  20. {
  21. uint8_t c = *s;
  22. if (c >= 0xc2)
  23. {
  24. if (c < 0xe0)
  25. {
  26. if (n >= 2)
  27. {
  28. #if CONFIG_UNICODE_SAFETY
  29. if ((s[1] ^ 0x80) < 0x40)
  30. #endif
  31. {
  32. *puc = ((unsigned int) (c & 0x1f) << 6)
  33. | (unsigned int) (s[1] ^ 0x80);
  34. return 2;
  35. }
  36. #if CONFIG_UNICODE_SAFETY
  37. /* invalid multibyte character */
  38. #endif
  39. }
  40. else
  41. {
  42. /* incomplete multibyte character */
  43. *puc = 0xfffd;
  44. return 1;
  45. }
  46. }
  47. else if (c < 0xf0)
  48. {
  49. if (n >= 3)
  50. {
  51. #if CONFIG_UNICODE_SAFETY
  52. if ((s[1] ^ 0x80) < 0x40)
  53. {
  54. if ((s[2] ^ 0x80) < 0x40)
  55. {
  56. if ((c >= 0xe1 || s[1] >= 0xa0)
  57. && (c != 0xed || s[1] < 0xa0))
  58. #endif
  59. {
  60. *puc = ((unsigned int) (c & 0x0f) << 12)
  61. | ((unsigned int) (s[1] ^ 0x80) << 6)
  62. | (unsigned int) (s[2] ^ 0x80);
  63. return 3;
  64. }
  65. #if CONFIG_UNICODE_SAFETY
  66. /* invalid multibyte character */
  67. *puc = 0xfffd;
  68. return 3;
  69. }
  70. /* invalid multibyte character */
  71. *puc = 0xfffd;
  72. return 2;
  73. }
  74. /* invalid multibyte character */
  75. #endif
  76. }
  77. else
  78. {
  79. /* incomplete multibyte character */
  80. *puc = 0xfffd;
  81. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  82. return 1;
  83. else
  84. return 2;
  85. }
  86. }
  87. else if (c < 0xf8)
  88. {
  89. if (n >= 4)
  90. {
  91. #if CONFIG_UNICODE_SAFETY
  92. if ((s[1] ^ 0x80) < 0x40)
  93. {
  94. if ((s[2] ^ 0x80) < 0x40)
  95. {
  96. if ((s[3] ^ 0x80) < 0x40)
  97. {
  98. if ((c >= 0xf1 || s[1] >= 0x90)
  99. #if 1
  100. && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
  101. #endif
  102. )
  103. #endif
  104. {
  105. *puc = ((unsigned int) (c & 0x07) << 18)
  106. | ((unsigned int) (s[1] ^ 0x80) << 12)
  107. | ((unsigned int) (s[2] ^ 0x80) << 6)
  108. | (unsigned int) (s[3] ^ 0x80);
  109. return 4;
  110. }
  111. #if CONFIG_UNICODE_SAFETY
  112. /* invalid multibyte character */
  113. *puc = 0xfffd;
  114. return 4;
  115. }
  116. /* invalid multibyte character */
  117. *puc = 0xfffd;
  118. return 3;
  119. }
  120. /* invalid multibyte character */
  121. *puc = 0xfffd;
  122. return 2;
  123. }
  124. /* invalid multibyte character */
  125. #endif
  126. }
  127. else
  128. {
  129. /* incomplete multibyte character */
  130. *puc = 0xfffd;
  131. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  132. return 1;
  133. else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
  134. return 2;
  135. else
  136. return 3;
  137. }
  138. }
  139. #if 0
  140. else if (c < 0xfc)
  141. {
  142. if (n >= 5)
  143. {
  144. #if CONFIG_UNICODE_SAFETY
  145. if ((s[1] ^ 0x80) < 0x40)
  146. {
  147. if ((s[2] ^ 0x80) < 0x40)
  148. {
  149. if ((s[3] ^ 0x80) < 0x40)
  150. {
  151. if ((s[4] ^ 0x80) < 0x40)
  152. {
  153. if (c >= 0xf9 || s[1] >= 0x88)
  154. #endif
  155. {
  156. *puc = ((unsigned int) (c & 0x03) << 24)
  157. | ((unsigned int) (s[1] ^ 0x80) << 18)
  158. | ((unsigned int) (s[2] ^ 0x80) << 12)
  159. | ((unsigned int) (s[3] ^ 0x80) << 6)
  160. | (unsigned int) (s[4] ^ 0x80);
  161. return 5;
  162. }
  163. #if CONFIG_UNICODE_SAFETY
  164. /* invalid multibyte character */
  165. *puc = 0xfffd;
  166. return 5;
  167. }
  168. /* invalid multibyte character */
  169. *puc = 0xfffd;
  170. return 4;
  171. }
  172. /* invalid multibyte character */
  173. *puc = 0xfffd;
  174. return 3;
  175. }
  176. /* invalid multibyte character */
  177. return 2;
  178. }
  179. /* invalid multibyte character */
  180. #endif
  181. }
  182. else
  183. {
  184. /* incomplete multibyte character */
  185. *puc = 0xfffd;
  186. return n;
  187. }
  188. }
  189. else if (c < 0xfe)
  190. {
  191. if (n >= 6)
  192. {
  193. #if CONFIG_UNICODE_SAFETY
  194. if ((s[1] ^ 0x80) < 0x40)
  195. {
  196. if ((s[2] ^ 0x80) < 0x40)
  197. {
  198. if ((s[3] ^ 0x80) < 0x40)
  199. {
  200. if ((s[4] ^ 0x80) < 0x40)
  201. {
  202. if ((s[5] ^ 0x80) < 0x40)
  203. {
  204. if (c >= 0xfd || s[1] >= 0x84)
  205. #endif
  206. {
  207. *puc = ((unsigned int) (c & 0x01) << 30)
  208. | ((unsigned int) (s[1] ^ 0x80) << 24)
  209. | ((unsigned int) (s[2] ^ 0x80) << 18)
  210. | ((unsigned int) (s[3] ^ 0x80) << 12)
  211. | ((unsigned int) (s[4] ^ 0x80) << 6)
  212. | (unsigned int) (s[5] ^ 0x80);
  213. return 6;
  214. }
  215. #if CONFIG_UNICODE_SAFETY
  216. /* invalid multibyte character */
  217. *puc = 0xfffd;
  218. return 6;
  219. }
  220. /* invalid multibyte character */
  221. *puc = 0xfffd;
  222. return 5;
  223. }
  224. /* invalid multibyte character */
  225. *puc = 0xfffd;
  226. return 4;
  227. }
  228. /* invalid multibyte character */
  229. *puc = 0xfffd;
  230. return 3;
  231. }
  232. /* invalid multibyte character */
  233. return 2;
  234. }
  235. /* invalid multibyte character */
  236. #endif
  237. }
  238. else
  239. {
  240. /* incomplete multibyte character */
  241. *puc = 0xfffd;
  242. return n;
  243. }
  244. }
  245. #endif
  246. }
  247. /* invalid multibyte character */
  248. *puc = 0xfffd;
  249. return 1;
  250. }
  251. #endif