u8-mbtouc-unsafe.c 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
/* Look at first character in UTF-8 string.
   Copyright (C) 1999-2002, 2006-2007, 2009-2013 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. #if defined IN_LIBUNISTRING
  16. /* Tell unistr.h to declare u8_mbtouc_unsafe as 'extern', not
  17. 'static inline'. */
  18. # include "unistring-notinline.h"
  19. #endif
  20. /* Specification. */
  21. #include "unistr.h"
  22. #if !HAVE_INLINE
  23. int
  24. u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n)
  25. {
  26. uint8_t c = *s;
  27. if (c < 0x80)
  28. {
  29. *puc = c;
  30. return 1;
  31. }
  32. else if (c >= 0xc2)
  33. {
  34. if (c < 0xe0)
  35. {
  36. if (n >= 2)
  37. {
  38. #if CONFIG_UNICODE_SAFETY
  39. if ((s[1] ^ 0x80) < 0x40)
  40. #endif
  41. {
  42. *puc = ((unsigned int) (c & 0x1f) << 6)
  43. | (unsigned int) (s[1] ^ 0x80);
  44. return 2;
  45. }
  46. #if CONFIG_UNICODE_SAFETY
  47. /* invalid multibyte character */
  48. #endif
  49. }
  50. else
  51. {
  52. /* incomplete multibyte character */
  53. *puc = 0xfffd;
  54. return 1;
  55. }
  56. }
  57. else if (c < 0xf0)
  58. {
  59. if (n >= 3)
  60. {
  61. #if CONFIG_UNICODE_SAFETY
  62. if ((s[1] ^ 0x80) < 0x40)
  63. {
  64. if ((s[2] ^ 0x80) < 0x40)
  65. {
  66. if ((c >= 0xe1 || s[1] >= 0xa0)
  67. && (c != 0xed || s[1] < 0xa0))
  68. #endif
  69. {
  70. *puc = ((unsigned int) (c & 0x0f) << 12)
  71. | ((unsigned int) (s[1] ^ 0x80) << 6)
  72. | (unsigned int) (s[2] ^ 0x80);
  73. return 3;
  74. }
  75. #if CONFIG_UNICODE_SAFETY
  76. /* invalid multibyte character */
  77. *puc = 0xfffd;
  78. return 3;
  79. }
  80. /* invalid multibyte character */
  81. *puc = 0xfffd;
  82. return 2;
  83. }
  84. /* invalid multibyte character */
  85. #endif
  86. }
  87. else
  88. {
  89. /* incomplete multibyte character */
  90. *puc = 0xfffd;
  91. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  92. return 1;
  93. else
  94. return 2;
  95. }
  96. }
  97. else if (c < 0xf8)
  98. {
  99. if (n >= 4)
  100. {
  101. #if CONFIG_UNICODE_SAFETY
  102. if ((s[1] ^ 0x80) < 0x40)
  103. {
  104. if ((s[2] ^ 0x80) < 0x40)
  105. {
  106. if ((s[3] ^ 0x80) < 0x40)
  107. {
  108. if ((c >= 0xf1 || s[1] >= 0x90)
  109. #if 1
  110. && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
  111. #endif
  112. )
  113. #endif
  114. {
  115. *puc = ((unsigned int) (c & 0x07) << 18)
  116. | ((unsigned int) (s[1] ^ 0x80) << 12)
  117. | ((unsigned int) (s[2] ^ 0x80) << 6)
  118. | (unsigned int) (s[3] ^ 0x80);
  119. return 4;
  120. }
  121. #if CONFIG_UNICODE_SAFETY
  122. /* invalid multibyte character */
  123. *puc = 0xfffd;
  124. return 4;
  125. }
  126. /* invalid multibyte character */
  127. *puc = 0xfffd;
  128. return 3;
  129. }
  130. /* invalid multibyte character */
  131. *puc = 0xfffd;
  132. return 2;
  133. }
  134. /* invalid multibyte character */
  135. #endif
  136. }
  137. else
  138. {
  139. /* incomplete multibyte character */
  140. *puc = 0xfffd;
  141. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  142. return 1;
  143. else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
  144. return 2;
  145. else
  146. return 3;
  147. }
  148. }
  149. #if 0
  150. else if (c < 0xfc)
  151. {
  152. if (n >= 5)
  153. {
  154. #if CONFIG_UNICODE_SAFETY
  155. if ((s[1] ^ 0x80) < 0x40)
  156. {
  157. if ((s[2] ^ 0x80) < 0x40)
  158. {
  159. if ((s[3] ^ 0x80) < 0x40)
  160. {
  161. if ((s[4] ^ 0x80) < 0x40)
  162. {
  163. if (c >= 0xf9 || s[1] >= 0x88)
  164. #endif
  165. {
  166. *puc = ((unsigned int) (c & 0x03) << 24)
  167. | ((unsigned int) (s[1] ^ 0x80) << 18)
  168. | ((unsigned int) (s[2] ^ 0x80) << 12)
  169. | ((unsigned int) (s[3] ^ 0x80) << 6)
  170. | (unsigned int) (s[4] ^ 0x80);
  171. return 5;
  172. }
  173. #if CONFIG_UNICODE_SAFETY
  174. /* invalid multibyte character */
  175. *puc = 0xfffd;
  176. return 5;
  177. }
  178. /* invalid multibyte character */
  179. *puc = 0xfffd;
  180. return 4;
  181. }
  182. /* invalid multibyte character */
  183. *puc = 0xfffd;
  184. return 3;
  185. }
  186. /* invalid multibyte character */
  187. return 2;
  188. }
  189. /* invalid multibyte character */
  190. #endif
  191. }
  192. else
  193. {
  194. /* incomplete multibyte character */
  195. *puc = 0xfffd;
  196. return n;
  197. }
  198. }
  199. else if (c < 0xfe)
  200. {
  201. if (n >= 6)
  202. {
  203. #if CONFIG_UNICODE_SAFETY
  204. if ((s[1] ^ 0x80) < 0x40)
  205. {
  206. if ((s[2] ^ 0x80) < 0x40)
  207. {
  208. if ((s[3] ^ 0x80) < 0x40)
  209. {
  210. if ((s[4] ^ 0x80) < 0x40)
  211. {
  212. if ((s[5] ^ 0x80) < 0x40)
  213. {
  214. if (c >= 0xfd || s[1] >= 0x84)
  215. #endif
  216. {
  217. *puc = ((unsigned int) (c & 0x01) << 30)
  218. | ((unsigned int) (s[1] ^ 0x80) << 24)
  219. | ((unsigned int) (s[2] ^ 0x80) << 18)
  220. | ((unsigned int) (s[3] ^ 0x80) << 12)
  221. | ((unsigned int) (s[4] ^ 0x80) << 6)
  222. | (unsigned int) (s[5] ^ 0x80);
  223. return 6;
  224. }
  225. #if CONFIG_UNICODE_SAFETY
  226. /* invalid multibyte character */
  227. *puc = 0xfffd;
  228. return 6;
  229. }
  230. /* invalid multibyte character */
  231. *puc = 0xfffd;
  232. return 5;
  233. }
  234. /* invalid multibyte character */
  235. *puc = 0xfffd;
  236. return 4;
  237. }
  238. /* invalid multibyte character */
  239. *puc = 0xfffd;
  240. return 3;
  241. }
  242. /* invalid multibyte character */
  243. return 2;
  244. }
  245. /* invalid multibyte character */
  246. #endif
  247. }
  248. else
  249. {
  250. /* incomplete multibyte character */
  251. *puc = 0xfffd;
  252. return n;
  253. }
  254. }
  255. #endif
  256. }
  257. /* invalid multibyte character */
  258. *puc = 0xfffd;
  259. return 1;
  260. }
  261. #endif