u8-mbtouc.c 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
/* Look at first character in UTF-8 string.
   Copyright (C) 1999-2002, 2006-2007, 2009-2012 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  14. #include <config.h>
  15. #if defined IN_LIBUNISTRING
  16. /* Tell unistr.h to declare u8_mbtouc as 'extern', not 'static inline'. */
  17. # include "unistring-notinline.h"
  18. #endif
  19. /* Specification. */
  20. #include "unistr.h"
  21. #if !HAVE_INLINE
  22. int
  23. u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
  24. {
  25. uint8_t c = *s;
  26. if (c < 0x80)
  27. {
  28. *puc = c;
  29. return 1;
  30. }
  31. else if (c >= 0xc2)
  32. {
  33. if (c < 0xe0)
  34. {
  35. if (n >= 2)
  36. {
  37. if ((s[1] ^ 0x80) < 0x40)
  38. {
  39. *puc = ((unsigned int) (c & 0x1f) << 6)
  40. | (unsigned int) (s[1] ^ 0x80);
  41. return 2;
  42. }
  43. /* invalid multibyte character */
  44. }
  45. else
  46. {
  47. /* incomplete multibyte character */
  48. *puc = 0xfffd;
  49. return 1;
  50. }
  51. }
  52. else if (c < 0xf0)
  53. {
  54. if (n >= 3)
  55. {
  56. if ((s[1] ^ 0x80) < 0x40)
  57. {
  58. if ((s[2] ^ 0x80) < 0x40)
  59. {
  60. if ((c >= 0xe1 || s[1] >= 0xa0)
  61. && (c != 0xed || s[1] < 0xa0))
  62. {
  63. *puc = ((unsigned int) (c & 0x0f) << 12)
  64. | ((unsigned int) (s[1] ^ 0x80) << 6)
  65. | (unsigned int) (s[2] ^ 0x80);
  66. return 3;
  67. }
  68. /* invalid multibyte character */
  69. *puc = 0xfffd;
  70. return 3;
  71. }
  72. /* invalid multibyte character */
  73. *puc = 0xfffd;
  74. return 2;
  75. }
  76. /* invalid multibyte character */
  77. }
  78. else
  79. {
  80. /* incomplete multibyte character */
  81. *puc = 0xfffd;
  82. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  83. return 1;
  84. else
  85. return 2;
  86. }
  87. }
  88. else if (c < 0xf8)
  89. {
  90. if (n >= 4)
  91. {
  92. if ((s[1] ^ 0x80) < 0x40)
  93. {
  94. if ((s[2] ^ 0x80) < 0x40)
  95. {
  96. if ((s[3] ^ 0x80) < 0x40)
  97. {
  98. if ((c >= 0xf1 || s[1] >= 0x90)
  99. #if 1
  100. && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
  101. #endif
  102. )
  103. {
  104. *puc = ((unsigned int) (c & 0x07) << 18)
  105. | ((unsigned int) (s[1] ^ 0x80) << 12)
  106. | ((unsigned int) (s[2] ^ 0x80) << 6)
  107. | (unsigned int) (s[3] ^ 0x80);
  108. return 4;
  109. }
  110. /* invalid multibyte character */
  111. *puc = 0xfffd;
  112. return 4;
  113. }
  114. /* invalid multibyte character */
  115. *puc = 0xfffd;
  116. return 3;
  117. }
  118. /* invalid multibyte character */
  119. *puc = 0xfffd;
  120. return 2;
  121. }
  122. /* invalid multibyte character */
  123. }
  124. else
  125. {
  126. /* incomplete multibyte character */
  127. *puc = 0xfffd;
  128. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  129. return 1;
  130. else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
  131. return 2;
  132. else
  133. return 3;
  134. }
  135. }
  136. #if 0
  137. else if (c < 0xfc)
  138. {
  139. if (n >= 5)
  140. {
  141. if ((s[1] ^ 0x80) < 0x40)
  142. {
  143. if ((s[2] ^ 0x80) < 0x40)
  144. {
  145. if ((s[3] ^ 0x80) < 0x40)
  146. {
  147. if ((s[4] ^ 0x80) < 0x40)
  148. {
  149. if (c >= 0xf9 || s[1] >= 0x88)
  150. {
  151. *puc = ((unsigned int) (c & 0x03) << 24)
  152. | ((unsigned int) (s[1] ^ 0x80) << 18)
  153. | ((unsigned int) (s[2] ^ 0x80) << 12)
  154. | ((unsigned int) (s[3] ^ 0x80) << 6)
  155. | (unsigned int) (s[4] ^ 0x80);
  156. return 5;
  157. }
  158. /* invalid multibyte character */
  159. *puc = 0xfffd;
  160. return 5;
  161. }
  162. /* invalid multibyte character */
  163. *puc = 0xfffd;
  164. return 4;
  165. }
  166. /* invalid multibyte character */
  167. *puc = 0xfffd;
  168. return 3;
  169. }
  170. /* invalid multibyte character */
  171. return 2;
  172. }
  173. /* invalid multibyte character */
  174. }
  175. else
  176. {
  177. /* incomplete multibyte character */
  178. *puc = 0xfffd;
  179. return n;
  180. }
  181. }
  182. else if (c < 0xfe)
  183. {
  184. if (n >= 6)
  185. {
  186. if ((s[1] ^ 0x80) < 0x40)
  187. {
  188. if ((s[2] ^ 0x80) < 0x40)
  189. {
  190. if ((s[3] ^ 0x80) < 0x40)
  191. {
  192. if ((s[4] ^ 0x80) < 0x40)
  193. {
  194. if ((s[5] ^ 0x80) < 0x40)
  195. {
  196. if (c >= 0xfd || s[1] >= 0x84)
  197. {
  198. *puc = ((unsigned int) (c & 0x01) << 30)
  199. | ((unsigned int) (s[1] ^ 0x80) << 24)
  200. | ((unsigned int) (s[2] ^ 0x80) << 18)
  201. | ((unsigned int) (s[3] ^ 0x80) << 12)
  202. | ((unsigned int) (s[4] ^ 0x80) << 6)
  203. | (unsigned int) (s[5] ^ 0x80);
  204. return 6;
  205. }
  206. /* invalid multibyte character */
  207. *puc = 0xfffd;
  208. return 6;
  209. }
  210. /* invalid multibyte character */
  211. *puc = 0xfffd;
  212. return 5;
  213. }
  214. /* invalid multibyte character */
  215. *puc = 0xfffd;
  216. return 4;
  217. }
  218. /* invalid multibyte character */
  219. *puc = 0xfffd;
  220. return 3;
  221. }
  222. /* invalid multibyte character */
  223. return 2;
  224. }
  225. /* invalid multibyte character */
  226. }
  227. else
  228. {
  229. /* incomplete multibyte character */
  230. *puc = 0xfffd;
  231. return n;
  232. }
  233. }
  234. #endif
  235. }
  236. /* invalid multibyte character */
  237. *puc = 0xfffd;
  238. return 1;
  239. }
  240. #endif