u8-mbtouc-aux.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. /* Conversion UTF-8 to UCS-4.
  2. Copyright (C) 2001-2002, 2006-2007, 2009-2014 Free Software Foundation, Inc.
  3. Written by Bruno Haible <bruno@clisp.org>, 2001.
  4. This program is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Lesser General Public License as published
  6. by the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. #if defined IN_LIBUNISTRING || HAVE_INLINE
  18. int
  19. u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
  20. {
  21. uint8_t c = *s;
  22. if (c >= 0xc2)
  23. {
  24. if (c < 0xe0)
  25. {
  26. if (n >= 2)
  27. {
  28. if ((s[1] ^ 0x80) < 0x40)
  29. {
  30. *puc = ((unsigned int) (c & 0x1f) << 6)
  31. | (unsigned int) (s[1] ^ 0x80);
  32. return 2;
  33. }
  34. /* invalid multibyte character */
  35. }
  36. else
  37. {
  38. /* incomplete multibyte character */
  39. *puc = 0xfffd;
  40. return 1;
  41. }
  42. }
  43. else if (c < 0xf0)
  44. {
  45. if (n >= 3)
  46. {
  47. if ((s[1] ^ 0x80) < 0x40)
  48. {
  49. if ((s[2] ^ 0x80) < 0x40)
  50. {
  51. if ((c >= 0xe1 || s[1] >= 0xa0)
  52. && (c != 0xed || s[1] < 0xa0))
  53. {
  54. *puc = ((unsigned int) (c & 0x0f) << 12)
  55. | ((unsigned int) (s[1] ^ 0x80) << 6)
  56. | (unsigned int) (s[2] ^ 0x80);
  57. return 3;
  58. }
  59. /* invalid multibyte character */
  60. *puc = 0xfffd;
  61. return 3;
  62. }
  63. /* invalid multibyte character */
  64. *puc = 0xfffd;
  65. return 2;
  66. }
  67. /* invalid multibyte character */
  68. }
  69. else
  70. {
  71. /* incomplete multibyte character */
  72. *puc = 0xfffd;
  73. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  74. return 1;
  75. else
  76. return 2;
  77. }
  78. }
  79. else if (c < 0xf8)
  80. {
  81. if (n >= 4)
  82. {
  83. if ((s[1] ^ 0x80) < 0x40)
  84. {
  85. if ((s[2] ^ 0x80) < 0x40)
  86. {
  87. if ((s[3] ^ 0x80) < 0x40)
  88. {
  89. if ((c >= 0xf1 || s[1] >= 0x90)
  90. #if 1
  91. && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
  92. #endif
  93. )
  94. {
  95. *puc = ((unsigned int) (c & 0x07) << 18)
  96. | ((unsigned int) (s[1] ^ 0x80) << 12)
  97. | ((unsigned int) (s[2] ^ 0x80) << 6)
  98. | (unsigned int) (s[3] ^ 0x80);
  99. return 4;
  100. }
  101. /* invalid multibyte character */
  102. *puc = 0xfffd;
  103. return 4;
  104. }
  105. /* invalid multibyte character */
  106. *puc = 0xfffd;
  107. return 3;
  108. }
  109. /* invalid multibyte character */
  110. *puc = 0xfffd;
  111. return 2;
  112. }
  113. /* invalid multibyte character */
  114. }
  115. else
  116. {
  117. /* incomplete multibyte character */
  118. *puc = 0xfffd;
  119. if (n == 1 || (s[1] ^ 0x80) >= 0x40)
  120. return 1;
  121. else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
  122. return 2;
  123. else
  124. return 3;
  125. }
  126. }
  127. #if 0
  128. else if (c < 0xfc)
  129. {
  130. if (n >= 5)
  131. {
  132. if ((s[1] ^ 0x80) < 0x40)
  133. {
  134. if ((s[2] ^ 0x80) < 0x40)
  135. {
  136. if ((s[3] ^ 0x80) < 0x40)
  137. {
  138. if ((s[4] ^ 0x80) < 0x40)
  139. {
  140. if (c >= 0xf9 || s[1] >= 0x88)
  141. {
  142. *puc = ((unsigned int) (c & 0x03) << 24)
  143. | ((unsigned int) (s[1] ^ 0x80) << 18)
  144. | ((unsigned int) (s[2] ^ 0x80) << 12)
  145. | ((unsigned int) (s[3] ^ 0x80) << 6)
  146. | (unsigned int) (s[4] ^ 0x80);
  147. return 5;
  148. }
  149. /* invalid multibyte character */
  150. *puc = 0xfffd;
  151. return 5;
  152. }
  153. /* invalid multibyte character */
  154. *puc = 0xfffd;
  155. return 4;
  156. }
  157. /* invalid multibyte character */
  158. *puc = 0xfffd;
  159. return 3;
  160. }
  161. /* invalid multibyte character */
  162. return 2;
  163. }
  164. /* invalid multibyte character */
  165. }
  166. else
  167. {
  168. /* incomplete multibyte character */
  169. *puc = 0xfffd;
  170. return n;
  171. }
  172. }
  173. else if (c < 0xfe)
  174. {
  175. if (n >= 6)
  176. {
  177. if ((s[1] ^ 0x80) < 0x40)
  178. {
  179. if ((s[2] ^ 0x80) < 0x40)
  180. {
  181. if ((s[3] ^ 0x80) < 0x40)
  182. {
  183. if ((s[4] ^ 0x80) < 0x40)
  184. {
  185. if ((s[5] ^ 0x80) < 0x40)
  186. {
  187. if (c >= 0xfd || s[1] >= 0x84)
  188. {
  189. *puc = ((unsigned int) (c & 0x01) << 30)
  190. | ((unsigned int) (s[1] ^ 0x80) << 24)
  191. | ((unsigned int) (s[2] ^ 0x80) << 18)
  192. | ((unsigned int) (s[3] ^ 0x80) << 12)
  193. | ((unsigned int) (s[4] ^ 0x80) << 6)
  194. | (unsigned int) (s[5] ^ 0x80);
  195. return 6;
  196. }
  197. /* invalid multibyte character */
  198. *puc = 0xfffd;
  199. return 6;
  200. }
  201. /* invalid multibyte character */
  202. *puc = 0xfffd;
  203. return 5;
  204. }
  205. /* invalid multibyte character */
  206. *puc = 0xfffd;
  207. return 4;
  208. }
  209. /* invalid multibyte character */
  210. *puc = 0xfffd;
  211. return 3;
  212. }
  213. /* invalid multibyte character */
  214. return 2;
  215. }
  216. /* invalid multibyte character */
  217. }
  218. else
  219. {
  220. /* incomplete multibyte character */
  221. *puc = 0xfffd;
  222. return n;
  223. }
  224. }
  225. #endif
  226. }
  227. /* invalid multibyte character */
  228. *puc = 0xfffd;
  229. return 1;
  230. }
  231. #endif