mbyte_spec.lua 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. local helpers = require("test.unit.helpers")(after_each)
  2. local itp = helpers.gen_itp(it)
  3. local ffi = helpers.ffi
  4. local eq = helpers.eq
  5. local mbyte = helpers.cimport("./src/nvim/mbyte.h")
  6. local charset = helpers.cimport('./src/nvim/charset.h')
  7. describe('mbyte', function()
  -- Cdata array type used to receive composing characters.
  local intp = ffi.typeof('int[?]')

  -- Returns a fresh 7-element int array created with the single initializer 1.
  -- The "5+-byte sequences" cases expect index 6 to still read 1 after a call
  -- that stored six composing characters.
  local function to_intp()
    -- how to get MAX_MCO from globals.h?
    local nelem, fill = 7, 1
    return intp(nelem, fill)
  end
  -- Build a Lua string whose bytes are the numeric values listed in `bytes`.
  local function to_string(bytes)
    local chars = {}
    for idx, byte in ipairs(bytes) do
      chars[idx] = string.char(byte)
    end
    return table.concat(chars)
  end
  -- No per-test setup is performed; the hook is registered with an empty body.
  before_each(function() end)
  itp('utf_ptr2char', function()
    -- A single byte followed by NUL decodes to the value of that byte;
    -- checked for every possible byte value.
    for byte = 0, 255 do
      local input = to_string({byte, 0})
      eq(byte, mbyte.utf_ptr2char(input))
    end

    -- Not yet enabled:
    -- Some ill-formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
    --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)
  -- Registers one test per 0x1000-wide range of code points, covering
  -- 0x0000 through 0xFFFF.
  for n = 0, 0xF do
    local first = n * 0x1000
    local last = first + 0xFFF
    itp(('utf_char2bytes for chars 0x%x - 0x%x'):format(first, last), function()
      local char_p = ffi.typeof('char[?]')
      for c = first, last do
        -- Encode into a zeroed 4-byte buffer, then decode it again.
        local p = char_p(4, 0)
        mbyte.utf_char2bytes(c, p)
        eq(c, mbyte.utf_ptr2char(p))
        -- The code-point and pointer variants of the word-character test
        -- must agree.
        eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
      end
    end)
  end
  48. describe('utfc_ptr2char_len', function()
  itp('1-byte sequences', function()
    -- A single composing-character array is shared by all 256 calls.
    local pcc = to_intp()
    for byte = 0, 255 do
      local decoded = mbyte.utfc_ptr2char_len(to_string({byte}), pcc, 1)
      eq(byte, decoded)
      -- With a length of 1 no composing character is expected.
      eq(0, pcc[0])
    end
  end)
  itp('2-byte sequences', function()
    local len = 2
    -- Each case lists the input bytes and the character expected back.
    -- Every case also expects the first composing-character slot to be 0.
    local cases = {
      -- No combining characters
      { bytes = {0x7f, 0x7f}, char = 0x007f },
      -- No combining characters
      { bytes = {0x7f, 0x80}, char = 0x007f },
      -- No UTF-8 sequence
      { bytes = {0xc2, 0x7f}, char = 0x00c2 },
      -- One UTF-8 character
      { bytes = {0xc2, 0x80}, char = 0x0080 },
      -- No UTF-8 sequence
      { bytes = {0xc2, 0xc0}, char = 0x00c2 },
    }
    for _, tc in ipairs(cases) do
      local pcc = to_intp()
      eq(tc.char, mbyte.utfc_ptr2char_len(to_string(tc.bytes), pcc, len))
      eq(0, pcc[0])
    end
  end)
  itp('3-byte sequences', function()
    local len = 3
    -- Each case lists the input bytes, the character expected back, and the
    -- expected leading entries of the composing-character array (from index 0).
    local cases = {
      -- No second UTF-8 character
      { bytes = {0x7f, 0x80, 0x80}, char = 0x007f, pcc = {0} },
      -- No combining character
      { bytes = {0x7f, 0xc2, 0x80}, char = 0x007f, pcc = {0} },
      -- Combining character is U+0300
      { bytes = {0x7f, 0xcc, 0x80}, char = 0x007f, pcc = {0x0300, 0x0000} },
      -- No UTF-8 sequence
      { bytes = {0xc2, 0x7f, 0xcc}, char = 0x00c2, pcc = {0} },
      -- Incomplete combining character
      { bytes = {0xc2, 0x80, 0xcc}, char = 0x0080, pcc = {0} },
      -- One UTF-8 character
      { bytes = {0xe2, 0x83, 0x90}, char = 0x20d0, pcc = {0} },
    }
    for _, tc in ipairs(cases) do
      local pcc = to_intp()
      eq(tc.char, mbyte.utfc_ptr2char_len(to_string(tc.bytes), pcc, len))
      -- The expectation list is 1-based, the cdata array is 0-based.
      for i, expected in ipairs(tc.pcc) do
        eq(expected, pcc[i - 1])
      end
    end
  end)
  itp('4-byte sequences', function()
    local len = 4
    -- Each case lists the input bytes, the character expected back, and the
    -- expected leading entries of the composing-character array (from index 0).
    local cases = {
      -- No following combining character
      { bytes = {0x7f, 0x7f, 0xcc, 0x80}, char = 0x007f, pcc = {0} },
      -- No second UTF-8 character
      { bytes = {0x7f, 0xc2, 0xcc, 0x80}, char = 0x007f, pcc = {0} },
      -- Combining character U+0300
      { bytes = {0x7f, 0xcc, 0x80, 0xcc}, char = 0x007f, pcc = {0x0300, 0x0000} },
      -- No UTF-8 sequence
      { bytes = {0xc2, 0x7f, 0xcc, 0x80}, char = 0x00c2, pcc = {0} },
      -- No following UTF-8 character
      { bytes = {0xc2, 0x80, 0xcc, 0xcc}, char = 0x0080, pcc = {0} },
      -- Combining character U+0301
      { bytes = {0xc2, 0x80, 0xcc, 0x81}, char = 0x0080, pcc = {0x0301, 0x0000} },
      -- One UTF-8 character
      { bytes = {0xf4, 0x80, 0x80, 0x80}, char = 0x100000, pcc = {0} },
    }
    for _, tc in ipairs(cases) do
      local pcc = to_intp()
      eq(tc.char, mbyte.utfc_ptr2char_len(to_string(tc.bytes), pcc, len))
      -- The expectation list is 1-based, the cdata array is 0-based.
      for i, expected in ipairs(tc.pcc) do
        eq(expected, pcc[i - 1])
      end
    end
  end)
  itp('5+-byte sequences', function()
    -- Decodes `bytes` and checks the returned base character plus the leading
    -- entries of the composing-character array.
    --
    -- base:      character expected from utfc_ptr2char_len()
    -- bytes:     input bytes; the length passed to the function is always
    --            #bytes, so the declared length cannot drift from the data
    -- composing: expected values of pcc[0], pcc[1], ... in order
    local function check(base, bytes, composing)
      local pcc = to_intp()
      eq(base, mbyte.utfc_ptr2char_len(to_string(bytes), pcc, #bytes))
      -- The expectation list is 1-based, the cdata array is 0-based.
      for i, expected in ipairs(composing) do
        eq(expected, pcc[i - 1])
      end
    end

    -- No following combining character
    check(0x007f, {0x7f, 0x7f, 0xcc, 0x80, 0x80}, {0})

    -- No second UTF-8 character
    check(0x007f, {0x7f, 0xc2, 0xcc, 0x80, 0x80}, {0})

    -- Combining character U+0300, followed by an incomplete sequence.
    -- The trailing NUL is listed explicitly so that the byte count matches
    -- the length of 5 handed to the function.
    check(0x007f, {0x7f, 0xcc, 0x80, 0xcc, 0x00}, {0x0300, 0x0000})

    -- Combining characters U+0300 and U+0301
    check(0x007f, {0x7f, 0xcc, 0x80, 0xcc, 0x81}, {0x0300, 0x0301, 0x0000})

    -- Combining characters U+0300, U+0301, U+0302
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82},
      {0x0300, 0x0301, 0x0302, 0x0000})

    -- Combining characters U+0300, U+0301, U+0302, U+0303
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83},
      {0x0300, 0x0301, 0x0302, 0x0303, 0x0000})

    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84},
      {0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0000})

    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
    -- U+0305. Index 6 is expected to still hold 1, the value to_intp()
    -- creates the array with.
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85},
      {0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0001})

    -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
    -- U+0305, U+0306, but only save six (= MAX_MCO).
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86},
      {0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0001})

    -- Only three following combining characters U+0300, U+0301, U+0302
    check(0x007f,
      {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85},
      {0x0300, 0x0301, 0x0302, 0x0000})

    -- No UTF-8 sequence
    check(0x00c2, {0xc2, 0x7f, 0xcc, 0x80, 0x80}, {0})

    -- No following UTF-8 character
    check(0x0080, {0xc2, 0x80, 0xcc, 0xcc, 0x80}, {0})

    -- Combining character U+0301
    check(0x0080, {0xc2, 0x80, 0xcc, 0x81, 0x7f}, {0x0301, 0x0000})

    -- Combining character U+0301
    check(0x0080, {0xc2, 0x80, 0xcc, 0x81, 0xcc}, {0x0301, 0x0000})

    -- One UTF-8 character
    check(0x100000, {0xf4, 0x80, 0x80, 0x80, 0x7f}, {0})

    -- One UTF-8 character
    check(0x100000, {0xf4, 0x80, 0x80, 0x80, 0x80}, {0})

    -- One UTF-8 character
    check(0x100000, {0xf4, 0x80, 0x80, 0x80, 0xcc}, {0})

    -- Combining characters U+1AB0 and U+0301
    check(0x100000,
      {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81},
      {0x1ab0, 0x0301, 0x0000})
  end)
  252. end)
  253. end)