mbyte_spec.lua 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. local t = require('test.unit.testutil')
  2. local itp = t.gen_itp(it)
  3. local ffi = t.ffi
  4. local eq = t.eq
  5. local to_cstr = t.to_cstr
  6. local lib = t.cimport(
  7. './src/nvim/mbyte.h',
  8. './src/nvim/charset.h',
  9. './src/nvim/grid.h',
  10. './src/nvim/option_vars.h'
  11. )
  12. describe('mbyte', function()
  --- Build a Lua string from a sequence of byte values.
  -- @param bytes array of integers; each one is passed to string.char
  -- @return string consisting of those bytes, in order
  local function to_string(bytes)
    local chars = {}
    for idx = 1, #bytes do
      chars[#chars + 1] = string.char(bytes[idx])
    end
    return table.concat(chars)
  end
  -- Per-test setup hook; its body is currently empty.
  before_each(function()
    -- nothing to prepare
  end)
  itp('utf_ptr2char', function()
    -- A single byte followed by NUL must come back as the value of that byte,
    -- for every possible byte value.
    for byte_value = 0, 255 do
      local input = to_string({ byte_value, 0 })
      eq(byte_value, lib.utf_ptr2char(input))
    end

    -- Disabled checks: ill-formed byte sequences that should not be recognized
    -- as UTF-8 (first byte 0xc0 or 0xc1, second byte 0x80 .. 0xbf).
    --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes (no checks written here)
  end)
  -- Register one test per 0x1000-wide slice of the range 0x0000 .. 0xFFFF.
  for n = 0, 0xF do
    local first = n * 0x1000
    local last = first + 0xFFF
    local title = ('utf_char2bytes for chars 0x%x - 0x%x'):format(first, last)
    itp(title, function()
      local new_buf = ffi.typeof('char[?]')
      for codepoint = first, last do
        -- fresh 4-element char buffer, initialised with 0, for every code point
        local encoded = new_buf(4, 0)
        lib.utf_char2bytes(codepoint, encoded)
        -- encoding followed by decoding must give back the code point
        eq(codepoint, lib.utf_ptr2char(encoded))
        -- the value-based and pointer-based word-character checks must agree
        eq(lib.vim_iswordc(codepoint), lib.vim_iswordp(encoded))
      end
    end)
  end
  46. describe('utfc_ptr2schar', function()
  --- Feed a byte sequence to lib.utfc_ptr2schar and collect what it produced.
  -- @param seq array of byte values; converted with to_string before the call
  -- @return two-element table: { text written by lib.schar_get, value stored in the int out-parameter }
  local function test_seq(seq)
    local dump_bytes = false -- flip to true to print the resulting bytes while debugging
    local first_char = ffi.new('int[1]') -- out-parameter passed to utfc_ptr2schar
    local out = ffi.new('char[32]')
    local schar = lib.utfc_ptr2schar(to_string(seq), first_char)
    lib.schar_get(out, schar)
    local text = ffi.string(out)
    if dump_bytes then
      local hex = {}
      for idx = 1, #text do
        hex[#hex + 1] = string.format('0x%02x', string.byte(text, idx))
      end
      print('{ ' .. table.concat(hex, ', ') .. ' }')
      io.stdout:flush()
    end
    return { text, first_char[0] }
  end
  --- Pair a byte value with its one-character string.
  -- @param val byte value, passed to string.char
  -- @return { one-character string for val, val }
  local function byte(val)
    local text = string.char(val)
    return { text, val }
  end
  itp('1-byte sequences', function()
    -- NUL: empty text, first character 0
    eq({ '', 0 }, test_seq({ 0 }))
    -- 1 .. 127 come back as themselves; 128 .. 255 give empty text while the
    -- byte value is still reported as the first character
    for value = 1, 255 do
      local expected
      if value <= 127 then
        expected = byte(value)
      else
        expected = { '', value }
      end
      eq(expected, test_seq({ value }))
    end
  end)
  itp('2-byte sequences', function()
    -- Each case: `want` is the expected test_seq result for input `bytes`.
    local cases = {
      -- No combining characters
      { want = byte(0x7f), bytes = { 0x7f, 0x7f } },
      -- No combining characters
      { want = byte(0x7f), bytes = { 0x7f, 0x80 } },
      -- No UTF-8 sequence
      { want = { '', 0xc2 }, bytes = { 0xc2, 0x7f } },
      -- One UTF-8 character
      { want = { '\xc2\x80', 0x80 }, bytes = { 0xc2, 0x80 } },
      -- No UTF-8 sequence
      { want = { '', 0xc2 }, bytes = { 0xc2, 0xc0 } },
    }
    for _, case in ipairs(cases) do
      eq(case.want, test_seq(case.bytes))
    end
  end)
  itp('3-byte sequences', function()
    -- Each case: `want` is the expected test_seq result for input `bytes`.
    local cases = {
      -- No second UTF-8 character
      { want = byte(0x7f), bytes = { 0x7f, 0x80, 0x80 } },
      -- No combining character
      { want = byte(0x7f), bytes = { 0x7f, 0xc2, 0x80 } },
      -- Combining character is U+0300
      { want = { '\x29\xcc\x80', 0x29 }, bytes = { 0x29, 0xcc, 0x80 } },
      -- invalid start byte for combining
      { want = { '\x7f', 0x7f }, bytes = { 0x7f, 0xcc, 0x80 } },
      -- No UTF-8 sequence
      { want = { '', 0xc2 }, bytes = { 0xc2, 0x7f, 0xcc } },
      -- Incomplete combining character
      { want = { '\xc2\x80', 0x80 }, bytes = { 0xc2, 0x80, 0xcc } },
      -- One UTF-8 character (composing only); note the leading space in the text
      { want = { ' \xe2\x83\x90', 0x20d0 }, bytes = { 0xe2, 0x83, 0x90 } },
    }
    for _, case in ipairs(cases) do
      eq(case.want, test_seq(case.bytes))
    end
  end)
  itp('4-byte sequences', function()
    -- Each case: `want` is the expected test_seq result for input `bytes`.
    local cases = {
      -- No following combining character
      { want = byte(0x7f), bytes = { 0x7f, 0x7f, 0xcc, 0x80 } },
      { want = byte(0x29), bytes = { 0x29, 0x29, 0xcc, 0x80 } },
      -- No second UTF-8 character
      { want = byte(0x7f), bytes = { 0x7f, 0xc2, 0xcc, 0x80 } },
      -- Combining character U+0300
      { want = { '\x29\xcc\x80', 0x29 }, bytes = { 0x29, 0xcc, 0x80, 0xcc } },
      -- No UTF-8 sequence
      { want = { '', 0xc2 }, bytes = { 0xc2, 0x7f, 0xcc, 0x80 } },
      -- No following UTF-8 character
      { want = { '\xc2\x80', 0x80 }, bytes = { 0xc2, 0x80, 0xcc, 0xcc } },
      -- Combining character U+0301
      { want = { '\xc2\xbc\xcc\x81', 0xbc }, bytes = { 0xc2, 0xbc, 0xcc, 0x81 } },
      -- U+0080 : not a valid start char
      { want = { '\xc2\x80', 0x80 }, bytes = { 0xc2, 0x80, 0xcc, 0x81 } },
      -- One UTF-8 character
      { want = { '\xf4\x80\x80\x80', 0x100000 }, bytes = { 0xf4, 0x80, 0x80, 0x80 } },
    }
    for _, case in ipairs(cases) do
      eq(case.want, test_seq(case.bytes))
    end
  end)
  itp('5+-byte sequences', function()
    -- Each case: `want` is the expected test_seq result for input `bytes`.
    local cases = {
      -- No following combining character
      { want = byte(0x7f), bytes = { 0x7f, 0x7f, 0xcc, 0x80, 0x80 } },
      -- No second UTF-8 character
      { want = byte(0x7f), bytes = { 0x7f, 0xc2, 0xcc, 0x80, 0x80 } },
      -- Combining character U+0300
      { want = { '\x29\xcc\x80', 0x29 }, bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x00 } },
      -- Combining characters U+0300 and U+0301
      { want = { '\x29\xcc\x80\xcc\x81', 0x29 }, bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81 } },
      -- Combining characters U+0300, U+0301, U+0302
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 },
      },
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
        bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 },
      },
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
        bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 },
      },
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
        bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 },
      },
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
        bytes = {
          0x29,
          0xcc, 0x80,
          0xcc, 0x81,
          0xcc, 0x82,
          0xcc, 0x83,
          0xcc, 0x84,
          0xcc, 0x85,
          0xcc, 0x86,
        },
      },
      -- Only three following combining characters U+0300, U+0301, U+0302
      {
        want = { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        bytes = { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 },
      },
      -- No UTF-8 sequence
      { want = { '', 0xc2 }, bytes = { 0xc2, 0x7f, 0xcc, 0x80, 0x80 } },
      -- No following UTF-8 character
      { want = { '\xc2\xbc', 0xbc }, bytes = { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 } },
      -- Combining character U+0301
      { want = { '\xc2\xbc\xcc\x81', 0xbc }, bytes = { 0xc2, 0xbc, 0xcc, 0x81, 0x7f } },
      -- Combining character U+0301
      { want = { '\xc2\xbc\xcc\x81', 0xbc }, bytes = { 0xc2, 0xbc, 0xcc, 0x81, 0xcc } },
      -- One UTF-8 character
      { want = { '\xf4\x80\x80\x80', 0x100000 }, bytes = { 0xf4, 0x80, 0x80, 0x80, 0x7f } },
      -- One UTF-8 character
      { want = { '\xf4\x80\x80\x80', 0x100000 }, bytes = { 0xf4, 0x80, 0x80, 0x80, 0x80 } },
      -- One UTF-8 character
      { want = { '\xf4\x80\x80\x80', 0x100000 }, bytes = { 0xf4, 0x80, 0x80, 0x80, 0xcc } },
      -- Combining characters U+1AB0 and U+0301
      {
        want = { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
        bytes = { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 },
      },
    }
    for _, case in ipairs(cases) do
      eq(case.want, test_seq(case.bytes))
    end
  end)
  196. end)
  describe('utf_cp_bounds_len', function()
    --- Call lib.utf_cp_bounds_len at every byte offset of a buffer.
    -- @param cstr pointer passed as the first argument of every call
    -- @param len number of bytes to probe, starting at cstr
    -- @return { b = begin_off values, e = end_off values }, one entry per offset
    local function collect_offsets(cstr, len)
      local b_offsets, e_offsets = {}, {}
      for off = 0, len - 1 do
        -- the length argument shrinks as the probed position moves forward
        local result = lib.utf_cp_bounds_len(cstr, cstr + off, len - off)
        table.insert(b_offsets, result.begin_off)
        table.insert(e_offsets, result.end_off)
      end
      return { b = b_offsets, e = e_offsets }
    end

    -- For each string: expected begin_off (b) and end_off (e) at every byte offset.
    local tests = {
      {
        name = 'for valid string',
        str = 'iÀiiⱠiⱠⱠ𐀀i',
        offsets = {
          b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
          e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
        },
      },
      {
        name = 'for string with incomplete sequence',
        str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80',
        offsets = {
          b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
          e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
        },
      },
      {
        name = 'for string with trailing bytes after multibyte',
        str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i',
        offsets = {
          b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
          e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
        },
      },
    }

    for _, test in ipairs(tests) do
      itp(test.name, function()
        local cstr = to_cstr(test.str)
        eq(test.offsets, collect_offsets(cstr, #test.str))
      end)
    end

    itp('does not read before start', function()
      local str = '𐀀'
      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
      -- Keep the value returned by to_cstr in a local of its own: a pointer
      -- derived with `+ 1` does not keep a garbage-collected FFI object alive.
      -- NOTE(review): assumes to_cstr returns a GC-managed cdata buffer - confirm
      -- against test.unit.testutil.
      local buf = to_cstr(str)
      local cstr = buf + 1 -- start one byte into the buffer
      eq(expected_offsets, collect_offsets(cstr, 3))
    end)

    itp('does not read past the end', function()
      local str = '𐀀'
      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
      local cstr = to_cstr(str)
      -- only three bytes are offered to the function
      eq(expected_offsets, collect_offsets(cstr, 3))
    end)
  end)
  261. itp('utf_head_off', function()
  --- Split `str` into pieces with lib.utfc_ptr2len and verify lib.utf_head_off
  -- for every byte position inside each piece.
  -- @param str string under test
  -- @param expected_glyphs expected pieces longer than one byte, in order of appearance
  local function check(str, expected_glyphs)
    local len = #str
    local cstr = to_cstr(str)
    local breaks = { 0 } -- SOT
    local pos = 0
    local mb_glyphs = {}
    while pos < len do
      local clen = lib.utfc_ptr2len(cstr + pos)
      if clen == 0 then
        eq(0, string.byte(str, pos + 1)) -- only NUL bytes can have length zero
        clen = 1 -- but skip it, otherwise we get stuck
      end
      if clen > 1 then
        table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
      end
      pos = pos + clen
      table.insert(breaks, pos)
    end
    -- eq takes the expected value first, as in the other calls in this file
    eq(len, breaks[#breaks]) -- include EOT as break
    -- we could also send in breaks, but this is more human readable
    eq(expected_glyphs, mb_glyphs)
    for i = 1, #breaks - 1 do
      -- `stop` rather than `next`, so the builtin next() is not shadowed
      local start, stop = breaks[i], breaks[i + 1]
      for p = start, stop - 1 do
        -- the head offset is the distance back to the start of the piece
        eq(p - start, lib.utf_head_off(cstr, cstr + p))
      end
    end
    eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
  end
  291. -- stylua doesn't like ZWJ chars..
  292. -- stylua: ignore start
  293. check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
  294. -- emoji (various kinds of combinations, use g8 to see them)
  295. check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
  296. check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️€', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '€' })
  297. check('🏳️‍⚧️\000🧑‍🌾\000❤️\000😂\000å\000🏴‍☠️\000€', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '€' })
  298. check('\195🏳️‍⚧️\198🧑‍🌾\165❤️\168\195😂\255🏴‍☠️\129€\165', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', '🏴‍☠️', '€' })
  299. check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
  300. check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
  301. check('å\165ü\195aëq\168β\000\169本\255', {'å', 'ü', 'ë', 'β', '本'})
  302. lib.p_arshape = true -- default
  303. check('سلام', { 'س', 'لا', 'م' })
  304. lib.p_arshape = false
  305. check('سلام', { 'س', 'ل', 'ا', 'م' })
  306. check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
  307. -- stylua: ignore end
  308. end)
  describe('utf_fold', function()
    -- Asserts that lib.utf_fold returns each of the given values unchanged.
    local function expect_unchanged(codepoints)
      for _, cp in ipairs(codepoints) do
        eq(cp, lib.utf_fold(cp))
      end
    end

    itp('does not crash with surrogates #30527', function()
      expect_unchanged({
        0xddfb, -- low surrogate, invalid as a character
        0xd800, -- high surrogate, invalid as a character
      })
    end)

    itp("doesn't crash on invalid codepoints", function()
      expect_unchanged({ 9000000, 0 })
    end)
  end)
  319. end)