sync.lua 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. -- Notes on incremental sync:
  2. -- Per the protocol, the text range should be:
  3. --
  4. -- A position inside a document (see Position definition below) is expressed as
  5. -- a zero-based line and character offset. The offsets are based on a UTF-16
  6. -- string representation. So a string of the form a𐐀b the character offset
  7. -- of the character a is 0, the character offset of 𐐀 is 1 and the character
  8. -- offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
  9. --
  10. -- To ensure that both client and server split the string into the same line
  11. -- representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
  12. --
  13. -- Positions are line end character agnostic. So you can not specify a position that
  14. -- denotes \r|\n or \n| where | represents the character offset. This means *no* defining
  15. -- a range than ends on the same line after a terminating character
  16. --
  17. -- Generic warnings about byte level changes in neovim. Many apparently "single"
  18. -- operations in on_lines callbacks are actually multiple operations.
  19. --
  20. -- Join operation (2 operations):
  21. -- * extends line 1 with the contents of line 2
  22. -- * deletes line 2
  23. --
  24. -- test 1 test 1 test 2 test 1 test 2
  25. -- test 2 -> test 2 -> test 3
  26. -- test 3 test 3
  27. --
  28. -- Deleting (and undoing) two middle lines (1 operation):
  29. --
  30. -- test 1 test 1
  31. -- test 2 -> test 4
  32. -- test 3
  33. -- test 4
  34. --
  35. -- Deleting partial lines (5 operations) deleting between asterisks below:
  36. --
  37. -- test *1 test * test * test * test *4 test *4*
  38. -- test 2 -> test 2 -> test *4 -> *4 -> *4 ->
  39. -- test 3 test 3
  40. -- test *4 test 4
  41. local M = {}
  42. -- local string.byte, unclear if this is necessary for JIT compilation
  43. local str_byte = string.byte
  44. local min = math.min
  45. local str_utfindex = vim.str_utfindex
  46. local str_utf_start = vim.str_utf_start
  47. local str_utf_end = vim.str_utf_end
  48. ---@private
  49. -- Given a line, byte idx, and offset_encoding convert to the
  50. -- utf-8, utf-16, or utf-32 index.
  51. ---@param line string the line to index into
  52. ---@param byte integer the byte idx
  53. ---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
  54. --@returns integer the utf idx for the given encoding
  55. local function byte_to_utf(line, byte, offset_encoding)
  56. -- convert to 0 based indexing for str_utfindex
  57. byte = byte - 1
  58. local utf_idx
  59. local _
  60. -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
  61. if offset_encoding == 'utf-16' then
  62. _, utf_idx = str_utfindex(line, byte)
  63. elseif offset_encoding == 'utf-32' then
  64. utf_idx, _ = str_utfindex(line, byte)
  65. else
  66. utf_idx = byte
  67. end
  68. -- convert to 1 based indexing
  69. return utf_idx + 1
  70. end
  71. ---@private
  72. local function compute_line_length(line, offset_encoding)
  73. local length
  74. local _
  75. if offset_encoding == 'utf-16' then
  76. _, length = str_utfindex(line)
  77. elseif offset_encoding == 'utf-32' then
  78. length, _ = str_utfindex(line)
  79. else
  80. length = #line
  81. end
  82. return length
  83. end
  84. ---@private
  85. -- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
  86. -- utf-8 index and either the utf-16, or utf-32 index.
  87. ---@param line string the line to index into
  88. ---@param byte integer the byte idx
  89. ---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
  90. ---@returns table<string, int> byte_idx and char_idx of first change position
  91. local function align_end_position(line, byte, offset_encoding)
  92. local char
  93. -- If on the first byte, or an empty string: the trivial case
  94. if byte == 1 or #line == 0 then
  95. char = byte
  96. -- Called in the case of extending an empty line "" -> "a"
  97. elseif byte == #line + 1 then
  98. char = compute_line_length(line, offset_encoding) + 1
  99. else
  100. -- Modifying line, find the nearest utf codepoint
  101. local offset = str_utf_start(line, byte)
  102. -- If the byte does not fall on the start of the character, then
  103. -- align to the start of the next character.
  104. if offset < 0 then
  105. byte = byte + str_utf_end(line, byte) + 1
  106. end
  107. if byte <= #line then
  108. char = byte_to_utf(line, byte, offset_encoding)
  109. else
  110. char = compute_line_length(line, offset_encoding) + 1
  111. end
  112. -- Extending line, find the nearest utf codepoint for the last valid character
  113. end
  114. return byte, char
  115. end
  116. ---@private
  117. --- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint.
  118. ---@param prev_lines table list of lines from previous buffer
  119. ---@param curr_lines table list of lines from current buffer
  120. ---@param firstline integer firstline from on_lines, adjusted to 1-index
  121. ---@param lastline integer lastline from on_lines, adjusted to 1-index
  122. ---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
  123. ---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
  124. ---@returns table<int, int> line_idx, byte_idx, and char_idx of first change position
  125. local function compute_start_range(
  126. prev_lines,
  127. curr_lines,
  128. firstline,
  129. lastline,
  130. new_lastline,
  131. offset_encoding
  132. )
  133. local char_idx
  134. local byte_idx
  135. -- If firstline == lastline, no existing text is changed. All edit operations
  136. -- occur on a new line pointed to by lastline. This occurs during insertion of
  137. -- new lines(O), the new newline is inserted at the line indicated by
  138. -- new_lastline.
  139. if firstline == lastline then
  140. local line_idx
  141. local line = prev_lines[firstline - 1]
  142. if line then
  143. line_idx = firstline - 1
  144. byte_idx = #line + 1
  145. char_idx = compute_line_length(line, offset_encoding) + 1
  146. else
  147. line_idx = firstline
  148. byte_idx = 1
  149. char_idx = 1
  150. end
  151. return { line_idx = line_idx, byte_idx = byte_idx, char_idx = char_idx }
  152. end
  153. -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  154. -- In this case, the first byte change is also at the first byte of firstline
  155. if firstline == new_lastline then
  156. return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  157. end
  158. local prev_line = prev_lines[firstline]
  159. local curr_line = curr_lines[firstline]
  160. -- Iterate across previous and current line containing first change
  161. -- to find the first different byte.
  162. -- Note: *about -> a*about will register the second a as the first
  163. -- difference, regardless of edit since we do not receive the first
  164. -- column of the edit from on_lines.
  165. local start_byte_idx = 1
  166. for idx = 1, #prev_line + 1 do
  167. start_byte_idx = idx
  168. if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then
  169. break
  170. end
  171. end
  172. -- Convert byte to codepoint if applicable
  173. if start_byte_idx == 1 or (#prev_line == 0 and start_byte_idx == 1) then
  174. byte_idx = start_byte_idx
  175. char_idx = 1
  176. elseif start_byte_idx == #prev_line + 1 then
  177. byte_idx = start_byte_idx
  178. char_idx = compute_line_length(prev_line, offset_encoding) + 1
  179. else
  180. byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
  181. char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
  182. end
  183. -- Return the start difference (shared for new and prev lines)
  184. return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
  185. end
  186. ---@private
  187. --- Finds the last line and byte index of the differences between prev and current buffer.
  188. --- Normalized to the next codepoint.
  189. --- prev_end_range is the text range sent to the server representing the changed region.
  190. --- curr_end_range is the text that should be collected and sent to the server.
  191. --
  192. ---@param prev_lines table list of lines
  193. ---@param curr_lines table list of lines
  194. ---@param start_range table
  195. ---@param lastline integer
  196. ---@param new_lastline integer
  197. ---@param offset_encoding string
  198. ---@returns (int, int) end_line_idx and end_col_idx of range
  199. local function compute_end_range(
  200. prev_lines,
  201. curr_lines,
  202. start_range,
  203. firstline,
  204. lastline,
  205. new_lastline,
  206. offset_encoding
  207. )
  208. -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  209. -- In this case, the last_byte...
  210. if firstline == new_lastline then
  211. return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, {
  212. line_idx = firstline,
  213. byte_idx = 1,
  214. char_idx = 1,
  215. }
  216. end
  217. if firstline == lastline then
  218. return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, {
  219. line_idx = new_lastline - lastline + firstline,
  220. byte_idx = 1,
  221. char_idx = 1,
  222. }
  223. end
  224. -- Compare on last line, at minimum will be the start range
  225. local start_line_idx = start_range.line_idx
  226. -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines
  227. local prev_line_idx = lastline - 1
  228. local curr_line_idx = new_lastline - 1
  229. local prev_line = prev_lines[lastline - 1]
  230. local curr_line = curr_lines[new_lastline - 1]
  231. local prev_line_length = #prev_line
  232. local curr_line_length = #curr_line
  233. local byte_offset = 0
  234. -- Editing the same line
  235. -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
  236. if prev_line_idx == curr_line_idx then
  237. local max_length
  238. if start_line_idx == prev_line_idx then
  239. -- Search until beginning of difference
  240. max_length = min(
  241. prev_line_length - start_range.byte_idx,
  242. curr_line_length - start_range.byte_idx
  243. ) + 1
  244. else
  245. max_length = min(prev_line_length, curr_line_length) + 1
  246. end
  247. for idx = 0, max_length do
  248. byte_offset = idx
  249. if
  250. str_byte(prev_line, prev_line_length - byte_offset)
  251. ~= str_byte(curr_line, curr_line_length - byte_offset)
  252. then
  253. break
  254. end
  255. end
  256. end
  257. -- Iterate from end to beginning of shortest line
  258. local prev_end_byte_idx = prev_line_length - byte_offset + 1
  259. -- Handle case where lines match
  260. if prev_end_byte_idx == 0 then
  261. prev_end_byte_idx = 1
  262. end
  263. local prev_byte_idx, prev_char_idx =
  264. align_end_position(prev_line, prev_end_byte_idx, offset_encoding)
  265. local prev_end_range =
  266. { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }
  267. local curr_end_range
  268. -- Deletion event, new_range cannot be before start
  269. if curr_line_idx < start_line_idx then
  270. curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
  271. else
  272. local curr_end_byte_idx = curr_line_length - byte_offset + 1
  273. -- Handle case where lines match
  274. if curr_end_byte_idx == 0 then
  275. curr_end_byte_idx = 1
  276. end
  277. local curr_byte_idx, curr_char_idx =
  278. align_end_position(curr_line, curr_end_byte_idx, offset_encoding)
  279. curr_end_range =
  280. { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
  281. end
  282. return prev_end_range, curr_end_range
  283. end
  284. ---@private
  285. --- Get the text of the range defined by start and end line/column
  286. ---@param lines table list of lines
  287. ---@param start_range table table returned by first_difference
  288. ---@param end_range table new_end_range returned by last_difference
  289. ---@returns string text extracted from defined region
  290. local function extract_text(lines, start_range, end_range, line_ending)
  291. if not lines[start_range.line_idx] then
  292. return ''
  293. end
  294. -- Trivial case: start and end range are the same line, directly grab changed text
  295. if start_range.line_idx == end_range.line_idx then
  296. -- string.sub is inclusive, end_range is not
  297. return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1)
  298. else
  299. -- Handle deletion case
  300. -- Collect the changed portion of the first changed line
  301. local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) }
  302. -- Collect the full line for intermediate lines
  303. for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
  304. table.insert(result, lines[idx])
  305. end
  306. if lines[end_range.line_idx] then
  307. -- Collect the changed portion of the last changed line.
  308. table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1))
  309. else
  310. table.insert(result, '')
  311. end
  312. -- Add line ending between all lines
  313. return table.concat(result, line_ending)
  314. end
  315. end
  316. ---@private
  317. -- rangelength depends on the offset encoding
  318. -- bytes for utf-8 (clangd with extension)
  319. -- codepoints for utf-16
  320. -- codeunits for utf-32
  321. -- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac)
  322. -- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior)
  323. local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
  324. local line_ending_length = #line_ending
  325. -- Single line case
  326. if start_range.line_idx == end_range.line_idx then
  327. return end_range.char_idx - start_range.char_idx
  328. end
  329. local start_line = lines[start_range.line_idx]
  330. local range_length
  331. if start_line and #start_line > 0 then
  332. range_length = compute_line_length(start_line, offset_encoding)
  333. - start_range.char_idx
  334. + 1
  335. + line_ending_length
  336. else
  337. -- Length of newline character
  338. range_length = line_ending_length
  339. end
  340. -- The first and last range of the line idx may be partial lines
  341. for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
  342. -- Length full line plus newline character
  343. if #lines[idx] > 0 then
  344. range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending
  345. else
  346. range_length = range_length + line_ending_length
  347. end
  348. end
  349. local end_line = lines[end_range.line_idx]
  350. if end_line and #end_line > 0 then
  351. range_length = range_length + end_range.char_idx - 1
  352. end
  353. return range_length
  354. end
  355. --- Returns the range table for the difference between prev and curr lines
  356. ---@param prev_lines table list of lines
  357. ---@param curr_lines table list of lines
  358. ---@param firstline number line to begin search for first difference
  359. ---@param lastline number line to begin search in old_lines for last difference
  360. ---@param new_lastline number line to begin search in new_lines for last difference
  361. ---@param offset_encoding string encoding requested by language server
  362. ---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent
  363. function M.compute_diff(
  364. prev_lines,
  365. curr_lines,
  366. firstline,
  367. lastline,
  368. new_lastline,
  369. offset_encoding,
  370. line_ending
  371. )
  372. -- Find the start of changes between the previous and current buffer. Common between both.
  373. -- Sent to the server as the start of the changed range.
  374. -- Used to grab the changed text from the latest buffer.
  375. local start_range = compute_start_range(
  376. prev_lines,
  377. curr_lines,
  378. firstline + 1,
  379. lastline + 1,
  380. new_lastline + 1,
  381. offset_encoding
  382. )
  383. -- Find the last position changed in the previous and current buffer.
  384. -- prev_end_range is sent to the server as as the end of the changed range.
  385. -- curr_end_range is used to grab the changed text from the latest buffer.
  386. local prev_end_range, curr_end_range = compute_end_range(
  387. prev_lines,
  388. curr_lines,
  389. start_range,
  390. firstline + 1,
  391. lastline + 1,
  392. new_lastline + 1,
  393. offset_encoding
  394. )
  395. -- Grab the changed text of from start_range to curr_end_range in the current buffer.
  396. -- The text range is "" if entire range is deleted.
  397. local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)
  398. -- Compute the range of the replaced text. Deprecated but still required for certain language servers
  399. local range_length =
  400. compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)
  401. -- convert to 0 based indexing
  402. local result = {
  403. range = {
  404. ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 },
  405. ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 },
  406. },
  407. text = text,
  408. rangeLength = range_length,
  409. }
  410. return result
  411. end
  412. return M