diff.nim 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements an algorithm to compute the
  10. ## `diff`:idx: between two sequences of lines.
  11. ##
  12. ## - To learn more see `Diff on Wikipedia. <https://wikipedia.org/wiki/Diff>`_
  13. runnableExamples:
  14. assert diffInt(
  15. [0, 1, 2, 3, 4, 5, 6, 7, 8],
  16. [-1, 1, 2, 3, 4, 5, 666, 7, 42]) ==
  17. @[Item(startA: 0, startB: 0, deletedA: 1, insertedB: 1),
  18. Item(startA: 6, startB: 6, deletedA: 1, insertedB: 1),
  19. Item(startA: 8, startB: 8, deletedA: 1, insertedB: 1)]
  20. runnableExamples:
  21. # 2 samples of text (from "The Call of Cthulhu" by Lovecraft)
  22. let txt0 = """
  23. abc
  24. def ghi
  25. jkl2"""
  26. let txt1 = """
  27. bacx
  28. abc
  29. def ghi
  30. jkl"""
  31. assert diffText(txt0, txt1) ==
  32. @[Item(startA: 0, startB: 0, deletedA: 0, insertedB: 1),
  33. Item(startA: 2, startB: 3, deletedA: 1, insertedB: 1)]
  34. # code owner: Arne Döring
  35. #
  36. # This is based on C# code written by Matthias Hertel, https://www.mathertel.de
  37. #
  38. # This Class implements the Difference Algorithm published in
  39. # "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
  40. # Algorithmica Vol. 1 No. 2, 1986, p 251.
  41. import std/[tables, strutils]
  42. when defined(nimPreviewSlimSystem):
  43. import std/assertions
  44. type
  45. Item* = object ## An Item in the list of differences.
  46. startA*: int ## Start Line number in Data A.
  47. startB*: int ## Start Line number in Data B.
  48. deletedA*: int ## Number of changes in Data A.
  49. insertedB*: int ## Number of changes in Data B.
  50. DiffData = object ## Data on one input file being compared.
  51. data: seq[int] ## Buffer of numbers that will be compared.
  52. modified: seq[bool] ## Array of booleans that flag for modified
  53. ## data. This is the result of the diff.
  54. ## This means deletedA in the first Data or
  55. ## inserted in the second Data.
  56. Smsrd = object
  57. x, y: int
  58. # template to avoid a seq copy. Required until `sink` parameters are ready.
  59. template newDiffData(initData: seq[int]; L: int): DiffData =
  60. DiffData(
  61. data: initData,
  62. modified: newSeq[bool](L + 2)
  63. )
  64. proc len(d: DiffData): int {.inline.} = d.data.len
  65. proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
  66. ## This function converts all textlines of the text into unique numbers for every unique textline
  67. ## so further work can work only with simple numbers.
  68. ## `aText` the input text
  69. ## `h` This extern initialized hashtable is used for storing all ever used textlines.
  70. ## `trimSpace` ignore leading and trailing space characters
  71. ## Returns a array of integers.
  72. var lastUsedCode = h.len
  73. result = DiffData(data: newSeq[int]())
  74. for s in aText.splitLines:
  75. if h.contains s:
  76. result.data.add h[s]
  77. else:
  78. inc lastUsedCode
  79. h[s] = lastUsedCode
  80. result.data.add lastUsedCode
  81. result.modified = newSeq[bool](result.data.len + 2)
  82. proc optimize(data: var DiffData) =
  83. ## If a sequence of modified lines starts with a line that contains the same content
  84. ## as the line that appends the changes, the difference sequence is modified so that the
  85. ## appended line and not the starting line is marked as modified.
  86. ## This leads to more readable diff sequences when comparing text files.
  87. var startPos = 0
  88. while startPos < data.len:
  89. while startPos < data.len and not data.modified[startPos]:
  90. inc startPos
  91. var endPos = startPos
  92. while endPos < data.len and data.modified[endPos]:
  93. inc endPos
  94. if endPos < data.len and data.data[startPos] == data.data[endPos]:
  95. data.modified[startPos] = false
  96. data.modified[endPos] = true
  97. else:
  98. startPos = endPos
  99. proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
  100. downVector, upVector: var openArray[int]): Smsrd =
  101. ## This is the algorithm to find the Shortest Middle Snake (sms).
  102. ## `dataA` sequence A
  103. ## `lowerA` lower bound of the actual range in dataA
  104. ## `upperA` upper bound of the actual range in dataA (exclusive)
  105. ## `dataB` sequence B
  106. ## `lowerB` lower bound of the actual range in dataB
  107. ## `upperB` upper bound of the actual range in dataB (exclusive)
  108. ## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  109. ## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
  110. ## Returns a MiddleSnakeData record containing x,y and u,v.
  111. result = Smsrd()
  112. let max = dataA.len + dataB.len + 1
  113. let downK = lowerA - lowerB # the k-line to start the forward search
  114. let upK = upperA - upperB # the k-line to start the reverse search
  115. let delta = (upperA - lowerA) - (upperB - lowerB)
  116. let oddDelta = (delta and 1) != 0
  117. # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
  118. # and are access using a specific offset: upOffset upVector and downOffset for downVector
  119. let downOffset = max - downK
  120. let upOffset = max - upK
  121. let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
  122. downVector[downOffset + downK + 1] = lowerA
  123. upVector[upOffset + upK - 1] = upperA
  124. for D in 0 .. maxD:
  125. # Extend the forward path.
  126. for k in countup(downK - D, downK + D, 2):
  127. # find the only or better starting point
  128. var x: int
  129. if k == downK - D:
  130. x = downVector[downOffset + k + 1] # down
  131. else:
  132. x = downVector[downOffset + k - 1] + 1 # a step to the right
  133. if k < downK + D and downVector[downOffset + k + 1] >= x:
  134. x = downVector[downOffset + k + 1] # down
  135. var y = x - k
  136. # find the end of the furthest reaching forward D-path in diagonal k.
  137. while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
  138. inc x
  139. inc y
  140. downVector[downOffset + k] = x
  141. # overlap ?
  142. if oddDelta and upK - D < k and k < upK + D:
  143. if upVector[upOffset + k] <= downVector[downOffset + k]:
  144. return Smsrd(x: downVector[downOffset + k],
  145. y: downVector[downOffset + k] - k)
  146. # Extend the reverse path.
  147. for k in countup(upK - D, upK + D, 2):
  148. # find the only or better starting point
  149. var x: int
  150. if k == upK + D:
  151. x = upVector[upOffset + k - 1] # up
  152. else:
  153. x = upVector[upOffset + k + 1] - 1 # left
  154. if k > upK - D and upVector[upOffset + k - 1] < x:
  155. x = upVector[upOffset + k - 1] # up
  156. var y = x - k
  157. while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
  158. dec x
  159. dec y
  160. upVector[upOffset + k] = x
  161. # overlap ?
  162. if not oddDelta and downK-D <= k and k <= downK+D:
  163. if upVector[upOffset + k] <= downVector[downOffset + k]:
  164. return Smsrd(x: downVector[downOffset + k],
  165. y: downVector[downOffset + k] - k)
  166. assert false, "the algorithm should never come here."
  167. proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
  168. downVector, upVector: var openArray[int]) =
  169. ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
  170. ## algorithm.
  171. ## The published algorithm passes recursively parts of the A and B sequences.
  172. ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
  173. ## `dataA` sequence A
  174. ## `lowerA` lower bound of the actual range in dataA
  175. ## `upperA` upper bound of the actual range in dataA (exclusive)
  176. ## `dataB` sequence B
  177. ## `lowerB` lower bound of the actual range in dataB
  178. ## `upperB` upper bound of the actual range in dataB (exclusive)
  179. ## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  180. ## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
  181. # make mutable copy:
  182. var lowerA = lowerA
  183. var lowerB = lowerB
  184. var upperA = upperA
  185. var upperB = upperB
  186. # Fast walkthrough equal lines at the start
  187. while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
  188. inc lowerA
  189. inc lowerB
  190. # Fast walkthrough equal lines at the end
  191. while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
  192. dec upperA
  193. dec upperB
  194. if lowerA == upperA:
  195. # mark as inserted lines.
  196. while lowerB < upperB:
  197. dataB.modified[lowerB] = true
  198. inc lowerB
  199. elif lowerB == upperB:
  200. # mark as deleted lines.
  201. while lowerA < upperA:
  202. dataA.modified[lowerA] = true
  203. inc lowerA
  204. else:
  205. # Find the middle snake and length of an optimal path for A and B
  206. let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
  207. # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
  208. # The path is from LowerX to (x,y) and (x,y) to UpperX
  209. lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
  210. lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points
  211. proc createDiffs(dataA, dataB: DiffData): seq[Item] =
  212. ## Scan the tables of which lines are inserted and deleted,
  213. ## producing an edit script in forward order.
  214. result = @[]
  215. var startA = 0
  216. var startB = 0
  217. var lineA = 0
  218. var lineB = 0
  219. while lineA < dataA.len or lineB < dataB.len:
  220. if lineA < dataA.len and not dataA.modified[lineA] and
  221. lineB < dataB.len and not dataB.modified[lineB]:
  222. # equal lines
  223. inc lineA
  224. inc lineB
  225. else:
  226. # maybe deleted and/or inserted lines
  227. startA = lineA
  228. startB = lineB
  229. while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
  230. inc lineA
  231. while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
  232. inc lineB
  233. if (startA < lineA) or (startB < lineB):
  234. result.add Item(startA: startA,
  235. startB: startB,
  236. deletedA: lineA - startA,
  237. insertedB: lineB - startB)
  238. proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
  239. ## Find the difference in 2 arrays of integers.
  240. ##
  241. ## `arrayA` A-version of the numbers (usually the old one)
  242. ##
  243. ## `arrayB` B-version of the numbers (usually the new one)
  244. ##
  245. ## Returns a sequence of Items that describe the differences.
  246. # The A-Version of the data (original data) to be compared.
  247. var dataA = newDiffData(@arrayA, arrayA.len)
  248. # The B-Version of the data (modified data) to be compared.
  249. var dataB = newDiffData(@arrayB, arrayB.len)
  250. let max = dataA.len + dataB.len + 1
  251. # vector for the (0,0) to (x,y) search
  252. var downVector = newSeq[int](2 * max + 2)
  253. # vector for the (u,v) to (N,M) search
  254. var upVector = newSeq[int](2 * max + 2)
  255. lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
  256. result = createDiffs(dataA, dataB)
  257. proc diffText*(textA, textB: string): seq[Item] =
  258. ## Find the difference in 2 text documents, comparing by textlines.
  259. ##
  260. ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
  261. ## each line is converted into a (hash) number. This hash-value is computed by storing all
  262. ## textlines into a common hashtable so i can find duplicates in there, and generating a
  263. ## new number each time a new textline is inserted.
  264. ##
  265. ## `textA` A-version of the text (usually the old one)
  266. ##
  267. ## `textB` B-version of the text (usually the new one)
  268. ##
  269. ## Returns a seq of Items that describe the differences.
  270. # See also `gitutils.diffStrings`.
  271. # prepare the input-text and convert to comparable numbers.
  272. var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size
  273. # The A-Version of the data (original data) to be compared.
  274. var dataA = diffCodes(textA, h)
  275. # The B-Version of the data (modified data) to be compared.
  276. var dataB = diffCodes(textB, h)
  277. h.clear # free up hashtable memory (maybe)
  278. let max = dataA.len + dataB.len + 1
  279. # vector for the (0,0) to (x,y) search
  280. var downVector = newSeq[int](2 * max + 2)
  281. # vector for the (u,v) to (N,M) search
  282. var upVector = newSeq[int](2 * max + 2)
  283. lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
  284. optimize(dataA)
  285. optimize(dataB)
  286. result = createDiffs(dataA, dataB)