diff.nim 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements an algorithm to compute the
  10. ## `diff`:idx: between two sequences of lines.
  11. ##
  12. ## A basic example of ``diffInt`` on 2 arrays of integers:
  13. ##
  14. ## .. code::nim
  15. ##
  16. ## import experimental/diff
  17. ## echo diffInt([0, 1, 2, 3, 4, 5, 6, 7, 8], [-1, 1, 2, 3, 4, 5, 666, 7, 42])
  18. ##
  19. ## Another short example of ``diffText`` to diff strings:
  20. ##
  21. ## .. code::nim
  22. ##
  23. ## import experimental/diff
  24. ## # 2 samples of text for testing (from "The Call of Cthulhu" by Lovecraft)
  25. ## let txt0 = """I have looked upon all the universe has to hold of horror,
  26. ## even skies of spring and flowers of summer must ever be poison to me."""
  27. ## let txt1 = """I have looked upon all your code has to hold of bugs,
  28. ## even skies of spring and flowers of summer must ever be poison to me."""
  29. ##
  30. ## echo diffText(txt0, txt1)
  31. ##
  32. ## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_
  33. # code owner: Arne Döring
  34. #
  35. # This is based on C# code written by Matthias Hertel, http://www.mathertel.de
  36. #
  37. # This Class implements the Difference Algorithm published in
  38. # "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
  39. # Algorithmica Vol. 1 No. 2, 1986, p 251.
  40. import tables, strutils
  41. type
  42. Item* = object ## An Item in the list of differences.
  43. startA*: int ## Start Line number in Data A.
  44. startB*: int ## Start Line number in Data B.
  45. deletedA*: int ## Number of changes in Data A.
  46. insertedB*: int ## Number of changes in Data B.
  47. DiffData = object ## Data on one input file being compared.
  48. data: seq[int] ## Buffer of numbers that will be compared.
  49. modified: seq[bool] ## Array of booleans that flag for modified
  50. ## data. This is the result of the diff.
  51. ## This means deletedA in the first Data or
  52. ## inserted in the second Data.
  53. Smsrd = object
  54. x, y: int
  55. # template to avoid a seq copy. Required until ``sink`` parameters are ready.
  56. template newDiffData(initData: seq[int]; L: int): DiffData =
  57. DiffData(
  58. data: initData,
  59. modified: newSeq[bool](L + 2)
  60. )
  61. proc len(d: DiffData): int {.inline.} = d.data.len
  62. proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
  63. ## This function converts all textlines of the text into unique numbers for every unique textline
  64. ## so further work can work only with simple numbers.
  65. ## ``aText`` the input text
  66. ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
  67. ## ``trimSpace`` ignore leading and trailing space characters
  68. ## Returns a array of integers.
  69. var lastUsedCode = h.len
  70. result.data = newSeq[int]()
  71. for s in aText.splitLines:
  72. if h.contains s:
  73. result.data.add h[s]
  74. else:
  75. inc lastUsedCode
  76. h[s] = lastUsedCode
  77. result.data.add lastUsedCode
  78. result.modified = newSeq[bool](result.data.len + 2)
  79. proc optimize(data: var DiffData) =
  80. ## If a sequence of modified lines starts with a line that contains the same content
  81. ## as the line that appends the changes, the difference sequence is modified so that the
  82. ## appended line and not the starting line is marked as modified.
  83. ## This leads to more readable diff sequences when comparing text files.
  84. var startPos = 0
  85. while startPos < data.len:
  86. while startPos < data.len and not data.modified[startPos]:
  87. inc startPos
  88. var endPos = startPos
  89. while endPos < data.len and data.modified[endPos]:
  90. inc endPos
  91. if endPos < data.len and data.data[startPos] == data.data[endPos]:
  92. data.modified[startPos] = false
  93. data.modified[endPos] = true
  94. else:
  95. startPos = endPos
  96. proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
  97. downVector, upVector: var openArray[int]): Smsrd =
  98. ## This is the algorithm to find the Shortest Middle Snake (sms).
  99. ## ``dataA`` sequence A
  100. ## ``lowerA`` lower bound of the actual range in dataA
  101. ## ``upperA`` upper bound of the actual range in dataA (exclusive)
  102. ## ``dataB`` sequence B
  103. ## ``lowerB`` lower bound of the actual range in dataB
  104. ## ``upperB`` upper bound of the actual range in dataB (exclusive)
  105. ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  106. ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
  107. ## Returns a MiddleSnakeData record containing x,y and u,v.
  108. let max = dataA.len + dataB.len + 1
  109. let downK = lowerA - lowerB # the k-line to start the forward search
  110. let upK = upperA - upperB # the k-line to start the reverse search
  111. let delta = (upperA - lowerA) - (upperB - lowerB)
  112. let oddDelta = (delta and 1) != 0
  113. # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
  114. # and are access using a specific offset: upOffset upVector and downOffset for downVector
  115. let downOffset = max - downK
  116. let upOffset = max - upK
  117. let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
  118. downVector[downOffset + downK + 1] = lowerA
  119. upVector[upOffset + upK - 1] = upperA
  120. for D in 0 .. maxD:
  121. # Extend the forward path.
  122. for k in countUp(downK - D, downK + D, 2):
  123. # find the only or better starting point
  124. var x: int
  125. if k == downK - D:
  126. x = downVector[downOffset + k + 1] # down
  127. else:
  128. x = downVector[downOffset + k - 1] + 1 # a step to the right
  129. if k < downK + D and downVector[downOffset + k + 1] >= x:
  130. x = downVector[downOffset + k + 1] # down
  131. var y = x - k
  132. # find the end of the furthest reaching forward D-path in diagonal k.
  133. while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
  134. inc x
  135. inc y
  136. downVector[downOffset + k] = x
  137. # overlap ?
  138. if oddDelta and upK - D < k and k < upK + D:
  139. if upVector[upOffset + k] <= downVector[downOffset + k]:
  140. return Smsrd(x: downVector[downOffset + k],
  141. y: downVector[downOffset + k] - k)
  142. # Extend the reverse path.
  143. for k in countUp(upK - D, upK + D, 2):
  144. # find the only or better starting point
  145. var x: int
  146. if k == upK + D:
  147. x = upVector[upOffset + k - 1] # up
  148. else:
  149. x = upVector[upOffset + k + 1] - 1 # left
  150. if k > upK - D and upVector[upOffset + k - 1] < x:
  151. x = upVector[upOffset + k - 1] # up
  152. var y = x - k
  153. while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
  154. dec x
  155. dec y
  156. upVector[upOffset + k] = x
  157. # overlap ?
  158. if not oddDelta and downK-D <= k and k <= downK+D:
  159. if upVector[upOffset + k] <= downVector[downOffset + k]:
  160. return Smsrd(x: downVector[downOffset + k],
  161. y: downVector[downOffset + k] - k)
  162. assert false, "the algorithm should never come here."
  163. proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
  164. downVector, upVector: var openArray[int]) =
  165. ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
  166. ## algorithm.
  167. ## The published algorithm passes recursively parts of the A and B sequences.
  168. ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
  169. ## ``dataA`` sequence A
  170. ## ``lowerA`` lower bound of the actual range in dataA
  171. ## ``upperA`` upper bound of the actual range in dataA (exclusive)
  172. ## ``dataB`` sequence B
  173. ## ``lowerB`` lower bound of the actual range in dataB
  174. ## ``upperB`` upper bound of the actual range in dataB (exclusive)
  175. ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  176. ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
  177. # make mutable copy:
  178. var lowerA = lowerA
  179. var lowerB = lowerB
  180. var upperA = upperA
  181. var upperB = upperB
  182. # Fast walkthrough equal lines at the start
  183. while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
  184. inc lowerA
  185. inc lowerB
  186. # Fast walkthrough equal lines at the end
  187. while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
  188. dec upperA
  189. dec upperB
  190. if lowerA == upperA:
  191. # mark as inserted lines.
  192. while lowerB < upperB:
  193. dataB.modified[lowerB] = true
  194. inc lowerB
  195. elif lowerB == upperB:
  196. # mark as deleted lines.
  197. while lowerA < upperA:
  198. dataA.modified[lowerA] = true
  199. inc lowerA
  200. else:
  201. # Find the middle snakea and length of an optimal path for A and B
  202. let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
  203. # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
  204. # The path is from LowerX to (x,y) and (x,y) to UpperX
  205. lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
  206. lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points
  207. proc createDiffs(dataA, dataB: DiffData): seq[Item] =
  208. ## Scan the tables of which lines are inserted and deleted,
  209. ## producing an edit script in forward order.
  210. var startA = 0
  211. var startB = 0
  212. var lineA = 0
  213. var lineB = 0
  214. while lineA < dataA.len or lineB < dataB.len:
  215. if lineA < dataA.len and not dataA.modified[lineA] and
  216. lineB < dataB.len and not dataB.modified[lineB]:
  217. # equal lines
  218. inc lineA
  219. inc lineB
  220. else:
  221. # maybe deleted and/or inserted lines
  222. startA = lineA
  223. startB = lineB
  224. while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
  225. inc lineA
  226. while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
  227. inc lineB
  228. if (startA < lineA) or (startB < lineB):
  229. result.add Item(startA: startA,
  230. startB: startB,
  231. deletedA: lineA - startA,
  232. insertedB: lineB - startB)
  233. proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
  234. ## Find the difference in 2 arrays of integers.
  235. ##
  236. ## ``arrayA`` A-version of the numbers (usualy the old one)
  237. ##
  238. ## ``arrayB`` B-version of the numbers (usualy the new one)
  239. ##
  240. ## Returns a sequence of Items that describe the differences.
  241. # The A-Version of the data (original data) to be compared.
  242. var dataA = newDiffData(@arrayA, arrayA.len)
  243. # The B-Version of the data (modified data) to be compared.
  244. var dataB = newDiffData(@arrayB, arrayB.len)
  245. let max = dataA.len + dataB.len + 1
  246. ## vector for the (0,0) to (x,y) search
  247. var downVector = newSeq[int](2 * max + 2)
  248. ## vector for the (u,v) to (N,M) search
  249. var upVector = newSeq[int](2 * max + 2)
  250. lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
  251. result = createDiffs(dataA, dataB)
  252. proc diffText*(textA, textB: string): seq[Item] =
  253. ## Find the difference in 2 text documents, comparing by textlines.
  254. ##
  255. ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
  256. ## each line is converted into a (hash) number. This hash-value is computed by storing all
  257. ## textlines into a common hashtable so i can find dublicates in there, and generating a
  258. ## new number each time a new textline is inserted.
  259. ##
  260. ## ``textA`` A-version of the text (usually the old one)
  261. ##
  262. ## ``textB`` B-version of the text (usually the new one)
  263. ##
  264. ## Returns a seq of Items that describe the differences.
  265. # prepare the input-text and convert to comparable numbers.
  266. var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size
  267. # The A-Version of the data (original data) to be compared.
  268. var dataA = diffCodes(textA, h)
  269. # The B-Version of the data (modified data) to be compared.
  270. var dataB = diffCodes(textB, h)
  271. h.clear # free up hashtable memory (maybe)
  272. let max = dataA.len + dataB.len + 1
  273. ## vector for the (0,0) to (x,y) search
  274. var downVector = newSeq[int](2 * max + 2)
  275. ## vector for the (u,v) to (N,M) search
  276. var upVector = newSeq[int](2 * max + 2)
  277. lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
  278. optimize(dataA)
  279. optimize(dataB)
  280. result = createDiffs(dataA, dataB)
  281. when isMainModule:
  282. proc testHelper(f: seq[Item]): string =
  283. for it in f:
  284. result.add(
  285. $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
  286. )
  287. proc main() =
  288. var a, b: string
  289. stdout.writeLine("Diff Self Test...")
  290. # test all changes
  291. a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
  292. b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
  293. assert(testHelper(diffText(a, b)) ==
  294. "12.10.0.0*",
  295. "all-changes test failed.")
  296. stdout.writeLine("all-changes test passed.")
  297. # test all same
  298. a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
  299. b = a
  300. assert(testHelper(diffText(a, b)) ==
  301. "",
  302. "all-same test failed.")
  303. stdout.writeLine("all-same test passed.")
  304. # test snake
  305. a = "a,b,c,d,e,f".replace(',', '\n')
  306. b = "b,c,d,e,f,x".replace(',', '\n')
  307. assert(testHelper(diffText(a, b)) ==
  308. "1.0.0.0*0.1.6.5*",
  309. "snake test failed.")
  310. stdout.writeLine("snake test passed.")
  311. # 2002.09.20 - repro
  312. a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
  313. b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
  314. assert(testHelper(diffText(a, b)) ==
  315. "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
  316. "repro20020920 test failed.")
  317. stdout.writeLine("repro20020920 test passed.")
  318. # 2003.02.07 - repro
  319. a = "F".replace(',', '\n')
  320. b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
  321. assert(testHelper(diffText(a, b)) ==
  322. "0.1.0.0*0.7.1.2*",
  323. "repro20030207 test failed.")
  324. stdout.writeLine("repro20030207 test passed.")
  325. # Muegel - repro
  326. a = "HELLO\nWORLD"
  327. b = "\n\nhello\n\n\n\nworld\n"
  328. assert(testHelper(diffText(a, b)) ==
  329. "2.8.0.0*",
  330. "repro20030409 test failed.")
  331. stdout.writeLine("repro20030409 test passed.")
  332. # test some differences
  333. a = "a,b,-,c,d,e,f,f".replace(',', '\n')
  334. b = "a,b,x,c,e,f".replace(',', '\n')
  335. assert(testHelper(diffText(a, b)) ==
  336. "1.1.2.2*1.0.4.4*1.0.7.6*",
  337. "some-changes test failed.")
  338. stdout.writeLine("some-changes test passed.")
  339. # test one change within long chain of repeats
  340. a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
  341. b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
  342. assert(testHelper(diffText(a, b)) ==
  343. "0.1.4.4*1.0.9.10*",
  344. "long chain of repeats test failed.")
  345. stdout.writeLine("End.")
  346. stdout.flushFile
  347. main()