editdistance.nim 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements an algorithm to compute the
  10. ## `edit distance`:idx: between two Unicode strings.
  11. import unicode
  12. proc editDistance*(a, b: string): int {.noSideEffect.} =
  13. ## Returns the **unicode-rune** edit distance between ``a`` and ``b``.
  14. ##
  15. ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  16. ## memory overhead.
  17. runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1
  18. if len(a) > len(b):
  19. # make ``b`` the longer string
  20. return editDistance(b, a)
  21. # strip common prefix
  22. var
  23. iStart = 0 ## The character starting index of the first rune in both strings ``a`` and ``b``
  24. iNextA = 0
  25. iNextB = 0
  26. runeA, runeB: Rune
  27. lenRunesA = 0 ## The number of relevant runes in string ``a``.
  28. lenRunesB = 0 ## The number of relevant runes in string ``b``.
  29. block commonPrefix:
  30. # ``a`` is the shorter string
  31. while iStart < len(a):
  32. iNextA = iStart
  33. a.fastRuneAt(iNextA, runeA, doInc = true)
  34. iNextB = iStart
  35. b.fastRuneAt(iNextB, runeB, doInc = true)
  36. if runeA != runeB:
  37. inc(lenRunesA)
  38. inc(lenRunesB)
  39. break
  40. iStart = iNextA
  41. var
  42. # we know that we are either at the start of the strings
  43. # or that the current value of runeA is not equal to runeB
  44. # => start search for common suffix after the current rune (``i_next_*``)
  45. iEndA = iNextA ## The exclusive upper index bound of string ``a``.
  46. iEndB = iNextB ## The exclusive upper index bound of string ``b``.
  47. iCurrentA = iNextA
  48. iCurrentB = iNextB
  49. block commonSuffix:
  50. var
  51. addRunesA = 0
  52. addRunesB = 0
  53. while iCurrentA < len(a) and iCurrentB < len(b):
  54. iNextA = iCurrentA
  55. a.fastRuneAt(iNextA, runeA)
  56. iNextB = iCurrentB
  57. b.fastRuneAt(iNextB, runeB)
  58. inc(addRunesA)
  59. inc(addRunesB)
  60. if runeA != runeB:
  61. iEndA = iNextA
  62. iEndB = iNextB
  63. inc(lenRunesA, addRunesA)
  64. inc(lenRunesB, addRunesB)
  65. addRunesA = 0
  66. addRunesB = 0
  67. iCurrentA = iNextA
  68. iCurrentB = iNextB
  69. if iCurrentA >= len(a): # ``a`` exhausted
  70. if iCurrentB < len(b): # ``b`` not exhausted
  71. iEndA = iCurrentA
  72. iEndB = iCurrentB
  73. inc(lenRunesA, addRunesA)
  74. inc(lenRunesB, addRunesB)
  75. while true:
  76. b.fastRuneAt(iEndB, runeB)
  77. inc(lenRunesB)
  78. if iEndB >= len(b): break
  79. elif iCurrentB >= len(b): # ``b`` exhausted and ``a`` not exhausted
  80. iEndA = iCurrentA
  81. iEndB = iCurrentB
  82. inc(lenRunesA, addRunesA)
  83. inc(lenRunesB, addRunesB)
  84. while true:
  85. a.fastRuneAt(iEndA, runeA)
  86. inc(lenRunesA)
  87. if iEndA >= len(a): break
  88. block specialCases:
  89. # trivial cases:
  90. if lenRunesA == 0: return lenRunesB
  91. if lenRunesB == 0: return lenRunesA
  92. # another special case:
  93. if lenRunesA == 1:
  94. a.fastRuneAt(iStart, runeA, doInc = false)
  95. var iCurrentB = iStart
  96. while iCurrentB < iEndB:
  97. b.fastRuneAt(iCurrentB, runeB, doInc = true)
  98. if runeA == runeB: return lenRunesB - 1
  99. return lenRunesB
  100. # common case:
  101. var
  102. len1 = lenRunesA + 1
  103. len2 = lenRunesB + 1
  104. row: seq[int]
  105. let half = lenRunesA div 2
  106. newSeq(row, len2)
  107. var e = iStart + len2 - 1 # end marker
  108. # initialize first row:
  109. for i in 1 .. (len2 - half - 1): row[i] = i
  110. row[0] = len1 - half - 1
  111. iCurrentA = iStart
  112. var
  113. char2pI = -1
  114. char2pPrev: int
  115. for i in 1 .. (len1 - 1):
  116. iNextA = iCurrentA
  117. a.fastRuneAt(iNextA, runeA)
  118. var
  119. char2p: int
  120. diff, x: int
  121. p: int
  122. if i >= (len1 - half):
  123. # skip the upper triangle:
  124. let offset = i + half - len1
  125. if char2pI == i:
  126. b.fastRuneAt(char2pPrev, runeB)
  127. char2p = char2pPrev
  128. char2pI = i + 1
  129. else:
  130. char2p = iStart
  131. for j in 0 ..< offset:
  132. runeB = b.runeAt(char2p)
  133. inc(char2p, runeB.size)
  134. char2pI = i + 1
  135. char2pPrev = char2p
  136. p = offset
  137. runeB = b.runeAt(char2p)
  138. var c3 = row[p] + (if runeA != runeB: 1 else: 0)
  139. inc(char2p, runeB.size)
  140. inc(p)
  141. x = row[p] + 1
  142. diff = x
  143. if x > c3: x = c3
  144. row[p] = x
  145. inc(p)
  146. else:
  147. p = 1
  148. char2p = iStart
  149. diff = i
  150. x = i
  151. if i <= (half + 1):
  152. # skip the lower triangle:
  153. e = len2 + i - half - 2
  154. # main:
  155. while p <= e:
  156. dec(diff)
  157. runeB = b.runeAt(char2p)
  158. var c3 = diff + (if runeA != runeB: 1 else: 0)
  159. inc(char2p, runeB.size)
  160. inc(x)
  161. if x > c3: x = c3
  162. diff = row[p] + 1
  163. if x > diff: x = diff
  164. row[p] = x
  165. inc(p)
  166. # lower triangle sentinel:
  167. if i <= half:
  168. dec(diff)
  169. runeB = b.runeAt(char2p)
  170. var c3 = diff + (if runeA != runeB: 1 else: 0)
  171. inc(x)
  172. if x > c3: x = c3
  173. row[p] = x
  174. iCurrentA = iNextA
  175. result = row[e]
  176. proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
  177. ## Returns the edit distance between `a` and `b`.
  178. ##
  179. ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  180. ## memory overhead.
  181. runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1
  182. var len1 = a.len
  183. var len2 = b.len
  184. if len1 > len2:
  185. # make `b` the longer string
  186. return editDistanceAscii(b, a)
  187. # strip common prefix:
  188. var s = 0
  189. while s < len1 and a[s] == b[s]:
  190. inc(s)
  191. dec(len1)
  192. dec(len2)
  193. # strip common suffix:
  194. while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
  195. dec(len1)
  196. dec(len2)
  197. # trivial cases:
  198. if len1 == 0: return len2
  199. if len2 == 0: return len1
  200. # another special case:
  201. if len1 == 1:
  202. for j in s..s+len2-1:
  203. if a[s] == b[j]: return len2 - 1
  204. return len2
  205. inc(len1)
  206. inc(len2)
  207. var half = len1 shr 1
  208. # initialize first row:
  209. #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
  210. var row: seq[int]
  211. newSeq(row, len2)
  212. var e = s + len2 - 1 # end marker
  213. for i in 1..len2 - half - 1: row[i] = i
  214. row[0] = len1 - half - 1
  215. for i in 1 .. len1 - 1:
  216. var char1 = a[i + s - 1]
  217. var char2p: int
  218. var diff, x: int
  219. var p: int
  220. if i >= len1 - half:
  221. # skip the upper triangle:
  222. var offset = i - len1 + half
  223. char2p = offset
  224. p = offset
  225. var c3 = row[p] + ord(char1 != b[s + char2p])
  226. inc(p)
  227. inc(char2p)
  228. x = row[p] + 1
  229. diff = x
  230. if x > c3: x = c3
  231. row[p] = x
  232. inc(p)
  233. else:
  234. p = 1
  235. char2p = 0
  236. diff = i
  237. x = i
  238. if i <= half + 1:
  239. # skip the lower triangle:
  240. e = len2 + i - half - 2
  241. # main:
  242. while p <= e:
  243. dec(diff)
  244. var c3 = diff + ord(char1 != b[char2p + s])
  245. inc(char2p)
  246. inc(x)
  247. if x > c3: x = c3
  248. diff = row[p] + 1
  249. if x > diff: x = diff
  250. row[p] = x
  251. inc(p)
  252. # lower triangle sentinel:
  253. if i <= half:
  254. dec(diff)
  255. var c3 = diff + ord(char1 != b[char2p + s])
  256. inc(x)
  257. if x > c3: x = c3
  258. row[p] = x
  259. result = row[e]
  260. when isMainModule:
  261. doAssert editDistance("", "") == 0
  262. doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
  263. doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
  264. doAssert editDistance("привет", "превет") == 1
  265. doAssert editDistance("Åge", "Age") == 1
  266. # editDistance, one string is longer in bytes, but shorter in rune length
  267. # first string: 4 bytes, second: 6 bytes, but only 3 runes
  268. doAssert editDistance("aaaa", "×××") == 4
  269. block veryLongStringEditDistanceTest:
  270. const cap = 256
  271. var
  272. s1 = newStringOfCap(cap)
  273. s2 = newStringOfCap(cap)
  274. while len(s1) < cap:
  275. s1.add 'a'
  276. while len(s2) < cap:
  277. s2.add 'b'
  278. doAssert editDistance(s1, s2) == cap
  279. block combiningCodePointsEditDistanceTest:
  280. const s = "A\xCC\x8Age"
  281. doAssert editDistance(s, "Age") == 1
  282. doAssert editDistanceAscii("", "") == 0
  283. doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
  284. doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia
  285. assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffix") == 0)
  286. assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffi1") == 1)
  287. assert(editDistance("prefix__hallo_suffix", "prefix__HALLO_suffix") == 5)
  288. assert(editDistance("prefix__hallo_suffix", "prefix__ha_suffix") == 3)
  289. assert(editDistance("prefix__hallo_suffix", "prefix") == 14)
  290. assert(editDistance("prefix__hallo_suffix", "suffix") == 14)
  291. assert(editDistance("prefix__hallo_suffix", "prefix__hao_suffix") == 2)
  292. assert(editDistance("main", "malign") == 2)