editdistance.nim 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements an algorithm to compute the
  10. ## `edit distance`:idx: between two Unicode strings.
  11. import unicode
  12. proc editDistance*(a, b: string): int {.noSideEffect.} =
  13. ## Returns the unicode-rune edit distance between ``a`` and ``b``.
  14. ##
  15. ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  16. ## memory overhead.
  17. if len(a) > len(b):
  18. # make ``b`` the longer string
  19. return editDistance(b, a)
  20. # strip common prefix
  21. var
  22. i_start = 0 ## The character starting index of the first rune in both strings ``a`` and ``b``
  23. i_next_a = 0
  24. i_next_b = 0
  25. rune_a, rune_b: Rune
  26. len_runes_a = 0 ## The number of relevant runes in string ``a``.
  27. len_runes_b = 0 ## The number of relevant runes in string ``b``.
  28. block commonPrefix:
  29. # ``a`` is the shorter string
  30. while i_start < len(a):
  31. i_next_a = i_start
  32. a.fastRuneAt(i_next_a, rune_a, doInc = true)
  33. i_next_b = i_start
  34. b.fastRuneAt(i_next_b, rune_b, doInc = true)
  35. if rune_a != rune_b:
  36. inc(len_runes_a)
  37. inc(len_runes_b)
  38. break
  39. i_start = i_next_a
  40. var
  41. # we know that we are either at the start of the strings
  42. # or that the current value of rune_a is not equal to rune_b
  43. # => start search for common suffix after the current rune (``i_next_*``)
  44. i_end_a = i_next_a ## The exclusive upper index bound of string ``a``.
  45. i_end_b = i_next_b ## The exclusive upper index bound of string ``b``.
  46. i_current_a = i_next_a
  47. i_current_b = i_next_b
  48. block commonSuffix:
  49. var
  50. add_runes_a = 0
  51. add_runes_b = 0
  52. while i_current_a < len(a) and i_current_b < len(b):
  53. i_next_a = i_current_a
  54. a.fastRuneAt(i_next_a, rune_a)
  55. i_next_b = i_current_b
  56. b.fastRuneAt(i_next_b, rune_b)
  57. inc(add_runes_a)
  58. inc(add_runes_b)
  59. if rune_a != rune_b:
  60. i_end_a = i_next_a
  61. i_end_b = i_next_b
  62. inc(len_runes_a, add_runes_a)
  63. inc(len_runes_b, add_runes_b)
  64. add_runes_a = 0
  65. add_runes_b = 0
  66. i_current_a = i_next_a
  67. i_current_b = i_next_b
  68. if i_current_a >= len(a): # ``a`` exhausted
  69. if i_current_b < len(b): # ``b`` not exhausted
  70. i_end_a = i_current_a
  71. i_end_b = i_current_b
  72. inc(len_runes_a, add_runes_a)
  73. inc(len_runes_b, add_runes_b)
  74. while true:
  75. b.fastRuneAt(i_end_b, rune_b)
  76. inc(len_runes_b)
  77. if i_end_b >= len(b): break
  78. elif i_current_b >= len(b): # ``b`` exhausted and ``a`` not exhausted
  79. i_end_a = i_current_a
  80. i_end_b = i_current_b
  81. inc(len_runes_a, add_runes_a)
  82. inc(len_runes_b, add_runes_b)
  83. while true:
  84. a.fastRuneAt(i_end_a, rune_a)
  85. inc(len_runes_a)
  86. if i_end_a >= len(a): break
  87. block specialCases:
  88. # trivial cases:
  89. if len_runes_a == 0: return len_runes_b
  90. if len_runes_b == 0: return len_runes_a
  91. # another special case:
  92. if len_runes_a == 1:
  93. a.fastRuneAt(i_start, rune_a, doInc = false)
  94. var i_current_b = i_start
  95. while i_current_b < i_end_b:
  96. b.fastRuneAt(i_current_b, rune_b, doInc = true)
  97. if rune_a == rune_b: return len_runes_b - 1
  98. return len_runes_b
  99. # common case:
  100. var
  101. len1 = len_runes_a + 1
  102. len2 = len_runes_b + 1
  103. row: seq[int]
  104. let half = len_runes_a div 2
  105. newSeq(row, len2)
  106. var e = i_start + len2 - 1 # end marker
  107. # initialize first row:
  108. for i in 1 .. (len2 - half - 1): row[i] = i
  109. row[0] = len1 - half - 1
  110. i_current_a = i_start
  111. var
  112. char2p_i = -1
  113. char2p_prev: int
  114. for i in 1 .. (len1 - 1):
  115. i_next_a = i_current_a
  116. a.fastRuneAt(i_next_a, rune_a)
  117. var
  118. char2p: int
  119. D, x: int
  120. p: int
  121. if i >= (len1 - half):
  122. # skip the upper triangle:
  123. let offset = i + half - len1
  124. if char2p_i == i:
  125. b.fastRuneAt(char2p_prev, rune_b)
  126. char2p = char2p_prev
  127. char2p_i = i + 1
  128. else:
  129. char2p = i_start
  130. for j in 0 ..< offset:
  131. rune_b = b.runeAt(char2p)
  132. inc(char2p, rune_b.size)
  133. char2p_i = i + 1
  134. char2p_prev = char2p
  135. p = offset
  136. rune_b = b.runeAt(char2p)
  137. var c3 = row[p] + (if rune_a != rune_b: 1 else: 0)
  138. inc(char2p, rune_b.size)
  139. inc(p)
  140. x = row[p] + 1
  141. D = x
  142. if x > c3: x = c3
  143. row[p] = x
  144. inc(p)
  145. else:
  146. p = 1
  147. char2p = i_start
  148. D = i
  149. x = i
  150. if i <= (half + 1):
  151. # skip the lower triangle:
  152. e = len2 + i - half - 2
  153. # main:
  154. while p <= e:
  155. dec(D)
  156. rune_b = b.runeAt(char2p)
  157. var c3 = D + (if rune_a != rune_b: 1 else: 0)
  158. inc(char2p, rune_b.size)
  159. inc(x)
  160. if x > c3: x = c3
  161. D = row[p] + 1
  162. if x > D: x = D
  163. row[p] = x
  164. inc(p)
  165. # lower triangle sentinel:
  166. if i <= half:
  167. dec(D)
  168. rune_b = b.runeAt(char2p)
  169. var c3 = D + (if rune_a != rune_b: 1 else: 0)
  170. inc(x)
  171. if x > c3: x = c3
  172. row[p] = x
  173. i_current_a = i_next_a
  174. result = row[e]
  175. proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
  176. ## Returns the edit distance between `a` and `b`.
  177. ##
  178. ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  179. ## memory overhead.
  180. var len1 = a.len
  181. var len2 = b.len
  182. if len1 > len2:
  183. # make `b` the longer string
  184. return editDistanceAscii(b, a)
  185. # strip common prefix:
  186. var s = 0
  187. while s < len1 and a[s] == b[s]:
  188. inc(s)
  189. dec(len1)
  190. dec(len2)
  191. # strip common suffix:
  192. while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
  193. dec(len1)
  194. dec(len2)
  195. # trivial cases:
  196. if len1 == 0: return len2
  197. if len2 == 0: return len1
  198. # another special case:
  199. if len1 == 1:
  200. for j in s..s+len2-1:
  201. if a[s] == b[j]: return len2 - 1
  202. return len2
  203. inc(len1)
  204. inc(len2)
  205. var half = len1 shr 1
  206. # initalize first row:
  207. #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
  208. var row: seq[int]
  209. newSeq(row, len2)
  210. var e = s + len2 - 1 # end marker
  211. for i in 1..len2 - half - 1: row[i] = i
  212. row[0] = len1 - half - 1
  213. for i in 1 .. len1 - 1:
  214. var char1 = a[i + s - 1]
  215. var char2p: int
  216. var D, x: int
  217. var p: int
  218. if i >= len1 - half:
  219. # skip the upper triangle:
  220. var offset = i - len1 + half
  221. char2p = offset
  222. p = offset
  223. var c3 = row[p] + ord(char1 != b[s + char2p])
  224. inc(p)
  225. inc(char2p)
  226. x = row[p] + 1
  227. D = x
  228. if x > c3: x = c3
  229. row[p] = x
  230. inc(p)
  231. else:
  232. p = 1
  233. char2p = 0
  234. D = i
  235. x = i
  236. if i <= half + 1:
  237. # skip the lower triangle:
  238. e = len2 + i - half - 2
  239. # main:
  240. while p <= e:
  241. dec(D)
  242. var c3 = D + ord(char1 != b[char2p + s])
  243. inc(char2p)
  244. inc(x)
  245. if x > c3: x = c3
  246. D = row[p] + 1
  247. if x > D: x = D
  248. row[p] = x
  249. inc(p)
  250. # lower triangle sentinel:
  251. if i <= half:
  252. dec(D)
  253. var c3 = D + ord(char1 != b[char2p + s])
  254. inc(x)
  255. if x > c3: x = c3
  256. row[p] = x
  257. result = row[e]
  258. when isMainModule:
  259. doAssert editDistance("", "") == 0
  260. doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
  261. doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
  262. doAssert editDistance("привет", "превет") == 1
  263. doAssert editDistance("Åge", "Age") == 1
  264. # editDistance, one string is longer in bytes, but shorter in rune length
  265. # first string: 4 bytes, second: 6 bytes, but only 3 runes
  266. doAssert editDistance("aaaa", "×××") == 4
  267. block veryLongStringEditDistanceTest:
  268. const cap = 256
  269. var
  270. s1 = newStringOfCap(cap)
  271. s2 = newStringOfCap(cap)
  272. while len(s1) < cap:
  273. s1.add 'a'
  274. while len(s2) < cap:
  275. s2.add 'b'
  276. doAssert editDistance(s1, s2) == cap
  277. block combiningCodePointsEditDistanceTest:
  278. const s = "A\xCC\x8Age"
  279. doAssert editDistance(s, "Age") == 1
  280. doAssert editDistanceAscii("", "") == 0
  281. doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
  282. doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia