123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308 |
- #
- #
- # Nim's Runtime Library
- # (c) Copyright 2018 Nim contributors
- #
- # See the file "copying.txt", included in this
- # distribution, for details about the copyright.
- #
- ## This module implements an algorithm to compute the
- ## `edit distance`:idx: between two Unicode strings.
- import unicode
- proc editDistance*(a, b: string): int {.noSideEffect.} =
- ## Returns the **unicode-rune** edit distance between ``a`` and ``b``.
- ##
- ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
- ## memory overhead.
- runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1
- if len(a) > len(b):
- # make ``b`` the longer string
- return editDistance(b, a)
- # strip common prefix
- var
- iStart = 0 ## The character starting index of the first rune in both strings ``a`` and ``b``
- iNextA = 0
- iNextB = 0
- runeA, runeB: Rune
- lenRunesA = 0 ## The number of relevant runes in string ``a``.
- lenRunesB = 0 ## The number of relevant runes in string ``b``.
- block commonPrefix:
- # ``a`` is the shorter string
- while iStart < len(a):
- iNextA = iStart
- a.fastRuneAt(iNextA, runeA, doInc = true)
- iNextB = iStart
- b.fastRuneAt(iNextB, runeB, doInc = true)
- if runeA != runeB:
- inc(lenRunesA)
- inc(lenRunesB)
- break
- iStart = iNextA
- var
- # we know that we are either at the start of the strings
- # or that the current value of runeA is not equal to runeB
- # => start search for common suffix after the current rune (``i_next_*``)
- iEndA = iNextA ## The exclusive upper index bound of string ``a``.
- iEndB = iNextB ## The exclusive upper index bound of string ``b``.
- iCurrentA = iNextA
- iCurrentB = iNextB
- block commonSuffix:
- var
- addRunesA = 0
- addRunesB = 0
- while iCurrentA < len(a) and iCurrentB < len(b):
- iNextA = iCurrentA
- a.fastRuneAt(iNextA, runeA)
- iNextB = iCurrentB
- b.fastRuneAt(iNextB, runeB)
- inc(addRunesA)
- inc(addRunesB)
- if runeA != runeB:
- iEndA = iNextA
- iEndB = iNextB
- inc(lenRunesA, addRunesA)
- inc(lenRunesB, addRunesB)
- addRunesA = 0
- addRunesB = 0
- iCurrentA = iNextA
- iCurrentB = iNextB
- if iCurrentA >= len(a): # ``a`` exhausted
- if iCurrentB < len(b): # ``b`` not exhausted
- iEndA = iCurrentA
- iEndB = iCurrentB
- inc(lenRunesA, addRunesA)
- inc(lenRunesB, addRunesB)
- while true:
- b.fastRuneAt(iEndB, runeB)
- inc(lenRunesB)
- if iEndB >= len(b): break
- elif iCurrentB >= len(b): # ``b`` exhausted and ``a`` not exhausted
- iEndA = iCurrentA
- iEndB = iCurrentB
- inc(lenRunesA, addRunesA)
- inc(lenRunesB, addRunesB)
- while true:
- a.fastRuneAt(iEndA, runeA)
- inc(lenRunesA)
- if iEndA >= len(a): break
- block specialCases:
- # trivial cases:
- if lenRunesA == 0: return lenRunesB
- if lenRunesB == 0: return lenRunesA
- # another special case:
- if lenRunesA == 1:
- a.fastRuneAt(iStart, runeA, doInc = false)
- var iCurrentB = iStart
- while iCurrentB < iEndB:
- b.fastRuneAt(iCurrentB, runeB, doInc = true)
- if runeA == runeB: return lenRunesB - 1
- return lenRunesB
- # common case:
- var
- len1 = lenRunesA + 1
- len2 = lenRunesB + 1
- row: seq[int]
- let half = lenRunesA div 2
- newSeq(row, len2)
- var e = iStart + len2 - 1 # end marker
- # initialize first row:
- for i in 1 .. (len2 - half - 1): row[i] = i
- row[0] = len1 - half - 1
- iCurrentA = iStart
- var
- char2pI = -1
- char2pPrev: int
- for i in 1 .. (len1 - 1):
- iNextA = iCurrentA
- a.fastRuneAt(iNextA, runeA)
- var
- char2p: int
- diff, x: int
- p: int
- if i >= (len1 - half):
- # skip the upper triangle:
- let offset = i + half - len1
- if char2pI == i:
- b.fastRuneAt(char2pPrev, runeB)
- char2p = char2pPrev
- char2pI = i + 1
- else:
- char2p = iStart
- for j in 0 ..< offset:
- runeB = b.runeAt(char2p)
- inc(char2p, runeB.size)
- char2pI = i + 1
- char2pPrev = char2p
- p = offset
- runeB = b.runeAt(char2p)
- var c3 = row[p] + (if runeA != runeB: 1 else: 0)
- inc(char2p, runeB.size)
- inc(p)
- x = row[p] + 1
- diff = x
- if x > c3: x = c3
- row[p] = x
- inc(p)
- else:
- p = 1
- char2p = iStart
- diff = i
- x = i
- if i <= (half + 1):
- # skip the lower triangle:
- e = len2 + i - half - 2
- # main:
- while p <= e:
- dec(diff)
- runeB = b.runeAt(char2p)
- var c3 = diff + (if runeA != runeB: 1 else: 0)
- inc(char2p, runeB.size)
- inc(x)
- if x > c3: x = c3
- diff = row[p] + 1
- if x > diff: x = diff
- row[p] = x
- inc(p)
- # lower triangle sentinel:
- if i <= half:
- dec(diff)
- runeB = b.runeAt(char2p)
- var c3 = diff + (if runeA != runeB: 1 else: 0)
- inc(x)
- if x > c3: x = c3
- row[p] = x
- iCurrentA = iNextA
- result = row[e]
- proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
- ## Returns the edit distance between `a` and `b`.
- ##
- ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
- ## memory overhead.
- runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1
- var len1 = a.len
- var len2 = b.len
- if len1 > len2:
- # make `b` the longer string
- return editDistanceAscii(b, a)
- # strip common prefix:
- var s = 0
- while s < len1 and a[s] == b[s]:
- inc(s)
- dec(len1)
- dec(len2)
- # strip common suffix:
- while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
- dec(len1)
- dec(len2)
- # trivial cases:
- if len1 == 0: return len2
- if len2 == 0: return len1
- # another special case:
- if len1 == 1:
- for j in s..s+len2-1:
- if a[s] == b[j]: return len2 - 1
- return len2
- inc(len1)
- inc(len2)
- var half = len1 shr 1
- # initialize first row:
- #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
- var row: seq[int]
- newSeq(row, len2)
- var e = s + len2 - 1 # end marker
- for i in 1..len2 - half - 1: row[i] = i
- row[0] = len1 - half - 1
- for i in 1 .. len1 - 1:
- var char1 = a[i + s - 1]
- var char2p: int
- var diff, x: int
- var p: int
- if i >= len1 - half:
- # skip the upper triangle:
- var offset = i - len1 + half
- char2p = offset
- p = offset
- var c3 = row[p] + ord(char1 != b[s + char2p])
- inc(p)
- inc(char2p)
- x = row[p] + 1
- diff = x
- if x > c3: x = c3
- row[p] = x
- inc(p)
- else:
- p = 1
- char2p = 0
- diff = i
- x = i
- if i <= half + 1:
- # skip the lower triangle:
- e = len2 + i - half - 2
- # main:
- while p <= e:
- dec(diff)
- var c3 = diff + ord(char1 != b[char2p + s])
- inc(char2p)
- inc(x)
- if x > c3: x = c3
- diff = row[p] + 1
- if x > diff: x = diff
- row[p] = x
- inc(p)
- # lower triangle sentinel:
- if i <= half:
- dec(diff)
- var c3 = diff + ord(char1 != b[char2p + s])
- inc(x)
- if x > c3: x = c3
- row[p] = x
- result = row[e]
- when isMainModule:
- doAssert editDistance("", "") == 0
- doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
- doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
- doAssert editDistance("привет", "превет") == 1
- doAssert editDistance("Åge", "Age") == 1
- # editDistance, one string is longer in bytes, but shorter in rune length
- # first string: 4 bytes, second: 6 bytes, but only 3 runes
- doAssert editDistance("aaaa", "×××") == 4
- block veryLongStringEditDistanceTest:
- const cap = 256
- var
- s1 = newStringOfCap(cap)
- s2 = newStringOfCap(cap)
- while len(s1) < cap:
- s1.add 'a'
- while len(s2) < cap:
- s2.add 'b'
- doAssert editDistance(s1, s2) == cap
- block combiningCodePointsEditDistanceTest:
- const s = "A\xCC\x8Age"
- doAssert editDistance(s, "Age") == 1
- doAssert editDistanceAscii("", "") == 0
- doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
- doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia
- assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffix") == 0)
- assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffi1") == 1)
- assert(editDistance("prefix__hallo_suffix", "prefix__HALLO_suffix") == 5)
- assert(editDistance("prefix__hallo_suffix", "prefix__ha_suffix") == 3)
- assert(editDistance("prefix__hallo_suffix", "prefix") == 14)
- assert(editDistance("prefix__hallo_suffix", "suffix") == 14)
- assert(editDistance("prefix__hallo_suffix", "prefix__hao_suffix") == 2)
- assert(editDistance("main", "malign") == 2)
|