burrows-wheeler_transform_linear.sf 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 15 June 2023
  5. # https://github.com/trizen
  6. # A practical implementation of the Burrows–Wheeler_transform, running in O(n*log(n)) time, using O(n) space.
  7. # See also:
  8. # https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
  9. # https://rosettacode.org/wiki/Burrows%E2%80%93Wheeler_transform
  10. # Reference:
  11. # Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
  12. # https://youtube.com/watch?v=rQ7wwh4HRZM
  13. define LOOKAHEAD_LEN = 128
  14. func bwt_sort (String s) { # O(n * LOOKAHEAD_LEN) space (fast)
  15. ^s.len -> map {|i|
  16. var t = s.slice(i, LOOKAHEAD_LEN)
  17. if (t.len < LOOKAHEAD_LEN) {
  18. t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
  19. }
  20. [t, i]
  21. }.sort {|a,b|
  22. (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
  23. }.map { .[1] }
  24. }
  25. func bwt_encode(String s) {
  26. var bwt = bwt_sort(s)
  27. var ret = bwt.map {|i| s.slice(i-1, 1) }.join
  28. var idx = bwt.first_index_by { .is_zero }
  29. return (ret, idx)
  30. }
  31. func bwt_decode(String bwt, Number idx) { # fast inversion
  32. var tail = bwt.chars
  33. var head = tail.sort
  34. var indices = Hash()
  35. tail.each_kv {|i,v|
  36. indices{v} := [] << i
  37. }
  38. var table = []
  39. head.each_kv {|i,v|
  40. table[i] = indices{v}.shift
  41. }
  42. var dec = ''
  43. var i = idx
  44. head.len.times {
  45. dec += head[i]
  46. i = table[i]
  47. }
  48. return dec
  49. }
  50. var tests = [
  51. "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
  52. "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
  53. ]
  54. tests.each { |str|
  55. var (enc, idx) = bwt_encode(str)
  56. var dec = bwt_decode(enc, idx)
  57. say "BWT(#{dec.dump}) = (#{enc.dump}, #{idx})"
  58. assert_eq(str, dec)
  59. }