burrows-wheeler_transform_fast.sf 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 15 June 2023
  5. # https://github.com/trizen
  6. # Implementation of the Burrows–Wheeler transform, with fast inversion.
  7. # https://rosettacode.org/wiki/Burrows–Wheeler_transform
  8. # Reference:
  9. # Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
  10. # https://youtube.com/watch?v=rQ7wwh4HRZM
  11. define LOOKAHEAD_LEN = 128 # lower values are faster (on average)
  12. func bwt_quadratic (s) { # O(n^2) space (impractical)
  13. ^s.len -> map {|i| [s.rotate(i), i] }.sort_by { .[0] }.map { .[1] }
  14. }
  15. func bwt_simple (s) { # O(n) space (very slow)
  16. @(^s.len) -> sort {|a,b|
  17. s.rotate(a) <=> s.rotate(b)
  18. }
  19. }
  20. func bwt_cyclic (s) { # O(n) space (very slow)
  21. var cyclic = s.chars
  22. var len = cyclic.len
  23. gather { # check if all the symbols are the same
  24. take(true)
  25. cyclic.each_cons(2, {|a,b|
  26. if (a != b) {
  27. take(false)
  28. break
  29. }
  30. })
  31. } == [true] && return @(^len)
  32. @(^s.len) -> sort {|i,j|
  33. while (cyclic[i] == cyclic[j]) {
  34. i %= len if (++i >= len)
  35. j %= len if (++j >= len)
  36. }
  37. cyclic[i] <=> cyclic[j]
  38. }
  39. }
  40. func bwt_lookahead (s) { # O(n) space (moderately fast)
  41. @(^s.len) -> sort {|a,b|
  42. var t = s.slice(a, LOOKAHEAD_LEN)
  43. var u = s.slice(b, LOOKAHEAD_LEN)
  44. if (t.len < LOOKAHEAD_LEN) {
  45. t += s.slice(0, min(a, LOOKAHEAD_LEN - t.len))
  46. }
  47. if (u.len < LOOKAHEAD_LEN) {
  48. u += s.slice(0, min(b, LOOKAHEAD_LEN - u.len))
  49. }
  50. (t <=> u) || (s.rotate(a) <=> s.rotate(b))
  51. }
  52. }
  53. func bwt_balanced (s) { # O(n * LOOKAHEAD_LEN) space (fast)
  54. ^s.len -> map {|i|
  55. var t = s.slice(i, LOOKAHEAD_LEN)
  56. if (t.len < LOOKAHEAD_LEN) {
  57. t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
  58. }
  59. [t, i]
  60. }.sort {|a,b|
  61. (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
  62. }.map { .[1] }
  63. }
  64. func bwt_encode_quadratic(String s) { # O(n^2) space
  65. #var bwt = s.len.of{|i| s.slice(i) + s.slice(0, i) }.sort
  66. var bwt = s.len.of{|i| s.rotate(i) }.sort
  67. var ret = bwt.map { .last }.join
  68. var idx = bwt.index(s)
  69. return (ret, idx)
  70. }
  71. func bwt_encode(String s) {
  72. #var bwt = bwt_simple(s)
  73. #var bwt = bwt_quadratic(s)
  74. #var bwt = bwt_cyclic(s)
  75. #var bwt = bwt_lookahead(s)
  76. var bwt = bwt_balanced(s)
  77. var ret = bwt.map {|i| s.slice(i-1, 1) }.join
  78. var idx = bwt.first_index_by { .is_zero }
  79. return (ret, idx)
  80. }
  81. func bwt_decode(String bwt, Number idx) { # fast inversion
  82. var tail = bwt.chars
  83. var head = tail.sort
  84. var indices = []
  85. tail.each_kv {|i,v|
  86. indices[v.ord] := [] << i
  87. }
  88. var table = []
  89. head.each_kv {|i,b|
  90. table[i] = indices[b.ord].shift
  91. }
  92. var dec = ''
  93. var i = idx
  94. head.len.times {
  95. dec += head[i]
  96. i = table[i]
  97. }
  98. return dec
  99. }
  100. var tests = [
  101. "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
  102. "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "PINEAPPLE",
  103. "","a","aa","aabb","aaaaaaaaaaaa","aaaaaaaaaaaab",
  104. "baaaaaaaaaaaa","aaaaaabaaaaaa","aaaaaaabaaaaa",
  105. File(__FILE__).read(:raw),
  106. File($^PERL).read(:raw),
  107. #File($^SIDEF).read(:raw),
  108. ]
  109. tests.each { |str|
  110. var (enc, idx) = bwt_encode(str)
  111. var dec = bwt_decode(enc, idx)
  112. if (enc.len < 1024) {
  113. say "BWT(#{dec.dump}) = (#{enc.dump}, #{idx})"
  114. }
  115. assert_eq(str, dec)
  116. }
  117. __END__
  118. BWT("banana") = ("nnbaaa", 3)
  119. BWT("appellee") = ("eelplepa", 0)
  120. BWT("dogwood") = ("odoodwg", 1)
  121. BWT("TOBEORNOTTOBEORTOBEORNOT") = ("OOOBBBRRTTTEEENNOOORTTOO", 20)
  122. BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = ("TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT", 29)
  123. BWT("PINEAPPLE") = ("ENLPPIEPA", 6)