123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 14 June 2023
- # Edit: 15 June 2023
- # https://github.com/trizen
- # A practical implementation of the Burrows–Wheeler_transform, running in O(n*log(n)) time, using O(n) space.
- # See also:
- # https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
- # https://rosettacode.org/wiki/Burrows%E2%80%93Wheeler_transform
- # Reference:
- # Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
- # https://youtube.com/watch?v=rQ7wwh4HRZM
- define LOOKAHEAD_LEN = 128
- func bwt_sort (String s) { # O(n * LOOKAHEAD_LEN) space (fast)
- ^s.len -> map {|i|
- var t = s.slice(i, LOOKAHEAD_LEN)
- if (t.len < LOOKAHEAD_LEN) {
- t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
- }
- [t, i]
- }.sort {|a,b|
- (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
- }.map { .[1] }
- }
- func bwt_encode(String s) {
- var bwt = bwt_sort(s)
- var ret = bwt.map {|i| s.slice(i-1, 1) }.join
- var idx = bwt.first_index_by { .is_zero }
- return (ret, idx)
- }
- func bwt_decode(String bwt, Number idx) { # fast inversion
- var tail = bwt.chars
- var head = tail.sort
- var indices = Hash()
- tail.each_kv {|i,v|
- indices{v} := [] << i
- }
- var table = []
- head.each_kv {|i,v|
- table[i] = indices{v}.shift
- }
- var dec = ''
- var i = idx
- head.len.times {
- dec += head[i]
- i = table[i]
- }
- return dec
- }
- var tests = [
- "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
- "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
- ]
- tests.each { |str|
- var (enc, idx) = bwt_encode(str)
- var dec = bwt_decode(enc, idx)
- say "BWT(#{dec.dump}) = (#{enc.dump}, #{idx})"
- assert_eq(str, dec)
- }
|