123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 20 June 2023
- # https://github.com/trizen
- # Compress/decompress files using Burrows-Wheeler Transform (BWT) + Move-to-Front Transform + Run-length encoding + Huffman coding.
- # Reference:
- # Data Compression (Summer 2023) - Lecture 13 - BZip2
- # https://youtube.com/watch?v=cvoZbBZ3M2A
- define(
- CHUNK_SIZE = (1 << 17),
- SIGNATURE = ('BWT' + 2.chr),
- LOOKAHEAD_LEN = 128,
- )
- func read_bit (FileHandle fh, Ref bitstring) {
- if ((*bitstring \\ '').is_empty) {
- *bitstring = unpack('b*', fh.getc \\ die "error")
- }
- var bit = (*bitstring).substr(-1)
- *bitstring = (*bitstring).substr(0, -1)
- return bit
- }
- func read_bits (FileHandle fh, Number bits_len) {
- var str = ''
- fh.read(\str, bits_len>>3)
- str = unpack('B*', str)
- while (str.len < bits_len) {
- str += unpack('B*', fh.getc \\ die "error")
- }
- if (str.len > bits_len) {
- str.substr!(0, bits_len)
- }
- return str
- }
- func mtf_encode (Array bytes, Array alphabet = @(^256)) {
- var C = []
- for c in bytes {
- C << alphabet.first_index(c)
- alphabet.prepend(alphabet.delete_index(C.tail))
- }
- return C
- }
- func mtf_decode (Array encoded, Array alphabet = @(^256)) {
- var S = []
- for p in encoded {
- S << alphabet[p]
- alphabet.prepend(alphabet.delete_index(p))
- }
- return S
- }
- func bwt_sort (String s) { # O(n * LOOKAHEAD_LEN) space (fast)
- ^s.len -> map {|i|
- var t = s.slice(i, LOOKAHEAD_LEN)
- if (t.len < LOOKAHEAD_LEN) {
- t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
- }
- [t, i]
- }.sort {|a,b|
- (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
- }.map { .[1] }
- }
- func bwt_encode(String s) {
- var bwt = bwt_sort(s)
- var ret = bwt.map {|i| s.slice(i-1, 1) }.join
- var idx = bwt.first_index_by { .is_zero }
- return (ret, idx)
- }
- func bwt_decode(String bwt, Number idx) { # fast inversion
- var tail = bwt.chars
- var head = tail.sort
- var indices = Hash()
- tail.each_kv {|i,v|
- indices{v} := [] << i
- }
- var table = []
- head.each_kv {|i,v|
- table[i] = indices{v}.shift
- }
- var dec = ''
- var i = idx
- head.len.times {
- dec += head[i]
- i = table[i]
- }
- return dec
- }
- func delta_encode (Array integers, Bool double = false) {
- var deltas = [0, integers.len, integers...].diffs
- var bitstring = FileHandle.new_buf(:raw)
- for d in (deltas) {
- if (d == 0) {
- bitstring << '0'
- }
- elsif (double) {
- var t = d.abs.inc.as_bin
- var l = t.len.as_bin
- bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (l.len-1)), '0', l.substr(1), t.substr(1))
- }
- else {
- var t = d.abs.as_bin
- bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (t.len-1)), '0', t.substr(1))
- }
- }
- pack('B*', bitstring.parent)
- }
- func delta_decode (FileHandle fh, Bool double = false) {
- var deltas = []
- var buffer = ''
- var len = 0
- for (var k = 0 ; k <= len ; ++k) {
- var bit = read_bit(fh, \buffer)
- if (bit == '0') {
- deltas << 0
- }
- elsif (double) {
- var bit = read_bit(fh, \buffer)
- var bl = (^Inf -> first { read_bit(fh, \buffer) != '1' })
- var bl2 = Num('1' + bl.of { read_bit(fh, \buffer) }.join, 2)
- var int = Num('1' + (bl2-1).of { read_bit(fh, \buffer) }.join, 2)-1
- deltas << (bit == '1' ? int : -int)
- }
- else {
- var bit = read_bit(fh, \buffer)
- var n = (^Inf -> first { read_bit(fh, \buffer) != '1' })
- var d = Num('1' + n.of { read_bit(fh, \buffer) }.join, 2)
- deltas << (bit == '1' ? d : -d)
- }
- if (k == 0) {
- len = deltas.pop
- }
- }
- var acc = [len, deltas...].acc
- acc.shift
- return acc
- }
- func walk(Hash n, String s, Hash h) {
- if (n.has(:a)) {
- h{n{:a}} = s
- return nil
- }
- walk(n{:0}, s+'0', h) if n.has(:0)
- walk(n{:1}, s+'1', h) if n.has(:1)
- }
- func mktree_from_freq(Hash freqs) {
- var nodes = freqs.sort_by {|k| k.to_i }.map_2d { |b,f|
- Hash(a => b, freq => f)
- }
- loop {
- nodes.sort_by!{|h| h{:freq} }
- var(x, y) = (nodes.shift, nodes.shift)
- if (defined(x)) {
- if (defined(y)) {
- nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
- }
- else {
- nodes << Hash(:0 => x, :freq => x{:freq})
- }
- }
- nodes.len > 1 || break
- }
- var n = nodes.first
- walk(n, '', n{:tree} = Hash())
- return n
- }
- func huffman_encode(Array bytes, Hash t) {
- join('', t{bytes...})
- }
- func huffman_decode (String bits, Hash tree) {
- bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
- }
- func create_huffman_entry (Array bytes, FileHandle out_fh) {
- var freq = bytes.freq
- var h = mktree_from_freq(freq){:tree}
- var enc = huffman_encode(bytes, h)
- var max_symbol = (bytes.max \\ 0)
- say "Max symbol: #{max_symbol}\n";
- var freqs = []
- for i in (0 .. max_symbol) {
- freqs << (freq{i} \\ 0)
- }
- out_fh << delta_encode(freqs)
- out_fh << pack("N", enc.len)
- out_fh << pack("B*", enc)
- }
- func decode_huffman_entry (FileHandle fh) {
- var freqs = delta_decode(fh)
- var freq = Hash()
- freqs.each_kv {|k,v|
- if (v != 0) {
- freq{k} = v
- }
- }
- var rev_dict = mktree_from_freq(freq){:tree}.flip
- for k in (rev_dict.keys) {
- rev_dict{k} = "#{rev_dict{k}} "
- }
- var enc_len = unpack('N', 4.of { fh.getc }.join).to_i
- say "Encoded length: #{enc_len}\n"
- if (enc_len > 0) {
- return huffman_decode(read_bits(fh, enc_len), rev_dict).nums
- }
- return []
- }
- func rle4_encode (Array bytes) { # RLE1
- var rle = []
- var end = bytes.end
- var prev = -1
- var run = 0
- for (var i = 0; i <= end ; ++i) {
- if (bytes[i] == prev) {
- ++run
- }
- else {
- run = 1
- }
- rle << (prev = bytes[i])
- if (run >= 4) {
- run = 0
- i += 1
- while ((run < 255) && (i <= end) && (bytes[i] == prev)) {
- ++run
- ++i
- }
- rle << run
- run = 1
- if (i <= end) {
- rle << (prev = bytes[i])
- }
- }
- }
- return rle
- }
- func rle4_decode (Array bytes) { # RLE1
- var dec = [bytes[0]]
- var end = bytes.end
- var prev = bytes[0]
- var run = 1
- for (var i = 1 ; i <= end ; ++i) {
- if (bytes[i] == prev) {
- ++run
- }
- else {
- run = 1
- }
- dec << (prev = bytes[i])
- if (run >= 4) {
- if (++i <= end) {
- run = bytes[i]
- dec << run.of(prev)...
- }
- run = 0
- }
- }
- return dec
- }
- func rle_encode (Array bytes) { # RLE2
- var rle = []
- var end = bytes.end
- for (var i = 0 ; i <= end ; ++i) {
- var run = 0
- while ((i <= end) && (bytes[i] == 0)) {
- ++run
- ++i
- }
- if (run >= 1) {
- rle << (run+1).bits.slice(1)...
- }
- if (i <= end) {
- rle << (bytes[i] + 1)
- }
- }
- return rle
- }
- func rle_decode (Array rle) { # RLE2
- var dec = []
- var end = rle.end
- for (var i = 0 ; i <= end ; ++i) {
- var k = rle[i]
- if ((k == 0) || (k == 1)) {
- var run = 1
- while ((i <= end) && ((k == 0) || (k == 1))) {
- (run <<= 1) |= k
- k = rle[++i]
- }
- dec << (run-1).of(0)...
- }
- if (i <= end) {
- dec << (k - 1)
- }
- }
- return dec
- }
- func encode_alphabet (Array alphabet) {
- (var table = Hash()){alphabet...} = ()
- var populated = 0
- var marked = []
- for (var i = 0 ; i <= 255 ; i += 32) {
- var enc = 0
- for j in (0 .. 31) {
- if (table.has(i + j)) {
- enc |= (1 << j)
- }
- }
- if (enc == 0) {
- populated <<= 1
- }
- else {
- (populated <<= 1) |= 1
- marked << enc
- }
- }
- var delta = delta_encode(marked, true)
- say "Populated : #{sprintf('%08b', populated)}"
- say "Marked : #{marked}"
- say "Delta len : #{delta.len}"
- var encoded = ''
- encoded += populated.chr
- encoded += delta
- return encoded
- }
- func decode_alphabet (FileHandle fh) {
- var populated = sprintf('%08b', ord(fh.getc)).chars
- var marked = delta_decode(fh, true)
- var alphabet = []
- for (var i = 0 ; i <= 255 ; i += 32) {
- if (populated.shift) {
- var m = marked.shift
- for j in (0 .. 31) {
- if (m & 1) {
- alphabet << (i + j)
- }
- m >>= 1
- }
- }
- }
- return alphabet
- }
- func compression (String chunk, FileHandle out_fh) {
- var rle1 = rle4_encode(chunk.bytes)
- var (bwt, idx) = bwt_encode(rle1.pack('C*'))
- say "BWT index = #{idx}"
- var bytes = bwt.bytes
- var alphabet = bwt.uniq.sort.bytes
- var alphabet_enc = encode_alphabet(alphabet)
- var mtf = mtf_encode(bytes, alphabet)
- var rle = rle_encode(mtf)
- out_fh << pack('N', idx)
- out_fh << alphabet_enc
- create_huffman_entry(rle, out_fh)
- }
- func decompression (FileHandle fh, FileHandle out_fh) {
- var idx = unpack('N', 4.of { fh.getc \\ die "error" }.join).to_i
- var alphabet = decode_alphabet(fh)
- say "BWT index = #{idx}"
- say "Alphabet size: #{alphabet.len}"
- var rle = decode_huffman_entry(fh)
- var mtf = rle_decode(rle);
- var bwt = mtf_decode(mtf, alphabet)
- var rle4 = bwt_decode(bwt.pack('C*'), idx)
- var data = rle4_decode(rle4.bytes)
- out_fh << data.pack('C*')
- }
- # Compress file
- func compress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- var header = SIGNATURE
- # Print the header
- out_fh.print(header)
- # Compress data
- while (in_fh.read(\var chunk, CHUNK_SIZE)) {
- compression(chunk, out_fh)
- }
- # Close the files
- in_fh.close
- out_fh.close
- }
- # Decompress file
- func decompress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
- die "Not a BWT archive!\n"
- }
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- while (!in_fh.eof) {
- decompression(in_fh, out_fh)
- }
- in_fh.close
- out_fh.close
- }
- ARGV.getopt!('d', \var decode)
- var file = File(ARGV.shift) || do {
- say "usage: #{File(__MAIN__).basename} [-d] [input file]"
- Sys.exit(2)
- }
- if (decode || file.match(/\.bwt\.enc\z/)) {
- decompress_file(file, File("output.bwt.dec"))
- }
- else {
- compress_file(file, File("output.bwt.enc"))
- }
|