123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 21 June 2023
- # https://github.com/trizen
- # Compress/decompress files using Burrows-Wheeler Transform (BWT) + Variable length-run encoding + Huffman coding.
- # Reference:
- # Data Compression (Summer 2023) - Lecture 13 - BZip2
- # https://youtube.com/watch?v=cvoZbBZ3M2A
- define(
- CHUNK_SIZE = (1 << 17),
- SIGNATURE = ('BWRL' + 1.chr),
- LOOKAHEAD_LEN = 128,
- )
- func read_bit (FileHandle fh, Ref bitstring) {
- if ((*bitstring \\ '').is_empty) {
- *bitstring = unpack('b*', fh.getc \\ die "error")
- }
- var bit = (*bitstring).substr(-1)
- *bitstring = (*bitstring).substr(0, -1)
- return bit
- }
- func read_bits (FileHandle fh, Number bits_len) {
- var str = ''
- fh.read(\str, bits_len>>3)
- str = unpack('B*', str)
- while (str.len < bits_len) {
- str += unpack('B*', fh.getc \\ die "error")
- }
- if (str.len > bits_len) {
- str.substr!(0, bits_len)
- }
- return str
- }
- func bwt_sort (String s) { # O(n * LOOKAHEAD_LEN) space (fast)
- ^s.len -> map {|i|
- var t = s.slice(i, LOOKAHEAD_LEN)
- if (t.len < LOOKAHEAD_LEN) {
- t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
- }
- [t, i]
- }.sort {|a,b|
- (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
- }.map { .[1] }
- }
- func bwt_encode(String s) {
- var bwt = bwt_sort(s)
- var ret = bwt.map {|i| s.slice(i-1, 1) }.join
- var idx = bwt.first_index_by { .is_zero }
- return (ret, idx)
- }
- func bwt_decode(String bwt, Number idx) { # fast inversion
- var tail = bwt.chars
- var head = tail.sort
- var indices = Hash()
- tail.each_kv {|i,v|
- indices{v} := [] << i
- }
- var table = []
- head.each_kv {|i,v|
- table[i] = indices{v}.shift
- }
- var dec = ''
- var i = idx
- head.len.times {
- dec += head[i]
- i = table[i]
- }
- return dec
- }
- func delta_encode (Array integers) {
- var deltas = [0, integers.len, integers...].diffs
- var bitstring = FileHandle.new_buf(:raw)
- for d in (deltas) {
- if (d == 0) {
- bitstring << '0'
- }
- else {
- var t = d.abs.as_bin
- bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (t.len-1)), '0', t.substr(1))
- }
- }
- pack('B*', bitstring.parent)
- }
- func delta_decode (FileHandle fh) {
- var deltas = []
- var buffer = ''
- var len = 0
- for (var k = 0 ; k <= len ; ++k) {
- var bit = read_bit(fh, \buffer)
- if (bit == '0') {
- deltas << 0
- }
- else {
- var bit = read_bit(fh, \buffer)
- var n = (^Inf -> first { read_bit(fh, \buffer) != '1' })
- var d = Num('1' + n.of { read_bit(fh, \buffer) }.join, 2)
- deltas << (bit == '1' ? d : -d)
- }
- if (k == 0) {
- len = deltas.pop
- }
- }
- var acc = [len, deltas...].acc
- acc.shift
- return acc
- }
- func walk(Hash n, String s, Hash h) {
- if (n.has(:a)) {
- h{n{:a}} = s
- return nil
- }
- walk(n{:0}, s+'0', h) if n.has(:0)
- walk(n{:1}, s+'1', h) if n.has(:1)
- }
- func mktree_from_freq(Hash freqs) {
- var nodes = freqs.sort_by {|k| k.to_i }.map_2d { |b,f|
- Hash(a => b, freq => f)
- }
- loop {
- nodes.sort_by!{|h| h{:freq} }
- var(x, y) = (nodes.shift, nodes.shift)
- if (defined(x)) {
- if (defined(y)) {
- nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
- }
- else {
- nodes << Hash(:0 => x, :freq => x{:freq})
- }
- }
- nodes.len > 1 || break
- }
- var n = nodes.first
- walk(n, '', n{:tree} = Hash())
- return n
- }
- func huffman_encode(Array bytes, Hash t) {
- join('', t{bytes...})
- }
- func huffman_decode (String bits, Hash tree) {
- bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
- }
- func create_huffman_entry (Array bytes, FileHandle out_fh) {
- var freq = bytes.freq
- var h = mktree_from_freq(freq){:tree}
- var enc = huffman_encode(bytes, h)
- var max_symbol = (bytes.max \\ 0)
- say "Max symbol: #{max_symbol}";
- var freqs = []
- for i in (0 .. max_symbol) {
- freqs << (freq{i} \\ 0)
- }
- out_fh << delta_encode(freqs)
- out_fh << pack("N", enc.len)
- out_fh << pack("B*", enc)
- }
- func decode_huffman_entry (FileHandle fh) {
- var freqs = delta_decode(fh)
- var freq = Hash()
- freqs.each_kv {|k,v|
- if (v != 0) {
- freq{k} = v
- }
- }
- var rev_dict = mktree_from_freq(freq){:tree}.flip
- for k in (rev_dict.keys) {
- rev_dict{k} = "#{rev_dict{k}} "
- }
- var enc_len = unpack('N', 4.of { fh.getc }.join).to_i
- say "Encoded length: #{enc_len}"
- if (enc_len > 0) {
- return huffman_decode(read_bits(fh, enc_len), rev_dict).nums
- }
- return []
- }
- func VLR_encoding (Array bytes) {
- var uncompressed = []
- var bitstream = FileHandle.new_buf(:raw)
- for c,v in (bytes.run_length) {
- uncompressed << c
- if (v == 1) {
- bitstream << '0'
- }
- else {
- var t = v.as_bin
- bitstream << ('1'*(t.len-1) + '0' + t.substr(1))
- }
- }
- return (uncompressed, bitstream.parent)
- }
- func VLR_decoding (Array uncompressed, FileHandle fh) {
- var decoded = FileHandle.new_buf(:raw)
- var buffer = ''
- uncompressed.len.times {
- var c = uncompressed.shift.chr
- var bl = (^Inf -> first { read_bit(fh, \buffer) != '1' })
- if (bl > 0) {
- decoded << c*Num('1' + bl.of { read_bit(fh, \buffer) }.join, 2)
- }
- else {
- decoded << c
- }
- }
- return decoded.parent
- }
- func compression (String chunk, FileHandle out_fh) {
- var (bwt, idx) = bwt_encode(chunk)
- say "\nBWT index = #{idx}"
- var (uncompressed, lengths) = VLR_encoding(bwt.bytes)
- out_fh << pack('N', idx)
- create_huffman_entry(uncompressed, out_fh)
- create_huffman_entry(pack('B*', lengths).bytes, out_fh)
- }
- func decompression (FileHandle fh, FileHandle out_fh) {
- var idx = unpack('N', 4.of { fh.getc \\ die "error" }.join).to_i
- say "\nBWT index = #{idx}"
- var uncompressed = decode_huffman_entry(fh)
- var lengths = decode_huffman_entry(fh).pack('C*')
- var bwt = VLR_decoding(uncompressed, lengths.open_r(:raw))
- out_fh << bwt_decode(bwt, idx)
- }
- # Compress file
- func compress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- var header = SIGNATURE
- # Print the header
- out_fh.print(header)
- # Compress data
- while (in_fh.read(\var chunk, CHUNK_SIZE)) {
- compression(chunk, out_fh)
- }
- # Close the files
- in_fh.close
- out_fh.close
- }
- # Decompress file
- func decompress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
- die "Not a BWRL archive!\n"
- }
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- while (!in_fh.eof) {
- decompression(in_fh, out_fh)
- }
- in_fh.close
- out_fh.close
- }
- ARGV.getopt!('d', \var decode)
- var file = File(ARGV.shift) || do {
- say "usage: #{File(__MAIN__).basename} [-d] [input file]"
- Sys.exit(2)
- }
- if (decode || file.match(/\.bwrl\.enc\z/)) {
- decompress_file(file, File("output.bwrl.dec"))
- }
- else {
- compress_file(file, File("output.bwrl.enc"))
- }
|