123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 15 December 2022
- # Edit: 10 September 2023
- # https://github.com/trizen
- # Compress/decompress files using LZ77 compression + fixed-width integers encoding + Huffman coding.
- define(
- CHUNK_SIZE = (1 << 16),
- COMPRESSED_BYTE = 1.chr,
- UNCOMPRESSED_BYTE = 0.chr,
- FORMAT = 'lzih',
- SIGNATURE = ('LZIH' + 4.chr),
- )
- func read_bits (FileHandle fh, Number bits_len) {
- var str = ''
- fh.read(\str, bits_len>>3)
- str = unpack('B*', str)
- while (str.len < bits_len) {
- str += unpack('B*', fh.getc \\ return nil)
- }
- if (str.len > bits_len) {
- str.substr!(0, bits_len)
- }
- return str
- }
- func read_bit (FileHandle fh, Ref bitstring) {
- if ((*bitstring \\ '').is_empty) {
- *bitstring = unpack('b*', fh.getc \\ die "error")
- }
- var bit = (*bitstring).substr(-1)
- *bitstring = (*bitstring).substr(0, -1)
- return bit
- }
- func delta_encode (Array integers) {
- var deltas = [0, integers.len, integers...].diffs
- var bitstring = FileHandle.new_buf(:raw)
- for d in (deltas) {
- if (d == 0) {
- bitstring << '0'
- }
- else {
- var t = d.abs.as_bin
- bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (t.len-1)), '0', t.substr(1))
- }
- }
- pack('B*', bitstring.parent)
- }
- func delta_decode (FileHandle fh) {
- var deltas = []
- var buffer = ''
- var len = 0
- for (var k = 0 ; k <= len ; ++k) {
- var bit = read_bit(fh, \buffer)
- if (bit == '0') {
- deltas << 0
- }
- else {
- var bit = read_bit(fh, \buffer)
- var n = (^Inf -> first { read_bit(fh, \buffer) != '1' })
- var d = Num('1' + n.of { read_bit(fh, \buffer) }.join, 2)
- deltas << (bit == '1' ? d : -d)
- }
- if (k == 0) {
- len = deltas.pop
- }
- }
- var acc = [len, deltas...].acc
- acc.shift
- return acc
- }
- func encode_integers (Array integers) {
- var counts = []
- var count = 0
- var bits_per_symbol = 2
- var processed_len = 0
- for k in (integers) {
- while (k >= bits_per_symbol) {
- if (count > 0) {
- counts << [bits_per_symbol.len(2)-1, count]
- processed_len += count
- }
- count = 0
- bits_per_symbol *= 2
- }
- ++count
- }
- counts << [[bits_per_symbol.len(2)-1, integers.len - processed_len]].grep { .tail > 0 }...
- var header = delta_encode([counts.map{.head}..., counts.map{.tail}...]);
- var bits = []
- for b,len in (counts) {
- bits << integers.splice(0, len).map{ sprintf('%0*s', b, .as_bin) }...
- }
- header + pack('B*', bits.join)
- }
- func decode_integers (FileHandle str_fh) {
- var ints = delta_decode(str_fh)
- var half = (ints.len >> 1)
- var counts = []
- for i in (0 .. (half - 1)) {
- counts << [ints[$i], ints[half + i]]
- }
- var bits_len = 0
- for blen,len in (counts) {
- bits_len += (blen * len)
- }
- var chunks = []
- var bits = read_bits(str_fh, bits_len)
- var bits_fh = bits.open_r(:raw)
- for b,len in (counts) {
- bits_fh.read(\var chunk, len*b)
- chunks << chunk.slices(b)...
- }
- chunks.map { Num(_, 2) }
- }
- func walk(Hash n, String s, Hash h) {
- if (n.has(:a)) {
- h{n{:a}} = s
- return nil
- }
- walk(n{:0}, s+'0', h) if n.has(:0)
- walk(n{:1}, s+'1', h) if n.has(:1)
- }
- func mktree_from_freq(Hash freqs) {
- var nodes = freqs.sort_by {|k| k.to_i }.map_2d { |b,f|
- Hash(a => b, freq => f)
- }
- loop {
- nodes.sort_by!{|h| h{:freq} }
- var(x, y) = (nodes.shift, nodes.shift)
- if (defined(x)) {
- if (defined(y)) {
- nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
- }
- else {
- nodes << Hash(:0 => x, :freq => x{:freq})
- }
- }
- nodes.len > 1 || break
- }
- var n = nodes.first
- walk(n, '', n{:tree} = Hash())
- return n
- }
- func huffman_encode(Array bytes, Hash t) {
- join('', t{bytes...})
- }
- func huffman_decode (String bits, Hash tree) {
- bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
- }
- func create_huffman_entry (Array bytes, FileHandle out_fh) {
- var freq = bytes.freq
- var h = mktree_from_freq(freq){:tree}
- var enc = huffman_encode(bytes, h)
- var max_symbol = (bytes.max \\ 0)
- var freqs = []
- for i in (0 .. max_symbol) {
- freqs << (freq{i} \\ 0)
- }
- out_fh << delta_encode(freqs)
- out_fh << pack("N", enc.len)
- out_fh << pack("B*", enc)
- }
- func decode_huffman_entry (FileHandle fh) {
- var freqs = delta_decode(fh)
- var freq = Hash()
- freqs.each_kv {|k,v|
- if (v != 0) {
- freq{k} = v
- }
- }
- var rev_dict = mktree_from_freq(freq){:tree}.flip
- for k in (rev_dict.keys) {
- rev_dict{k} = "#{rev_dict{k}} "
- }
- var enc_len = unpack('N', 4.of { fh.getc }.join).to_i
- if (enc_len > 0) {
- return huffman_decode(read_bits(fh, enc_len), rev_dict).nums
- }
- return []
- }
- func lz77_compression (String str, Array uncompressed, Array indices, Array lengths) {
- var la = 0
- var prefix = ''
- var chars = str.chars
- var end = chars.end
- while (la <= end) {
- var n = 1
- var p = prefix.len
- var tmp = nil
- var token = chars[la]
- while ((n < 255) && (la+n <= end) && ((tmp = prefix.rindex(token, p)) >= 0)) {
- p = tmp
- token += chars[la + n]
- ++n
- }
- --n
- indices << la-p
- lengths << n
- uncompressed << chars[la+n].ord
- la += n+1
- prefix += token
- }
- return nil
- }
- func lz77_decompression (Array uncompressed, Array indices, Array lengths) {
- var (fh, str) = FileHandle.new_buf(:raw)
- var offset = 0
- [uncompressed, indices, lengths].zip {|c,i,l|
- fh << (str.substr(offset - i, l), c.chr)
- offset += (l + 1)
- }
- return str
- }
- # Compress file
- func compress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- var header = SIGNATURE
- # Print the header
- out_fh.print(header)
- # Compress data
- while (in_fh.read(\var chunk, CHUNK_SIZE)) {
- var (uncompressed, indices, lengths) = ([], [], [])
- lz77_compression(chunk, uncompressed, indices, lengths)
- var est_ratio = (chunk.len / (4 * uncompressed.len))
- say (uncompressed.len, " -> ", est_ratio)
- if (est_ratio > 0.85) {
- out_fh.print(COMPRESSED_BYTE)
- create_huffman_entry(uncompressed, out_fh)
- create_huffman_entry(lengths, out_fh)
- out_fh.print(encode_integers(indices))
- }
- else {
- out_fh.print(UNCOMPRESSED_BYTE)
- create_huffman_entry(chunk.bytes, out_fh)
- }
- }
- # Close the files
- in_fh.close
- out_fh.close
- }
- # Decompress file
- func decompress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
- die "Not a LZIH archive!\n"
- }
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- while (!in_fh.eof) {
- var compression_byte = in_fh.getc
- if (compression_byte == COMPRESSED_BYTE) {
- var uncompressed = decode_huffman_entry(in_fh)
- var lengths = decode_huffman_entry(in_fh)
- var indices = decode_integers(in_fh)
- out_fh.print(lz77_decompression(uncompressed, indices, lengths))
- }
- elsif (compression_byte == UNCOMPRESSED_BYTE) {
- out_fh.print(decode_huffman_entry(in_fh).pack('C*'))
- }
- else {
- die "Invalid compression..."
- }
- }
- in_fh.close
- out_fh.close
- }
- ARGV.getopt!('d', \var decode)
- var file = File(ARGV.shift) || do {
- say "usage: #{File(__MAIN__).basename} [-d] [input file]"
- Sys.exit(2)
- }
- if (decode || file.match(Regex('\.' + FORMAT + '\.enc\z'))) {
- decompress_file(file, File("output.#{FORMAT}.dec"))
- }
- else {
- compress_file(file, File("output.#{FORMAT}.enc"))
- }
|