123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229 |
- #!/usr/bin/ruby
- # Compress/decompress files using LZ77 compression + Huffman coding.
- define(
- CHUNK_SIZE = (1 << 16),
- SIGNATURE = ('LZH' + 1.chr)
- )
- func walk(Hash n, String s, Hash h) {
- if (n.has(:a)) {
- h{n{:a}} = s
- return nil
- }
- walk(n{:0}, s+'0', h) if n.has(:0)
- walk(n{:1}, s+'1', h) if n.has(:1)
- }
- func make_tree(Array bytes) {
- var nodes = bytes.freq.sort.map_2d { |b,f|
- Hash(a => b, freq => f)
- }
- loop {
- nodes.sort_by!{|h| h{:freq} }
- var(x, y) = (nodes.shift, nodes.shift)
- if (defined(x)) {
- if (defined(y)) {
- nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
- }
- else {
- nodes << Hash(:0 => x, :freq => x{:freq})
- }
- }
- nodes.len > 1 || break
- }
- var n = nodes.first
- walk(n, '', n{:tree} = Hash())
- return n
- }
- func huffman_encode(Array bytes, Hash t) {
- bytes.map { t{_} }.join
- }
- func huffman_decode (String bits, Hash tree) {
- bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
- }
- func create_huffman_entry (Array bytes, FileHandle out_fh) {
- var h = make_tree(bytes){:tree}
- var enc = huffman_encode(bytes, h)
- var dict = ''
- var codes = ''
- for i in (0..255) {
- var c = (h{i} \\ '')
- codes += c
- dict += c.len.chr
- }
- out_fh.print(dict)
- out_fh.print(pack("B*", codes))
- out_fh.print(pack("N", enc.len))
- out_fh.print(pack("B*", enc))
- }
- func lz77_compress (String str, Array uncompressed, Array indices, Array lengths) {
- var la = 0
- var prefix = ''
- var chars = str.chars
- var end = chars.end
- while (la <= end) {
- var n = 1
- var p = 0
- var token = chars[la]
- while ((n < 255) && (la+n <= end) && ((var tmp = prefix.index(token, p)) >= 0)) {
- p = tmp
- token += chars[la + n]
- ++n
- }
- --n
- indices << p
- lengths << n
- uncompressed << chars[la+n]
- la += n+1
- prefix += token
- }
- return nil
- }
- # Compress file
- func lz77h_compress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- var header = SIGNATURE
- # Print the header
- out_fh.print(header)
- var (uncompressed, indices, lengths) = ([], [], [])
- # Compress data
- while (in_fh.read(\var chunk, CHUNK_SIZE)) {
- lz77_compress(chunk, uncompressed, indices, lengths)
- }
- uncompressed = [unpack('C*', uncompressed.join)]
- indices = [unpack('C*', pack('S*', indices...))]
- create_huffman_entry(uncompressed, out_fh)
- create_huffman_entry(indices, out_fh)
- create_huffman_entry(lengths, out_fh)
- # Close the files
- in_fh.close
- out_fh.close
- }
- func read_bits (FileHandle fh, Number bits_len) {
- var str = ''
- fh.read(\str, bits_len>>3)
- str = unpack('B*', str)
- while (str.len < bits_len) {
- str += unpack('B*', fh.getc \\ return nil)
- }
- if (str.len > bits_len) {
- str.substr!(0, bits_len)
- }
- return str
- }
- func decode_huffman_entry (FileHandle fh) {
- var codes = []
- var codes_len = 0
- fh.read(\var buffer, 256)
- [unpack('C*', buffer)].map{ Num(_) }.each_kv {|c,l|
- if (l > 0) {
- codes_len += l
- codes << [c, l]
- }
- }
- var codes_bin = read_bits(fh, codes_len)
- var rev_dict = Hash()
- for c,l in (codes) {
- var code = substr(codes_bin, 0, l)
- codes_bin.substr!(l)
- rev_dict{code} = c.chr
- }
- var enc_len = Num(unpack('N', 4.of{ fh.getc }.join))
- if (enc_len > 0) {
- var enc_data = read_bits(fh, enc_len)
- return huffman_decode(enc_data, rev_dict)
- }
- return ''
- }
- # Decompress file
- func lz77h_decompress_file(File input, File output) {
- var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
- if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
- die "Not a LZH archive!\n"
- }
- var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
- var uncompressed = decode_huffman_entry(in_fh).chars
- var indices = [unpack('S*', decode_huffman_entry(in_fh))]
- var lengths = [unpack('C*', decode_huffman_entry(in_fh))]
- var chunk = ''
- [uncompressed, indices, lengths].zip {|c,i,l|
- chunk += (chunk.substr(i, l) + c)
- if (chunk.len >= CHUNK_SIZE) {
- out_fh.print(chunk)
- chunk = ''
- }
- }
- out_fh.print(chunk)
- in_fh.close
- out_fh.close
- }
- ARGV.getopt!('d', \var decode)
- var file = File(ARGV.shift) || do {
- say "usage: #{File(__MAIN__).basename} [-d] [input file]"
- Sys.exit(2)
- }
- if (decode || file.match(/\.lzh\.enc\z/)) {
- lz77h_decompress_file(file, File("output.lzh.dec"))
- }
- else {
- lz77h_compress_file(file, File("output.lzh.enc"))
- }
|