lzh_file_compression.sf 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #!/usr/bin/ruby
  2. # Compress/decompress files using LZ77 compression + Huffman coding.
  3. define(
  4. CHUNK_SIZE = (1 << 16),
  5. SIGNATURE = ('LZH' + 1.chr)
  6. )
  7. func walk(Hash n, String s, Hash h) {
  8. if (n.has(:a)) {
  9. h{n{:a}} = s
  10. return nil
  11. }
  12. walk(n{:0}, s+'0', h) if n.has(:0)
  13. walk(n{:1}, s+'1', h) if n.has(:1)
  14. }
  15. func make_tree(Array bytes) {
  16. var nodes = bytes.freq.sort.map_2d { |b,f|
  17. Hash(a => b, freq => f)
  18. }
  19. loop {
  20. nodes.sort_by!{|h| h{:freq} }
  21. var(x, y) = (nodes.shift, nodes.shift)
  22. if (defined(x)) {
  23. if (defined(y)) {
  24. nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
  25. }
  26. else {
  27. nodes << Hash(:0 => x, :freq => x{:freq})
  28. }
  29. }
  30. nodes.len > 1 || break
  31. }
  32. var n = nodes.first
  33. walk(n, '', n{:tree} = Hash())
  34. return n
  35. }
  36. func huffman_encode(Array bytes, Hash t) {
  37. bytes.map { t{_} }.join
  38. }
  39. func huffman_decode (String bits, Hash tree) {
  40. bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
  41. }
  42. func create_huffman_entry (Array bytes, FileHandle out_fh) {
  43. var h = make_tree(bytes){:tree}
  44. var enc = huffman_encode(bytes, h)
  45. var dict = ''
  46. var codes = ''
  47. for i in (0..255) {
  48. var c = (h{i} \\ '')
  49. codes += c
  50. dict += c.len.chr
  51. }
  52. out_fh.print(dict)
  53. out_fh.print(pack("B*", codes))
  54. out_fh.print(pack("N", enc.len))
  55. out_fh.print(pack("B*", enc))
  56. }
  57. func lz77_compress (String str, Array uncompressed, Array indices, Array lengths) {
  58. var la = 0
  59. var prefix = ''
  60. var chars = str.chars
  61. var end = chars.end
  62. while (la <= end) {
  63. var n = 1
  64. var p = 0
  65. var token = chars[la]
  66. while ((n < 255) && (la+n <= end) && ((var tmp = prefix.index(token, p)) >= 0)) {
  67. p = tmp
  68. token += chars[la + n]
  69. ++n
  70. }
  71. --n
  72. indices << p
  73. lengths << n
  74. uncompressed << chars[la+n]
  75. la += n+1
  76. prefix += token
  77. }
  78. return nil
  79. }
  80. # Compress file
  81. func lz77h_compress_file(File input, File output) {
  82. var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
  83. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  84. var header = SIGNATURE
  85. # Print the header
  86. out_fh.print(header)
  87. var (uncompressed, indices, lengths) = ([], [], [])
  88. # Compress data
  89. while (in_fh.read(\var chunk, CHUNK_SIZE)) {
  90. lz77_compress(chunk, uncompressed, indices, lengths)
  91. }
  92. uncompressed = [unpack('C*', uncompressed.join)]
  93. indices = [unpack('C*', pack('S*', indices...))]
  94. create_huffman_entry(uncompressed, out_fh)
  95. create_huffman_entry(indices, out_fh)
  96. create_huffman_entry(lengths, out_fh)
  97. # Close the files
  98. in_fh.close
  99. out_fh.close
  100. }
  101. func read_bits (FileHandle fh, Number bits_len) {
  102. var str = ''
  103. fh.read(\str, bits_len>>3)
  104. str = unpack('B*', str)
  105. while (str.len < bits_len) {
  106. str += unpack('B*', fh.getc \\ return nil)
  107. }
  108. if (str.len > bits_len) {
  109. str.substr!(0, bits_len)
  110. }
  111. return str
  112. }
  113. func decode_huffman_entry (FileHandle fh) {
  114. var codes = []
  115. var codes_len = 0
  116. fh.read(\var buffer, 256)
  117. [unpack('C*', buffer)].map{ Num(_) }.each_kv {|c,l|
  118. if (l > 0) {
  119. codes_len += l
  120. codes << [c, l]
  121. }
  122. }
  123. var codes_bin = read_bits(fh, codes_len)
  124. var rev_dict = Hash()
  125. for c,l in (codes) {
  126. var code = substr(codes_bin, 0, l)
  127. codes_bin.substr!(l)
  128. rev_dict{code} = c.chr
  129. }
  130. var enc_len = Num(unpack('N', 4.of{ fh.getc }.join))
  131. if (enc_len > 0) {
  132. var enc_data = read_bits(fh, enc_len)
  133. return huffman_decode(enc_data, rev_dict)
  134. }
  135. return ''
  136. }
  137. # Decompress file
  138. func lz77h_decompress_file(File input, File output) {
  139. var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
  140. if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
  141. die "Not a LZH archive!\n"
  142. }
  143. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  144. var uncompressed = decode_huffman_entry(in_fh).chars
  145. var indices = [unpack('S*', decode_huffman_entry(in_fh))]
  146. var lengths = [unpack('C*', decode_huffman_entry(in_fh))]
  147. var chunk = ''
  148. [uncompressed, indices, lengths].zip {|c,i,l|
  149. chunk += (chunk.substr(i, l) + c)
  150. if (chunk.len >= CHUNK_SIZE) {
  151. out_fh.print(chunk)
  152. chunk = ''
  153. }
  154. }
  155. out_fh.print(chunk)
  156. in_fh.close
  157. out_fh.close
  158. }
  159. ARGV.getopt!('d', \var decode)
  160. var file = File(ARGV.shift) || do {
  161. say "usage: #{File(__MAIN__).basename} [-d] [input file]"
  162. Sys.exit(2)
  163. }
  164. if (decode || file.match(/\.lzh\.enc\z/)) {
  165. lz77h_decompress_file(file, File("output.lzh.dec"))
  166. }
  167. else {
  168. lz77h_compress_file(file, File("output.lzh.enc"))
  169. }