compress.sf 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #!/usr/bin/ruby
  2. # A basic implementation of the UNIX `compress` tool, creating a .Z compressed file, using LZW compression.
  3. # This implementation reads from STDIN and outputs to STDOUT:
  4. # sidef compress.sf < input.txt > output.Z
  5. # Reference:
  6. # Data Compression (Summer 2023) - Lecture 4 - The Unix 'compress' Program
  7. # https://youtube.com/watch?v=1cJL9Va80Pk
  8. # See also:
  9. # https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  10. define (
  11. BUFFER_SIZE = 8*512, # must be a multiple of 8
  12. MAGIC_SIGNATURE = "\x1f\x9d\x90",
  13. )
  14. func compress (FileHandle in_fh, FileHandle out_fh) {
  15. in_fh.binmode(':raw')
  16. out_fh.binmode(':raw')
  17. out_fh.print(MAGIC_SIGNATURE)
  18. var dict_size = 256
  19. var dictionary = Hash(dict_size.of {|i| (i.chr, i) }...)
  20. ++dict_size # 256 is the 'RESET' marker
  21. var num_bits = 9
  22. var max_bits = 16
  23. var max_bits_size = (1 << num_bits)
  24. var max_dict_size = (1 << max_bits)
  25. var bitstream = []
  26. var bitstream_size = 0
  27. var output_index = {|symbol|
  28. bitstream << ('%0*b' % (num_bits, dictionary{symbol}) -> flip)
  29. bitstream_size += num_bits
  30. if (bitstream_size % BUFFER_SIZE == 0) {
  31. out_fh.print(pack("b*", bitstream.join))
  32. bitstream = []
  33. bitstream_size = 0
  34. }
  35. }
  36. var w = ''
  37. in_fh.each_char {|c|
  38. var wc = w+c
  39. if (dictionary.has(wc)) {
  40. w = wc
  41. }
  42. else {
  43. output_index.run(w)
  44. if (dict_size < max_dict_size) {
  45. dictionary{wc} = dict_size++
  46. if (dict_size > max_bits_size) {
  47. ++num_bits
  48. max_bits_size <<= 1
  49. }
  50. }
  51. w = c
  52. }
  53. }
  54. if (w != '') {
  55. output_index.run(w)
  56. }
  57. if (bitstream.len > 0) {
  58. out_fh.print(pack('b*', bitstream.join))
  59. }
  60. return true
  61. }
  62. compress(STDIN, STDOUT)