ppm_encoding.sf 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 10 August 2023
  4. # https://github.com/trizen
  5. # Implementation of a PPM (prediction by partial-matching) encoder, using Huffman Coding.
  6. # See also:
  7. # https://rosettacode.org/wiki/huffman_coding
  8. # Reference:
  9. # Data Compression (Summer 2023) - Lecture 16 - Adaptive Methods
  10. # https://youtube.com/watch?v=YKv-w8bXi9c
  11. define(
  12. ESCAPE_SYMBOL => 256, # escape symbol
  13. CONTEXTS_NUM => 4, # maximum number of contexts
  14. )
  15. func walk(Hash n, String s, Hash h) {
  16. if (n.has(:a)) {
  17. h{n{:a}} = s
  18. return nil
  19. }
  20. walk(n{:0}, s+'0', h) if n.has(:0)
  21. walk(n{:1}, s+'1', h) if n.has(:1)
  22. }
  23. func mktree_from_freq(Hash freqs) {
  24. var nodes = freqs.sort_by{.to_i}.map_2d { |b,f|
  25. Hash(a => b, freq => f)
  26. }
  27. loop {
  28. nodes.sort_by!{|h| h{:freq} }
  29. var(x, y) = (nodes.shift, nodes.shift)
  30. if (defined(x)) {
  31. if (defined(y)) {
  32. nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
  33. }
  34. else {
  35. nodes << Hash(:0 => x, :freq => x{:freq})
  36. }
  37. }
  38. nodes.len > 1 || break
  39. }
  40. var n = nodes.first
  41. walk(n, '', n{:tree} = Hash())
  42. return n
  43. }
  44. func encode(Array bytes, Array alphabet) {
  45. var enc = []
  46. var prev = []
  47. var ctx = [
  48. Hash(prev, Hash(freq => alphabet.freq)),
  49. ]
  50. for i in (1..CONTEXTS_NUM) {
  51. ctx << Hash(prev, Hash(freq => [ESCAPE_SYMBOL].freq)),
  52. }
  53. for c in ctx {
  54. c{prev}{:tree} = mktree_from_freq(c{prev}{:freq}){:tree}
  55. }
  56. bytes.each {|b|
  57. for k in (0..ctx.end -> flip) {
  58. var s = prev.last(max(k-1, 0))
  59. if (!ctx[k].has(s)) {
  60. ctx[k]{s}{:freq} = [ESCAPE_SYMBOL].freq
  61. }
  62. if (ctx[k]{s}{:freq}.has(b)) {
  63. if (k != 0) {
  64. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}
  65. ++ctx[k]{s}{:freq}{b}
  66. }
  67. enc << ctx[k]{s}{:tree}{b}
  68. prev << b
  69. prev.shift if (prev.len >= CONTEXTS_NUM)
  70. break
  71. }
  72. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}
  73. enc << ctx[k]{s}{:tree}{ESCAPE_SYMBOL}
  74. ctx[k]{s}{:freq}{b} = 1
  75. }
  76. }
  77. return enc.join
  78. }
  79. func decode(String enc, Array alphabet) {
  80. var out = []
  81. var prev = []
  82. var prefix = ''
  83. var ctx = [
  84. Hash(prev, Hash(freq => alphabet.freq)),
  85. ]
  86. for i in (1..CONTEXTS_NUM) {
  87. ctx << Hash(prev, Hash(freq => [ESCAPE_SYMBOL].freq)),
  88. }
  89. for c in ctx {
  90. c{prev}{:tree} = mktree_from_freq(c{prev}{:freq}){:tree}.flip
  91. }
  92. var context = CONTEXTS_NUM
  93. var key = prev.last(max(context-1, 0))
  94. enc.each {|bit|
  95. prefix += bit
  96. if (!ctx[context].has(key)) {
  97. ctx[context]{key}{:freq} = [ESCAPE_SYMBOL].freq
  98. ctx[context]{key}{:tree} = mktree_from_freq(ctx[context]{key}{:freq}){:tree}.flip
  99. }
  100. if (ctx[context]{key}{:tree}.has(prefix)) {
  101. var b = Num(ctx[context]{key}{:tree}{prefix})
  102. if (b == ESCAPE_SYMBOL) {
  103. --context
  104. key.shift if (key.len >= context)
  105. }
  106. else {
  107. out << b
  108. for k in (max(context, 1) .. CONTEXTS_NUM) {
  109. var s = prev.last(k-1)
  110. ctx[k]{s}{:freq} := [ESCAPE_SYMBOL].freq
  111. ctx[k]{s}{:freq}{b} := 0 ++
  112. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}.flip
  113. }
  114. context = CONTEXTS_NUM
  115. prev << b
  116. prev.shift if (prev.len >= CONTEXTS_NUM)
  117. key = prev.last(max(context-1, 0))
  118. }
  119. prefix = ''
  120. }
  121. }
  122. return out
  123. }
  124. #var text = "this is an example for huffman encoding"
  125. var text = "A SAD DAD; A SAD SALSA"
  126. var bytes = text.bytes
  127. say var enc = encode(bytes, bytes.uniq)
  128. say var dec = decode(enc, bytes.uniq).pack('C*')
  129. assert_eq(dec, text)
  130. printf("Saved: %.3f%%\n", ((dec.len - enc.len/8f) / dec.len * 100))