ppm_encoding_dynamic.sf 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 10 August 2023
  4. # https://github.com/trizen
  5. # Implementation of a PPM (prediction by partial-matching) encoder, using Huffman Coding.
  6. # This variant dynamically increments the context-length, based on the input data, in order to reduce the number of escape symbols being generated.
  7. # See also:
  8. # https://rosettacode.org/wiki/huffman_coding
  9. # Reference:
  10. # Data Compression (Summer 2023) - Lecture 16 - Adaptive Methods
  11. # https://youtube.com/watch?v=YKv-w8bXi9c
  12. define(
  13. ESCAPE_SYMBOL => 256, # escape symbol
  14. CONTEXTS_NUM => 3, # maximum number of contexts
  15. INITIAL_CONTEXT => 1, # start in this context
  16. )
  17. func walk(Hash n, String s, Hash h) {
  18. if (n.has(:a)) {
  19. h{n{:a}} = s
  20. return nil
  21. }
  22. walk(n{:0}, s+'0', h) if n.has(:0)
  23. walk(n{:1}, s+'1', h) if n.has(:1)
  24. }
  25. func mktree_from_freq(Hash freqs) {
  26. var nodes = freqs.sort_by{.to_i}.map_2d { |b,f|
  27. Hash(a => b, freq => f)
  28. }
  29. loop {
  30. nodes.sort_by!{|h| h{:freq} }
  31. var(x, y) = (nodes.shift, nodes.shift)
  32. if (defined(x)) {
  33. if (defined(y)) {
  34. nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
  35. }
  36. else {
  37. nodes << Hash(:0 => x, :freq => x{:freq})
  38. }
  39. }
  40. nodes.len > 1 || break
  41. }
  42. var n = nodes.first
  43. walk(n, '', n{:tree} = Hash())
  44. return n
  45. }
  46. func encode(Array bytes, Array alphabet) {
  47. var enc = []
  48. var prev = []
  49. var ctx = [
  50. Hash(prev, Hash(freq => alphabet.freq)),
  51. ]
  52. for i in (1..CONTEXTS_NUM) {
  53. ctx << Hash(prev, Hash(freq => [ESCAPE_SYMBOL].freq)),
  54. }
  55. for c in ctx {
  56. c{prev}{:tree} = mktree_from_freq(c{prev}{:freq}){:tree}
  57. }
  58. var prev_ctx = INITIAL_CONTEXT
  59. bytes.each {|b|
  60. for k in (0..prev_ctx -> flip) {
  61. var s = prev.last(max(k-1, 0))
  62. if (!ctx[k].has(s)) {
  63. ctx[k]{s}{:freq} = [ESCAPE_SYMBOL].freq
  64. }
  65. if (ctx[k]{s}{:freq}.has(b)) {
  66. if (k != 0) {
  67. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}
  68. ++ctx[k]{s}{:freq}{b}
  69. }
  70. enc << ctx[k]{s}{:tree}{b}
  71. ++prev_ctx if (prev_ctx < ctx.end)
  72. prev << b
  73. prev.shift if (prev.len >= CONTEXTS_NUM)
  74. break
  75. }
  76. --prev_ctx
  77. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}
  78. enc << ctx[k]{s}{:tree}{ESCAPE_SYMBOL}
  79. ctx[k]{s}{:freq}{b} = 1
  80. }
  81. }
  82. return enc.join
  83. }
  84. func decode(String enc, Array alphabet) {
  85. var out = []
  86. var prev = []
  87. var prefix = ''
  88. var ctx = [
  89. Hash(prev, Hash(freq => alphabet.freq)),
  90. ]
  91. for i in (1..CONTEXTS_NUM) {
  92. ctx << Hash(prev, Hash(freq => [ESCAPE_SYMBOL].freq)),
  93. }
  94. for c in ctx {
  95. c{prev}{:tree} = mktree_from_freq(c{prev}{:freq}){:tree}.flip
  96. }
  97. var prev_ctx = var context = INITIAL_CONTEXT
  98. var key = prev.last(max(context-1, 0))
  99. enc.each {|bit|
  100. prefix += bit
  101. if (!ctx[context].has(key)) {
  102. ctx[context]{key}{:freq} = [ESCAPE_SYMBOL].freq
  103. ctx[context]{key}{:tree} = mktree_from_freq(ctx[context]{key}{:freq}){:tree}.flip
  104. }
  105. if (ctx[context]{key}{:tree}.has(prefix)) {
  106. var b = Num(ctx[context]{key}{:tree}{prefix})
  107. if (b == ESCAPE_SYMBOL) {
  108. --context
  109. key.shift if (key.len >= context)
  110. }
  111. else {
  112. out << b
  113. for k in (max(context,1) .. prev_ctx) {
  114. var s = prev.last(k-1)
  115. ctx[k]{s}{:freq} := [ESCAPE_SYMBOL].freq
  116. ctx[k]{s}{:freq}{b} := 0 ++
  117. ctx[k]{s}{:tree} = mktree_from_freq(ctx[k]{s}{:freq}){:tree}.flip
  118. }
  119. ++context if (context < ctx.end)
  120. prev_ctx = context
  121. prev << b
  122. prev.shift if (prev.len >= CONTEXTS_NUM)
  123. key = prev.last(max(context-1, 0))
  124. }
  125. prefix = ''
  126. }
  127. }
  128. return out
  129. }
  130. #var text = "this is an example for huffman encoding"
  131. var text = "A SAD DAD; A SAD SALSA"
  132. var bytes = text.bytes
  133. say var enc = encode(bytes, bytes.uniq)
  134. say var dec = decode(enc, bytes.uniq).pack('C*')
  135. assert_eq(dec, text)
  136. printf("Saved: %.3f%%\n", ((dec.len - enc.len/8f) / dec.len * 100))