123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 04 August 2023
- # https://github.com/trizen
- # Arithmetic coding, implemented using big integers.
- # See also:
- # https://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
- func cumulative_freq (freq) {
- var cf = Hash()
- var total = 0
- freq.keys.sort_by { Num(_) }.each {|c|
- cf{c} = total
- total += freq{c}
- }
- return cf
- }
- func ac_encode (symbols) {
- # The frequency characters
- var freq = symbols.freq
- # Create the cumulative frequency table
- var cf = cumulative_freq(freq)
- # Limit and base
- var base = symbols.len
- # Lower bound
- var L = 0
- # Product of all frequencies
- var pf = 1
- # Each term is multiplied by the product of the
- # frequencies of all previously occurring symbols
- for c in (symbols) {
- L *= base
- L += pf*cf{c}
- pf *= freq{c}
- }
- # Upper bound
- var U = (L + pf)
- # Compute the power for left shift
- var pow = pf.ilog2
- # Set enc to (U-1) divided by 2^pow
- var enc = ((U - 1) >> pow)
- # Remove any divisibility by 2
- if (enc>0 && enc.is_even) {
- var v = enc.valuation(2)
- pow += v
- enc >>= v
- }
- var bin = enc.base(2)
- return (bin, pow, freq)
- }
- func ac_decode (bits, pow, freq) {
- # Decode the bits into an integer
- var enc = Num(bits, 2)<<pow
- # Base value == length of decoded input
- var base = freq.values.sum
- if (base == 0) {
- return []
- }
- elsif (base == 1) {
- return freq.keys
- }
- # Create the cumulative frequency table
- var cf = cumulative_freq(freq)
- # Create the dictionary
- var dict = Hash()
- cf.each {|k,v|
- dict{v} = k
- }
- # Fill the gaps in the dictionary
- var lchar = nil
- for i in (^base) {
- if (dict.has(i)) {
- lchar = dict{i}
- }
- elsif (defined(lchar)) {
- dict{i} = lchar
- }
- }
- var dec = []
- # Decode the input number
- for (var pow = base**(base - 1); pow > 0; pow.idiv!(base)) {
- var div = idiv(enc, pow)
- var c = dict{div}
- var fv = freq{c}
- var cv = cf{c}
- enc -= pow*cv
- enc.idiv!(fv)
- dec << c
- }
- return dec
- }
- #
- ## Run some tests
- #
- for str in ([
- '', 'a', 'this is a message for you to encode and to decode correctly!',
- ['a'..'z', 0..9, 'A'..'Z', 0..9].map{.join}.join,
- %w(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT)...,
- 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '+
- 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '+
- '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '+
- 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
- ]) {
- var bytes = str.encode(:utf8).bytes
- var (enc, pow, freq) = ac_encode(bytes)
- var dec_bytes = ac_decode(enc, pow, freq)
- var dec = dec_bytes.decode(:utf8)
- say "Encoded: #{enc}"
- say "Decoded: #{dec}"
- if (str != dec) {
- die "\tHowever that is incorrect!"
- }
- say ("-" * 80)
- }
|