123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- #!/usr/bin/ruby
- # Author: Trizen
- # Date: 12 July 2023
- # Edit: 06 February 2024
- # https://github.com/trizen
- # The Arithmetic Coding algorithm, implemented using native integers.
- # References:
- # Data Compression (Summer 2023) - Lecture 15 - Infinite Precision in Finite Bits
- # https://youtube.com/watch?v=EqKbT3QdtOI
- #
- # Basic arithmetic coder in C++
- # https://github.com/billbird/arith32
- define (
- BITS = 32, # can be modified
- MAX = (1 << BITS)-1,
- EOF_SYMBOL = 256,
- )
- func create_cfreq(freq) {
- var c_low = Hash()
- var c_high = Hash()
- var T = 0
- for i in (0 .. EOF_SYMBOL) {
- freq{i} \\ next
- c_low{i} = T
- T += freq{i}
- c_high{i} = T
- }
- return (c_low, c_high, T)
- }
- func encode (String str) {
- var bytes = str.bytes+[EOF_SYMBOL]
- var enc = FileHandle.new_buf(:raw)
- var freq = bytes.freq
- var(cf_low, cf_high, T) = create_cfreq(freq)
- if (T > MAX) {
- die ("Too few bits: #{T} > ", MAX)
- }
- var low = 0
- var high = MAX
- var uf_count = 0
- for c in (bytes) {
- var w = (high - low + 1)
- high = (low + idiv(w * cf_high{c}, T) - 1)&MAX
- low = (low + idiv(w * cf_low{c}, T))&MAX
- if (high > MAX) {
- die "high > MAX: #{high} > #{MAX}"
- }
- if (low >= high) { die "#{low} >= #{high}" }
- loop {
- if ((high>>(BITS-1)) == (low>>(BITS-1))) {
- var bit = high>>(BITS-1)
- enc << bit
- if (uf_count > 0) {
- enc << Str(1 - bit)*uf_count
- uf_count = 0
- }
- low <<= 1
- high <<= 1 |= 1
- }
- elsif ((((low>>(BITS-2))&0x1) == 1) && (((high>>(BITS-2))&0x1) == 0)) {
- high <<= 1 |= (1 << (BITS-1)) |= 1
- low <<= 1 &= ((1 << (BITS-1)) - 1)
- ++uf_count
- }
- else {
- break
- }
- low &= MAX
- high &= MAX
- }
- }
- enc.print(0)
- enc.print(1)
- while (enc.parent.len % 8 != 0) {
- enc.print(1)
- }
- return (enc.parent, freq)
- }
- func decode (bits, freq) {
- var fh = bits.open_r(:raw)
- var(cf_low, cf_high, T) = create_cfreq(freq)
- if (T > MAX) {
- die "Too large value: #{T} > #{MAX}"
- }
- var dec = FileHandle.new_buf(:raw)
- var low = 0
- var high = MAX
- var enc = Num(BITS.of { fh.getc \\ 1 }.join, 2)
- var table = []
- for i in (0..EOF_SYMBOL) {
- cf_low{i} \\ next
- for j in (cf_low{i} ..^ cf_high{i}) {
- table[j] = i
- }
- }
- loop {
- var w = (high - low + 1)
- var ss = idiv(T*(enc - low + 1) - 1, w)
- var i = table[ss] \\ break
- break if (i == EOF_SYMBOL)
- dec << i.chr
- high = (low + idiv(w * cf_high{i}, T) - 1)&MAX
- low = (low + idiv(w * cf_low{i}, T))&MAX
- if (high > MAX) {
- die "high > MAX: #{high} > #{MAX}"
- }
- if (low >= high) { die "#{low} >= #{high}" }
- loop {
- if ((high>>(BITS-1)) == (low>>(BITS-1))) {
- high <<= 1 |= 1
- low <<= 1
- enc <<= 1 |= (fh.getc \\ 1)
- }
- elsif ((((low>>(BITS-2))&0x1) == 1) && (((high>>(BITS-2))&0x1) == 0)) {
- high <<= 1 |= (1 << (BITS-1)) |= 1
- low <<= 1 &= ((1 << (BITS-1)) - 1)
- enc = ((enc>>(BITS-1))<<(BITS-1))|((enc&((1<<(BITS-2))-1))<<1)|(fh.getc \\ 1)
- }
- else {
- break
- }
- low &= MAX
- high &= MAX
- enc &= MAX
- }
- }
- return dec.parent
- }
- var str = "ABRACADABRA AND A VERY SAD SALAD"
- if (ARGV) {
- if (File(ARGV[0]).exists) {
- str = File(ARGV[0]).read(:raw)
- }
- else {
- str = ARGV[0]
- }
- }
- var (enc, freq) = encode(str)
- say enc
- say ("String length: ", str.len)
- say ("Encoded bytes length: ", length(enc)/8)
- say ("Ratio: ", (enc.len/8)/str.len * 100)
- var dec = decode(enc, freq)
- assert_eq(str.len, dec.len)
- assert_eq(dec, str)
- __END__
- 0100110110111110100000000100000111110000110110011111000010110011011001000101100011011101001110000000010001111111
- String length: 32
- Encoded bytes length: 14
- Ratio: 43.75
|