bzip2_compressor.pl 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 20 August 2024
  4. # https://github.com/trizen
  5. # A very basic Bzip2 compressor.
  6. # References:
  7. # BZIP2: Format Specification, by Joe Tsai
  8. # https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
  9. use 5.036;
  10. use lib qw(../lib);
  11. use POSIX qw(ceil);
  12. use List::Util qw(max);
  13. use Compression::Util qw(:all);
  14. use constant {CHUNK_SIZE => 1 << 16};
  15. local $| = 1;
  16. binmode(STDIN, ":raw");
  17. binmode(STDOUT, ":raw");
  18. sub encode_mtf_alphabet($alphabet) {
  19. my %table;
  20. @table{@$alphabet} = ();
  21. my $populated = 0;
  22. my @marked;
  23. for (my $i = 0 ; $i <= 255 ; $i += 16) {
  24. my $enc = 0;
  25. foreach my $j (0 .. 15) {
  26. if (exists($table{$i + $j})) {
  27. $enc |= 1 << $j;
  28. }
  29. }
  30. $populated <<= 1;
  31. if ($enc > 0) {
  32. $populated |= 1;
  33. push @marked, $enc;
  34. }
  35. }
  36. say STDERR sprintf("Populated: %016b", $populated);
  37. say STDERR "Marked: (@marked)";
  38. return ($populated, \@marked);
  39. }
  40. sub encode_code_lengths($dict) {
  41. my @lengths;
  42. foreach my $symbol (0 .. max(keys %$dict) // 0) {
  43. if (exists($dict->{$symbol})) {
  44. push @lengths, length($dict->{$symbol});
  45. }
  46. else {
  47. die "Incomplete Huffman tree not supported";
  48. push @lengths, 0;
  49. }
  50. }
  51. say STDERR "Code lengths: (@lengths)";
  52. my $deltas = deltas(\@lengths);
  53. say STDERR "Code lengths deltas: (@$deltas)";
  54. my $bitstring = int2bits(shift(@$deltas), 5) . '0';
  55. foreach my $d (@$deltas) {
  56. $bitstring .= (($d > 0) ? ('10' x $d) : ('11' x abs($d))) . '0';
  57. }
  58. say STDERR "Deltas bitstring: $bitstring";
  59. return $bitstring;
  60. }
  61. my $s = "Hello, World!\n";
  62. my $fh;
  63. if (-t STDIN) {
  64. open $fh, "<:raw", \$s;
  65. }
  66. else {
  67. $fh = \*STDIN;
  68. }
  69. print "BZh";
  70. my $level = 1;
  71. if ($level <= 0 or $level > 9) {
  72. die "Invalid level value: $level";
  73. }
  74. print $level;
  75. my $block_header_bitstring = unpack("B48", "1AY&SY");
  76. my $block_footer_bitstring = unpack("B48", "\27rE8P\x90");
  77. my $bitstring = '';
  78. my $stream_crc32 = 0;
  79. while (!eof($fh)) {
  80. read($fh, (my $chunk), CHUNK_SIZE);
  81. $bitstring .= $block_header_bitstring;
  82. my $crc32 = crc32(pack 'B*', unpack 'b*', $chunk);
  83. say STDERR "CRC32: $crc32";
  84. $crc32 = oct('0b' . int2bits_lsb($crc32, 32));
  85. say STDERR "Bzip2-CRC32: $crc32";
  86. $stream_crc32 = ($crc32 ^ (0xffffffff & ((0xffffffff & ($stream_crc32 << 1)) | (($stream_crc32 >> 31) & 0x1)))) & 0xffffffff;
  87. $bitstring .= int2bits($crc32, 32);
  88. $bitstring .= '0'; # not randomized
  89. my $rle4 = rle4_encode($chunk);
  90. ##say STDERR "RLE4: (@$rle4)";
  91. my ($bwt, $bwt_idx) = bwt_encode(symbols2string($rle4));
  92. $bitstring .= int2bits($bwt_idx, 24);
  93. my ($mtf, $alphabet) = mtf_encode($bwt);
  94. ##say STDERR "MTF: (@$mtf)";
  95. say STDERR "MTF Alphabet: (@$alphabet)";
  96. my ($populated, $marked) = encode_mtf_alphabet($alphabet);
  97. $bitstring .= int2bits($populated, 16);
  98. $bitstring .= int2bits_lsb($_, 16) for @$marked;
  99. my @zrle = reverse @{zrle_encode([reverse @$mtf])};
  100. ##say STDERR "ZRLE: @zrle";
  101. my $eob = scalar(@$alphabet) + 1; # end-of-block symbol
  102. say STDERR "EOB symbol: $eob";
  103. push @zrle, $eob;
  104. my ($dict) = huffman_from_symbols([@zrle, 0 .. $eob - 1]);
  105. my $num_sels = ceil(scalar(@zrle) / 50);
  106. say STDERR "Number of selectors: $num_sels";
  107. $bitstring .= int2bits(2, 3);
  108. $bitstring .= int2bits($num_sels, 15);
  109. $bitstring .= '0' x $num_sels;
  110. $bitstring .= encode_code_lengths($dict) x 2;
  111. $bitstring .= join('', @{$dict}{@zrle});
  112. }
  113. $bitstring .= $block_footer_bitstring;
  114. $bitstring .= int2bits($stream_crc32, 32);
  115. print pack("B*", $bitstring);