123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378 |
- #!/usr/bin/perl
- # Author: Trizen
- # Date: 13 January 2024
- # Edit: 14 April 2024
- # https://github.com/trizen
- # Decompress GZIP files (.gz).
- # DEFLATE's block type 0, 1 and 2 are all supported.
- # Reference:
- # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
- # https://youtube.com/watch?v=SJPvNi4HrWQ
- use 5.036;
- use lib qw(../lib);
- use List::Util qw(max);
- use Compression::Util qw(:all);
- local $Compression::Util::LZ_MAX_LEN = 258; # maximum match length in LZ parsing
- local $Compression::Util::LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing
- sub extract_block_type_0 ($in_fh, $buffer) {
- my $len = bits2int_lsb($in_fh, 16, $buffer);
- my $nlen = bits2int_lsb($in_fh, 16, $buffer);
- my $expected_nlen = (~$len) & 0xffff;
- if ($expected_nlen != $nlen) {
- die "[!] The ~length value is not correct: $nlen (actual) != $expected_nlen (expected)\n";
- }
- else {
- print STDERR ":: Chunk length: $len\n";
- }
- read($in_fh, (my $chunk), $len);
- return $chunk;
- }
- my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS) = make_deflate_tables();
- sub decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window) {
- my $data = '';
- my $code = '';
- my $max_ll_code_len = max(map { length($_) } keys %$rev_dict);
- my $max_dist_code_len = max(map { length($_) } keys %$dist_rev_dict);
- while (1) {
- $code .= read_bit_lsb($in_fh, $buffer);
- if (length($code) > $max_ll_code_len) {
- die "[!] Something went wrong: length of LL code `$code` is > $max_ll_code_len.\n";
- }
- if (exists($rev_dict->{$code})) {
- my $symbol = $rev_dict->{$code};
- if ($symbol <= 255) {
- $data .= chr($symbol);
- $$search_window .= chr($symbol);
- }
- elsif ($symbol == 256) { # end-of-block marker
- $code = '';
- last;
- }
- else { # LZSS decoding
- my ($length, $LL_bits) = @{$LENGTH_SYMBOLS->[$symbol - 256 + 1]};
- $length += bits2int_lsb($in_fh, $LL_bits, $buffer) if ($LL_bits > 0);
- my $dist_code = '';
- while (1) {
- $dist_code .= read_bit_lsb($in_fh, $buffer);
- if (length($dist_code) > $max_dist_code_len) {
- die "[!] Something went wrong: length of distance code `$dist_code` is > $max_dist_code_len.\n";
- }
- if (exists($dist_rev_dict->{$dist_code})) {
- last;
- }
- }
- my ($dist, $dist_bits) = @{$DISTANCE_SYMBOLS->[$dist_rev_dict->{$dist_code} + 1]};
- $dist += bits2int_lsb($in_fh, $dist_bits, $buffer) if ($dist_bits > 0);
- if ($dist == 1) {
- $$search_window .= substr($$search_window, -1) x $length;
- }
- elsif ($dist >= $length) { # non-overlapping matches
- $$search_window .= substr($$search_window, length($$search_window) - $dist, $length);
- }
- else { # overlapping matches
- foreach my $i (1 .. $length) {
- $$search_window .= substr($$search_window, length($$search_window) - $dist, 1);
- }
- }
- $data .= substr($$search_window, -$length);
- }
- $code = '';
- }
- }
- if ($code ne '') {
- die "[!] Something went wrong: code `$code` is not empty!\n";
- }
- return $data;
- }
- sub extract_block_type_1 ($in_fh, $buffer, $search_window) {
- state $rev_dict;
- state $dist_rev_dict;
- if (!defined($rev_dict)) {
- my @code_lengths = (0) x 288;
- foreach my $i (0 .. 143) {
- $code_lengths[$i] = 8;
- }
- foreach my $i (144 .. 255) {
- $code_lengths[$i] = 9;
- }
- foreach my $i (256 .. 279) {
- $code_lengths[$i] = 7;
- }
- foreach my $i (280 .. 287) {
- $code_lengths[$i] = 8;
- }
- (undef, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
- (undef, $dist_rev_dict) = huffman_from_code_lengths([(5) x 32]);
- }
- decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window);
- }
- sub decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $size) {
- my @lengths;
- my $code = '';
- while (1) {
- $code .= read_bit_lsb($in_fh, $buffer);
- if (length($code) > 7) {
- die "[!] Something went wrong: length of CL code `$code` is > 7.\n";
- }
- if (exists($CL_rev_dict->{$code})) {
- my $CL_symbol = $CL_rev_dict->{$code};
- if ($CL_symbol <= 15) {
- push @lengths, $CL_symbol;
- }
- elsif ($CL_symbol == 16) {
- push @lengths, ($lengths[-1]) x (3 + bits2int_lsb($in_fh, 2, $buffer));
- }
- elsif ($CL_symbol == 17) {
- push @lengths, (0) x (3 + bits2int_lsb($in_fh, 3, $buffer));
- }
- elsif ($CL_symbol == 18) {
- push @lengths, (0) x (11 + bits2int_lsb($in_fh, 7, $buffer));
- }
- else {
- die "Unknown CL symbol: $CL_symbol\n";
- }
- $code = '';
- last if (scalar(@lengths) >= $size);
- }
- }
- if (scalar(@lengths) != $size) {
- die "Something went wrong: size $size (expected) != ", scalar(@lengths);
- }
- if ($code ne '') {
- die "Something went wrong: code `$code` is not empty!";
- }
- return @lengths;
- }
- sub extract_block_type_2 ($in_fh, $buffer, $search_window) {
- # (5 bits) HLIT = (number of LL code entries present) - 257
- my $HLIT = bits2int_lsb($in_fh, 5, $buffer) + 257;
- # (5 bits) HDIST = (number of distance code entries present) - 1
- my $HDIST = bits2int_lsb($in_fh, 5, $buffer) + 1;
- # (4 bits) HCLEN = (number of CL code entries present) - 4
- my $HCLEN = bits2int_lsb($in_fh, 4, $buffer) + 4;
- say STDERR ":: Number of LL codes: $HLIT";
- say STDERR ":: Number of dist codes: $HDIST";
- say STDERR ":: Number of CL codes: $HCLEN";
- my @CL_code_lenghts = (0) x 19;
- my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
- foreach my $i (0 .. $HCLEN - 1) {
- $CL_code_lenghts[$CL_order[$i]] = bits2int_lsb($in_fh, 3, $buffer);
- }
- say STDERR ":: CL code lengths: @CL_code_lenghts";
- my (undef, $CL_rev_dict) = huffman_from_code_lengths(\@CL_code_lenghts);
- my @LL_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HLIT);
- my @dist_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HDIST);
- my (undef, $LL_rev_dict) = huffman_from_code_lengths(\@LL_CL_lengths);
- my (undef, $dist_rev_dict) = huffman_from_code_lengths(\@dist_CL_lengths);
- decode_huffman($in_fh, $buffer, $LL_rev_dict, $dist_rev_dict, $search_window);
- }
- sub extract ($in_fh, $output_file, $defined_output_file) {
- my $MAGIC = (getc($in_fh) // die "error") . (getc($in_fh) // die "error");
- if ($MAGIC ne pack('C*', 0x1f, 0x8b)) {
- die "Not a valid Gzip container!\n";
- }
- my $CM = getc($in_fh) // die "error"; # 0x08 = DEFLATE
- my $FLAGS = ord(getc($in_fh) // die "error"); # flags
- my $MTIME = join('', map { getc($in_fh) // die "error" } 1 .. 4); # modification time
- my $XFLAGS = getc($in_fh) // die "error"; # extra flags
- my $OS = getc($in_fh) // die "error"; # 0x03 = Unix
- if ($CM ne chr(0x08)) {
- die "Only DEFLATE compression method is supported (0x08)! Got: 0x", sprintf('%02x', ord($CM));
- }
- # Reference:
- # https://web.archive.org/web/20240221024029/https://forensics.wiki/gzip/
- my $has_filename = 0;
- my $has_comment = 0;
- my $has_header_checksum = 0;
- my $has_extra_fields = 0;
- if ($FLAGS & 0x08) {
- $has_filename = 1;
- }
- if ($FLAGS & 0x10) {
- $has_comment = 1;
- }
- if ($FLAGS & 0x02) {
- $has_header_checksum = 1;
- }
- if ($FLAGS & 0x04) {
- $has_extra_fields = 1;
- }
- if ($has_extra_fields) {
- my $size = bytes2int_lsb($in_fh, 2);
- read($in_fh, (my $extra_field_data), $size) // die "can't read extra field data: $!";
- say STDERR ":: Extra field data: $extra_field_data";
- }
- if ($has_filename) {
- my $filename = read_null_terminated($in_fh); # filename
- say STDERR ":: Filename: ", $filename;
- if (not $defined_output_file) {
- $output_file = $filename;
- }
- }
- if ($has_comment) {
- my $comment = read_null_terminated($in_fh); # comment
- say STDERR ":: Comment: $comment";
- }
- if ($has_header_checksum) {
- my $header_checksum = bytes2int_lsb($in_fh, 2);
- say STDERR ":: Header checksum: $header_checksum";
- }
- my $out_fh = ref($output_file) eq 'GLOB' ? $output_file : undef;
- if (!defined($out_fh)) {
- open $out_fh, '>:raw', $output_file or die "Can't create file <<$output_file>>: $!";
- }
- my $crc32 = 0;
- my $actual_length = 0;
- my $buffer = '';
- my $search_window = '';
- my $window_size = $Compression::Util::LZ_MAX_DIST;
- while (1) {
- my $is_last = read_bit_lsb($in_fh, \$buffer);
- my $block_type = bits2int_lsb($in_fh, 2, \$buffer);
- my $chunk = '';
- if ($block_type == 0) {
- say STDERR "\n:: Extracting block of type 0";
- $buffer = ''; # pad to a byte
- $chunk = extract_block_type_0($in_fh, \$buffer);
- $search_window .= $chunk;
- }
- elsif ($block_type == 1) {
- say STDERR "\n:: Extracting block of type 1";
- $chunk = extract_block_type_1($in_fh, \$buffer, \$search_window);
- }
- elsif ($block_type == 2) {
- say STDERR "\n:: Extracting block of type 2";
- $chunk = extract_block_type_2($in_fh, \$buffer, \$search_window);
- }
- else {
- die "[!] Unknown block of type: $block_type";
- }
- print $out_fh $chunk;
- $crc32 = crc32($chunk, $crc32);
- $actual_length += length($chunk);
- $search_window = substr($search_window, -$window_size) if (length($search_window) > 2 * $window_size);
- last if $is_last;
- }
- $buffer = ''; # discard any padding bits
- my $stored_crc32 = bits2int_lsb($in_fh, 32, \$buffer);
- my $actual_crc32 = $crc32;
- say STDERR '';
- if ($stored_crc32 != $actual_crc32) {
- print STDERR "[!] The CRC32 does not match: $actual_crc32 (actual) != $stored_crc32 (stored)\n";
- }
- else {
- print STDERR ":: CRC32 value: $actual_crc32\n";
- }
- my $stored_length = bits2int_lsb($in_fh, 32, \$buffer);
- if ($stored_length != $actual_length) {
- print STDERR "[!] The length does not match: $actual_length (actual) != $stored_length (stored)\n";
- }
- else {
- print STDERR ":: Total length: $actual_length\n";
- }
- if (eof($in_fh)) {
- print STDERR "\n:: Reached the end of the file.\n";
- }
- else {
- print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n";
- __SUB__->($in_fh, $out_fh, 1);
- }
- }
- if (-t STDIN) {
- my $input = $ARGV[0] // die "usage: $0 [input] [output.gz]\n";
- my $output = $ARGV[1] // ($input =~ s/\.gz\z//ir);
- open my $fh, '<:raw', $input or die "Can't open file <<$input>> for reading: $!";
- extract($fh, $output, defined($ARGV[1]));
- }
- else {
- extract(\*STDIN, \*STDOUT, 1);
- }
|