gzip_decompressor.pl 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 13 January 2024
  4. # Edit: 14 April 2024
  5. # https://github.com/trizen
  6. # Decompress GZIP files (.gz).
  7. # DEFLATE's block type 0, 1 and 2 are all supported.
  8. # Reference:
  9. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  10. # https://youtube.com/watch?v=SJPvNi4HrWQ
  11. use 5.036;
  12. use lib qw(../lib);
  13. use List::Util qw(max);
  14. use Compression::Util qw(:all);
  15. local $Compression::Util::LZ_MAX_LEN = 258; # maximum match length in LZ parsing
  16. local $Compression::Util::LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing
  17. sub extract_block_type_0 ($in_fh, $buffer) {
  18. my $len = bits2int_lsb($in_fh, 16, $buffer);
  19. my $nlen = bits2int_lsb($in_fh, 16, $buffer);
  20. my $expected_nlen = (~$len) & 0xffff;
  21. if ($expected_nlen != $nlen) {
  22. die "[!] The ~length value is not correct: $nlen (actual) != $expected_nlen (expected)\n";
  23. }
  24. else {
  25. print STDERR ":: Chunk length: $len\n";
  26. }
  27. read($in_fh, (my $chunk), $len);
  28. return $chunk;
  29. }
  30. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS) = make_deflate_tables();
  31. sub decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window) {
  32. my $data = '';
  33. my $code = '';
  34. my $max_ll_code_len = max(map { length($_) } keys %$rev_dict);
  35. my $max_dist_code_len = max(map { length($_) } keys %$dist_rev_dict);
  36. while (1) {
  37. $code .= read_bit_lsb($in_fh, $buffer);
  38. if (length($code) > $max_ll_code_len) {
  39. die "[!] Something went wrong: length of LL code `$code` is > $max_ll_code_len.\n";
  40. }
  41. if (exists($rev_dict->{$code})) {
  42. my $symbol = $rev_dict->{$code};
  43. if ($symbol <= 255) {
  44. $data .= chr($symbol);
  45. $$search_window .= chr($symbol);
  46. }
  47. elsif ($symbol == 256) { # end-of-block marker
  48. $code = '';
  49. last;
  50. }
  51. else { # LZSS decoding
  52. my ($length, $LL_bits) = @{$LENGTH_SYMBOLS->[$symbol - 256 + 1]};
  53. $length += bits2int_lsb($in_fh, $LL_bits, $buffer) if ($LL_bits > 0);
  54. my $dist_code = '';
  55. while (1) {
  56. $dist_code .= read_bit_lsb($in_fh, $buffer);
  57. if (length($dist_code) > $max_dist_code_len) {
  58. die "[!] Something went wrong: length of distance code `$dist_code` is > $max_dist_code_len.\n";
  59. }
  60. if (exists($dist_rev_dict->{$dist_code})) {
  61. last;
  62. }
  63. }
  64. my ($dist, $dist_bits) = @{$DISTANCE_SYMBOLS->[$dist_rev_dict->{$dist_code} + 1]};
  65. $dist += bits2int_lsb($in_fh, $dist_bits, $buffer) if ($dist_bits > 0);
  66. if ($dist == 1) {
  67. $$search_window .= substr($$search_window, -1) x $length;
  68. }
  69. elsif ($dist >= $length) { # non-overlapping matches
  70. $$search_window .= substr($$search_window, length($$search_window) - $dist, $length);
  71. }
  72. else { # overlapping matches
  73. foreach my $i (1 .. $length) {
  74. $$search_window .= substr($$search_window, length($$search_window) - $dist, 1);
  75. }
  76. }
  77. $data .= substr($$search_window, -$length);
  78. }
  79. $code = '';
  80. }
  81. }
  82. if ($code ne '') {
  83. die "[!] Something went wrong: code `$code` is not empty!\n";
  84. }
  85. return $data;
  86. }
  87. sub extract_block_type_1 ($in_fh, $buffer, $search_window) {
  88. state $rev_dict;
  89. state $dist_rev_dict;
  90. if (!defined($rev_dict)) {
  91. my @code_lengths = (0) x 288;
  92. foreach my $i (0 .. 143) {
  93. $code_lengths[$i] = 8;
  94. }
  95. foreach my $i (144 .. 255) {
  96. $code_lengths[$i] = 9;
  97. }
  98. foreach my $i (256 .. 279) {
  99. $code_lengths[$i] = 7;
  100. }
  101. foreach my $i (280 .. 287) {
  102. $code_lengths[$i] = 8;
  103. }
  104. (undef, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
  105. (undef, $dist_rev_dict) = huffman_from_code_lengths([(5) x 32]);
  106. }
  107. decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window);
  108. }
  109. sub decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $size) {
  110. my @lengths;
  111. my $code = '';
  112. while (1) {
  113. $code .= read_bit_lsb($in_fh, $buffer);
  114. if (length($code) > 7) {
  115. die "[!] Something went wrong: length of CL code `$code` is > 7.\n";
  116. }
  117. if (exists($CL_rev_dict->{$code})) {
  118. my $CL_symbol = $CL_rev_dict->{$code};
  119. if ($CL_symbol <= 15) {
  120. push @lengths, $CL_symbol;
  121. }
  122. elsif ($CL_symbol == 16) {
  123. push @lengths, ($lengths[-1]) x (3 + bits2int_lsb($in_fh, 2, $buffer));
  124. }
  125. elsif ($CL_symbol == 17) {
  126. push @lengths, (0) x (3 + bits2int_lsb($in_fh, 3, $buffer));
  127. }
  128. elsif ($CL_symbol == 18) {
  129. push @lengths, (0) x (11 + bits2int_lsb($in_fh, 7, $buffer));
  130. }
  131. else {
  132. die "Unknown CL symbol: $CL_symbol\n";
  133. }
  134. $code = '';
  135. last if (scalar(@lengths) >= $size);
  136. }
  137. }
  138. if (scalar(@lengths) != $size) {
  139. die "Something went wrong: size $size (expected) != ", scalar(@lengths);
  140. }
  141. if ($code ne '') {
  142. die "Something went wrong: code `$code` is not empty!";
  143. }
  144. return @lengths;
  145. }
  146. sub extract_block_type_2 ($in_fh, $buffer, $search_window) {
  147. # (5 bits) HLIT = (number of LL code entries present) - 257
  148. my $HLIT = bits2int_lsb($in_fh, 5, $buffer) + 257;
  149. # (5 bits) HDIST = (number of distance code entries present) - 1
  150. my $HDIST = bits2int_lsb($in_fh, 5, $buffer) + 1;
  151. # (4 bits) HCLEN = (number of CL code entries present) - 4
  152. my $HCLEN = bits2int_lsb($in_fh, 4, $buffer) + 4;
  153. say STDERR ":: Number of LL codes: $HLIT";
  154. say STDERR ":: Number of dist codes: $HDIST";
  155. say STDERR ":: Number of CL codes: $HCLEN";
  156. my @CL_code_lenghts = (0) x 19;
  157. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  158. foreach my $i (0 .. $HCLEN - 1) {
  159. $CL_code_lenghts[$CL_order[$i]] = bits2int_lsb($in_fh, 3, $buffer);
  160. }
  161. say STDERR ":: CL code lengths: @CL_code_lenghts";
  162. my (undef, $CL_rev_dict) = huffman_from_code_lengths(\@CL_code_lenghts);
  163. my @LL_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HLIT);
  164. my @dist_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HDIST);
  165. my (undef, $LL_rev_dict) = huffman_from_code_lengths(\@LL_CL_lengths);
  166. my (undef, $dist_rev_dict) = huffman_from_code_lengths(\@dist_CL_lengths);
  167. decode_huffman($in_fh, $buffer, $LL_rev_dict, $dist_rev_dict, $search_window);
  168. }
  169. sub extract ($in_fh, $output_file, $defined_output_file) {
  170. my $MAGIC = (getc($in_fh) // die "error") . (getc($in_fh) // die "error");
  171. if ($MAGIC ne pack('C*', 0x1f, 0x8b)) {
  172. die "Not a valid Gzip container!\n";
  173. }
  174. my $CM = getc($in_fh) // die "error"; # 0x08 = DEFLATE
  175. my $FLAGS = ord(getc($in_fh) // die "error"); # flags
  176. my $MTIME = join('', map { getc($in_fh) // die "error" } 1 .. 4); # modification time
  177. my $XFLAGS = getc($in_fh) // die "error"; # extra flags
  178. my $OS = getc($in_fh) // die "error"; # 0x03 = Unix
  179. if ($CM ne chr(0x08)) {
  180. die "Only DEFLATE compression method is supported (0x08)! Got: 0x", sprintf('%02x', ord($CM));
  181. }
  182. # Reference:
  183. # https://web.archive.org/web/20240221024029/https://forensics.wiki/gzip/
  184. my $has_filename = 0;
  185. my $has_comment = 0;
  186. my $has_header_checksum = 0;
  187. my $has_extra_fields = 0;
  188. if ($FLAGS & 0x08) {
  189. $has_filename = 1;
  190. }
  191. if ($FLAGS & 0x10) {
  192. $has_comment = 1;
  193. }
  194. if ($FLAGS & 0x02) {
  195. $has_header_checksum = 1;
  196. }
  197. if ($FLAGS & 0x04) {
  198. $has_extra_fields = 1;
  199. }
  200. if ($has_extra_fields) {
  201. my $size = bytes2int_lsb($in_fh, 2);
  202. read($in_fh, (my $extra_field_data), $size) // die "can't read extra field data: $!";
  203. say STDERR ":: Extra field data: $extra_field_data";
  204. }
  205. if ($has_filename) {
  206. my $filename = read_null_terminated($in_fh); # filename
  207. say STDERR ":: Filename: ", $filename;
  208. if (not $defined_output_file) {
  209. $output_file = $filename;
  210. }
  211. }
  212. if ($has_comment) {
  213. my $comment = read_null_terminated($in_fh); # comment
  214. say STDERR ":: Comment: $comment";
  215. }
  216. if ($has_header_checksum) {
  217. my $header_checksum = bytes2int_lsb($in_fh, 2);
  218. say STDERR ":: Header checksum: $header_checksum";
  219. }
  220. my $out_fh = ref($output_file) eq 'GLOB' ? $output_file : undef;
  221. if (!defined($out_fh)) {
  222. open $out_fh, '>:raw', $output_file or die "Can't create file <<$output_file>>: $!";
  223. }
  224. my $crc32 = 0;
  225. my $actual_length = 0;
  226. my $buffer = '';
  227. my $search_window = '';
  228. my $window_size = $Compression::Util::LZ_MAX_DIST;
  229. while (1) {
  230. my $is_last = read_bit_lsb($in_fh, \$buffer);
  231. my $block_type = bits2int_lsb($in_fh, 2, \$buffer);
  232. my $chunk = '';
  233. if ($block_type == 0) {
  234. say STDERR "\n:: Extracting block of type 0";
  235. $buffer = ''; # pad to a byte
  236. $chunk = extract_block_type_0($in_fh, \$buffer);
  237. $search_window .= $chunk;
  238. }
  239. elsif ($block_type == 1) {
  240. say STDERR "\n:: Extracting block of type 1";
  241. $chunk = extract_block_type_1($in_fh, \$buffer, \$search_window);
  242. }
  243. elsif ($block_type == 2) {
  244. say STDERR "\n:: Extracting block of type 2";
  245. $chunk = extract_block_type_2($in_fh, \$buffer, \$search_window);
  246. }
  247. else {
  248. die "[!] Unknown block of type: $block_type";
  249. }
  250. print $out_fh $chunk;
  251. $crc32 = crc32($chunk, $crc32);
  252. $actual_length += length($chunk);
  253. $search_window = substr($search_window, -$window_size) if (length($search_window) > 2 * $window_size);
  254. last if $is_last;
  255. }
  256. $buffer = ''; # discard any padding bits
  257. my $stored_crc32 = bits2int_lsb($in_fh, 32, \$buffer);
  258. my $actual_crc32 = $crc32;
  259. say STDERR '';
  260. if ($stored_crc32 != $actual_crc32) {
  261. print STDERR "[!] The CRC32 does not match: $actual_crc32 (actual) != $stored_crc32 (stored)\n";
  262. }
  263. else {
  264. print STDERR ":: CRC32 value: $actual_crc32\n";
  265. }
  266. my $stored_length = bits2int_lsb($in_fh, 32, \$buffer);
  267. if ($stored_length != $actual_length) {
  268. print STDERR "[!] The length does not match: $actual_length (actual) != $stored_length (stored)\n";
  269. }
  270. else {
  271. print STDERR ":: Total length: $actual_length\n";
  272. }
  273. if (eof($in_fh)) {
  274. print STDERR "\n:: Reached the end of the file.\n";
  275. }
  276. else {
  277. print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n";
  278. __SUB__->($in_fh, $out_fh, 1);
  279. }
  280. }
  281. if (-t STDIN) {
  282. my $input = $ARGV[0] // die "usage: $0 [input] [output.gz]\n";
  283. my $output = $ARGV[1] // ($input =~ s/\.gz\z//ir);
  284. open my $fh, '<:raw', $input or die "Can't open file <<$input>> for reading: $!";
  285. extract($fh, $output, defined($ARGV[1]));
  286. }
  287. else {
  288. extract(\*STDIN, \*STDOUT, 1);
  289. }