gzip_file_compression.pl 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 05 May 2024
  4. # https://github.com/trizen
  5. # A valid Gzip file compressor/decompressor, generating DEFLATE blocks of type 0, 1 or 2, whichever is smaller.
  6. # Reference:
  7. # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
  8. # https://youtube.com/watch?v=SJPvNi4HrWQ
  9. use 5.036;
  10. use lib qw(../lib);
  11. use File::Basename qw(basename);
  12. use Compression::Util qw(:all);
  13. use List::Util qw(all min max);
  14. use Getopt::Std qw(getopts);
  15. use constant {
  16. FORMAT => 'gz',
  17. CHUNK_SIZE => (1 << 15) - 1,
  18. };
  19. local $Compression::Util::LZ_MIN_LEN = 4; # minimum match length in LZ parsing
  20. local $Compression::Util::LZ_MAX_LEN = 258; # maximum match length in LZ parsing
  21. local $Compression::Util::LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing
  22. local $Compression::Util::LZ_MAX_CHAIN_LEN = 64; # how many recent positions to remember in LZ parsing
  23. my $MAGIC = pack('C*', 0x1f, 0x8b); # magic MIME type
  24. my $CM = chr(0x08); # 0x08 = DEFLATE
  25. my $FLAGS = chr(0x00); # flags
  26. my $MTIME = pack('C*', (0x00) x 4); # modification time
  27. my $XFLAGS = chr(0x00); # extra flags
  28. my $OS = chr(0x03); # 0x03 = Unix
  29. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables();
  30. sub usage ($code) {
  31. print <<"EOH";
  32. usage: $0 [options] [input file] [output file]
  33. options:
  34. -e : extract
  35. -i <filename> : input filename
  36. -o <filename> : output filename
  37. -r : rewrite output
  38. -h : this message
  39. examples:
  40. $0 document.txt
  41. $0 document.txt archive.${\FORMAT}
  42. $0 archive.${\FORMAT} document.txt
  43. $0 -e -i archive.${\FORMAT} -o document.txt
  44. EOH
  45. exit($code // 0);
  46. }
  47. #################
  48. # GZIP COMPRESSOR
  49. #################
  50. sub code_length_encoding ($dict) {
  51. my @lengths;
  52. foreach my $symbol (0 .. max(keys %$dict) // 0) {
  53. if (exists($dict->{$symbol})) {
  54. push @lengths, length($dict->{$symbol});
  55. }
  56. else {
  57. push @lengths, 0;
  58. }
  59. }
  60. my $size = scalar(@lengths);
  61. my $rl = run_length(\@lengths);
  62. my $offset_bits = '';
  63. my @CL_symbols;
  64. foreach my $pair (@$rl) {
  65. my ($v, $run) = @$pair;
  66. while ($v == 0 and $run >= 3) {
  67. if ($run >= 11) {
  68. push @CL_symbols, 18;
  69. $run -= 11;
  70. $offset_bits .= int2bits_lsb(min($run, 127), 7);
  71. $run -= 127;
  72. }
  73. if ($run >= 3 and $run < 11) {
  74. push @CL_symbols, 17;
  75. $run -= 3;
  76. $offset_bits .= int2bits_lsb(min($run, 7), 3);
  77. $run -= 7;
  78. }
  79. }
  80. if ($v == 0) {
  81. push(@CL_symbols, (0) x $run) if ($run > 0);
  82. next;
  83. }
  84. push @CL_symbols, $v;
  85. $run -= 1;
  86. while ($run >= 3) {
  87. push @CL_symbols, 16;
  88. $run -= 3;
  89. $offset_bits .= int2bits_lsb(min($run, 3), 2);
  90. $run -= 3;
  91. }
  92. push(@CL_symbols, ($v) x $run) if ($run > 0);
  93. }
  94. return (\@CL_symbols, $size, $offset_bits);
  95. }
  96. sub cl_encoded_bitstring ($cl_dict, $cl_symbols, $offset_bits) {
  97. my $bitstring = '';
  98. foreach my $cl_symbol (@$cl_symbols) {
  99. $bitstring .= $cl_dict->{$cl_symbol};
  100. if ($cl_symbol == 16) {
  101. $bitstring .= substr($offset_bits, 0, 2, '');
  102. }
  103. elsif ($cl_symbol == 17) {
  104. $bitstring .= substr($offset_bits, 0, 3, '');
  105. }
  106. elsif ($cl_symbol == 18) {
  107. $bitstring .= substr($offset_bits, 0, 7, '');
  108. }
  109. }
  110. return $bitstring;
  111. }
  112. sub create_cl_dictionary (@cl_symbols) {
  113. my @keys;
  114. my $freq = frequencies(\@cl_symbols);
  115. while (1) {
  116. my ($cl_dict) = huffman_from_freq($freq);
  117. # The CL codes must have at most 7 bits
  118. return $cl_dict if all { length($_) <= 7 } values %$cl_dict;
  119. if (scalar(@keys) == 0) {
  120. @keys = sort { $freq->{$b} <=> $freq->{$a} } keys %$freq;
  121. }
  122. # Scale down the frequencies and try again
  123. foreach my $k (@keys) {
  124. if ($freq->{$k} > 1) {
  125. $freq->{$k} >>= 1;
  126. }
  127. else {
  128. last;
  129. }
  130. }
  131. }
  132. }
  133. sub block_type_2 ($literals, $distances, $lengths) {
  134. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  135. my $bitstring = '01';
  136. my @len_symbols;
  137. my @dist_symbols;
  138. my $offset_bits = '';
  139. foreach my $k (0 .. $#$literals) {
  140. if ($lengths->[$k] == 0) {
  141. push @len_symbols, $literals->[$k];
  142. next;
  143. }
  144. my $len = $lengths->[$k];
  145. my $dist = $distances->[$k];
  146. {
  147. my $len_idx = $LENGTH_INDICES->[$len];
  148. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  149. push @len_symbols, [$len_idx + 256 - 1, $bits];
  150. $offset_bits .= int2bits_lsb($len - $min, $bits) if ($bits > 0);
  151. }
  152. {
  153. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  154. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  155. push @dist_symbols, [$dist_idx - 1, $bits];
  156. $offset_bits .= int2bits_lsb($dist - $min, $bits) if ($bits > 0);
  157. }
  158. }
  159. push @len_symbols, 256; # end-of-block marker
  160. my ($dict) = huffman_from_symbols([map { ref($_) eq 'ARRAY' ? $_->[0] : $_ } @len_symbols]);
  161. my ($dist_dict) = huffman_from_symbols([map { $_->[0] } @dist_symbols]);
  162. my ($LL_code_lengths, $LL_cl_len, $LL_offset_bits) = code_length_encoding($dict);
  163. my ($distance_code_lengths, $distance_cl_len, $distance_offset_bits) = code_length_encoding($dist_dict);
  164. my $cl_dict = create_cl_dictionary(@$LL_code_lengths, @$distance_code_lengths);
  165. my @CL_code_lenghts;
  166. foreach my $symbol (0 .. 18) {
  167. if (exists($cl_dict->{$symbol})) {
  168. push @CL_code_lenghts, length($cl_dict->{$symbol});
  169. }
  170. else {
  171. push @CL_code_lenghts, 0;
  172. }
  173. }
  174. # Put the CL codes in the required order
  175. @CL_code_lenghts = @CL_code_lenghts[@CL_order];
  176. while (scalar(@CL_code_lenghts) > 4 and $CL_code_lenghts[-1] == 0) {
  177. pop @CL_code_lenghts;
  178. }
  179. my $CL_code_lengths_bitstring = join('', map { int2bits_lsb($_, 3) } @CL_code_lenghts);
  180. my $LL_code_lengths_bitstring = cl_encoded_bitstring($cl_dict, $LL_code_lengths, $LL_offset_bits);
  181. my $distance_code_lengths_bitstring = cl_encoded_bitstring($cl_dict, $distance_code_lengths, $distance_offset_bits);
  182. # (5 bits) HLIT = (number of LL code entries present) - 257
  183. my $HLIT = $LL_cl_len - 257;
  184. # (5 bits) HDIST = (number of distance code entries present) - 1
  185. my $HDIST = $distance_cl_len - 1;
  186. # (4 bits) HCLEN = (number of CL code entries present) - 4
  187. my $HCLEN = scalar(@CL_code_lenghts) - 4;
  188. $bitstring .= int2bits_lsb($HLIT, 5);
  189. $bitstring .= int2bits_lsb($HDIST, 5);
  190. $bitstring .= int2bits_lsb($HCLEN, 4);
  191. $bitstring .= $CL_code_lengths_bitstring;
  192. $bitstring .= $LL_code_lengths_bitstring;
  193. $bitstring .= $distance_code_lengths_bitstring;
  194. foreach my $symbol (@len_symbols) {
  195. if (ref($symbol) eq 'ARRAY') {
  196. my ($len, $len_offset) = @$symbol;
  197. $bitstring .= $dict->{$len};
  198. $bitstring .= substr($offset_bits, 0, $len_offset, '') if ($len_offset > 0);
  199. my ($dist, $dist_offset) = @{shift(@dist_symbols)};
  200. $bitstring .= $dist_dict->{$dist};
  201. $bitstring .= substr($offset_bits, 0, $dist_offset, '') if ($dist_offset > 0);
  202. }
  203. else {
  204. $bitstring .= $dict->{$symbol};
  205. }
  206. }
  207. return $bitstring;
  208. }
  209. sub block_type_1 ($literals, $distances, $lengths) {
  210. state $dict;
  211. state $dist_dict;
  212. if (!defined($dict)) {
  213. my @code_lengths = (0) x 288;
  214. foreach my $i (0 .. 143) {
  215. $code_lengths[$i] = 8;
  216. }
  217. foreach my $i (144 .. 255) {
  218. $code_lengths[$i] = 9;
  219. }
  220. foreach my $i (256 .. 279) {
  221. $code_lengths[$i] = 7;
  222. }
  223. foreach my $i (280 .. 287) {
  224. $code_lengths[$i] = 8;
  225. }
  226. ($dict) = huffman_from_code_lengths(\@code_lengths);
  227. ($dist_dict) = huffman_from_code_lengths([(5) x 32]);
  228. }
  229. my $bitstring = '10';
  230. foreach my $k (0 .. $#$literals) {
  231. if ($lengths->[$k] == 0) {
  232. $bitstring .= $dict->{$literals->[$k]};
  233. next;
  234. }
  235. my $len = $lengths->[$k];
  236. my $dist = $distances->[$k];
  237. {
  238. my $len_idx = $LENGTH_INDICES->[$len];
  239. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  240. $bitstring .= $dict->{$len_idx + 256 - 1};
  241. $bitstring .= int2bits_lsb($len - $min, $bits) if ($bits > 0);
  242. }
  243. {
  244. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  245. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  246. $bitstring .= $dist_dict->{$dist_idx - 1};
  247. $bitstring .= int2bits_lsb($dist - $min, $bits) if ($bits > 0);
  248. }
  249. }
  250. $bitstring .= $dict->{256}; # end-of-block symbol
  251. return $bitstring;
  252. }
  253. sub block_type_0($chunk) {
  254. my $chunk_len = length($chunk);
  255. my $len = int2bits_lsb($chunk_len, 16);
  256. my $nlen = int2bits_lsb((~$chunk_len) & 0xffff, 16);
  257. $len . $nlen;
  258. }
  259. sub my_gzip_compress ($in_fh, $out_fh) {
  260. print $out_fh $MAGIC, $CM, $FLAGS, $MTIME, $XFLAGS, $OS;
  261. my $total_length = 0;
  262. my $crc32 = 0;
  263. my $bitstring = '';
  264. if (eof($in_fh)) { # empty file
  265. $bitstring = '1' . '10' . '0000000';
  266. }
  267. while (read($in_fh, (my $chunk), CHUNK_SIZE)) {
  268. $crc32 = crc32($chunk, $crc32);
  269. $total_length += length($chunk);
  270. my ($literals, $distances, $lengths) = lzss_encode($chunk);
  271. $bitstring .= eof($in_fh) ? '1' : '0';
  272. my $bt1_bitstring = block_type_1($literals, $distances, $lengths);
  273. # When block type 1 is larger than the input, then we have random uncompressible data: use block type 0
  274. if ((length($bt1_bitstring) >> 3) > length($chunk) + 5) {
  275. say STDERR ":: Using block type: 0";
  276. $bitstring .= '00';
  277. print $out_fh pack('b*', $bitstring); # pads to a byte
  278. print $out_fh pack('b*', block_type_0($chunk));
  279. print $out_fh $chunk;
  280. $bitstring = '';
  281. next;
  282. }
  283. my $bt2_bitstring = block_type_2($literals, $distances, $lengths);
  284. # When block type 2 is larger than block type 1, then we may have very small data
  285. if (length($bt2_bitstring) > length($bt1_bitstring)) {
  286. say STDERR ":: Using block type: 1";
  287. $bitstring .= $bt1_bitstring;
  288. }
  289. else {
  290. say STDERR ":: Using block type: 2";
  291. $bitstring .= $bt2_bitstring;
  292. }
  293. print $out_fh pack('b*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), ''));
  294. }
  295. if ($bitstring ne '') {
  296. print $out_fh pack('b*', $bitstring);
  297. }
  298. print $out_fh pack('b*', int2bits_lsb($crc32, 32));
  299. print $out_fh pack('b*', int2bits_lsb($total_length, 32));
  300. return 1;
  301. }
  302. ###################
  303. # GZIP DECOMPRESSOR
  304. ###################
  305. sub extract_block_type_0 ($in_fh, $buffer) {
  306. my $len = bits2int_lsb($in_fh, 16, $buffer);
  307. my $nlen = bits2int_lsb($in_fh, 16, $buffer);
  308. my $expected_nlen = (~$len) & 0xffff;
  309. if ($expected_nlen != $nlen) {
  310. die "[!] The ~length value is not correct: $nlen (actual) != $expected_nlen (expected)\n";
  311. }
  312. else {
  313. print STDERR ":: Chunk length: $len\n";
  314. }
  315. read($in_fh, (my $chunk), $len);
  316. return $chunk;
  317. }
  318. sub decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window) {
  319. my $data = '';
  320. my $code = '';
  321. my $max_ll_code_len = max(map { length($_) } keys %$rev_dict);
  322. my $max_dist_code_len = max(map { length($_) } keys %$dist_rev_dict);
  323. while (1) {
  324. $code .= read_bit_lsb($in_fh, $buffer);
  325. if (length($code) > $max_ll_code_len) {
  326. die "[!] Something went wrong: length of LL code `$code` is > $max_ll_code_len.\n";
  327. }
  328. if (exists($rev_dict->{$code})) {
  329. my $symbol = $rev_dict->{$code};
  330. if ($symbol <= 255) {
  331. $data .= chr($symbol);
  332. $$search_window .= chr($symbol);
  333. }
  334. elsif ($symbol == 256) { # end-of-block marker
  335. $code = '';
  336. last;
  337. }
  338. else { # LZSS decoding
  339. my ($length, $LL_bits) = @{$LENGTH_SYMBOLS->[$symbol - 256 + 1]};
  340. $length += bits2int_lsb($in_fh, $LL_bits, $buffer) if ($LL_bits > 0);
  341. my $dist_code = '';
  342. while (1) {
  343. $dist_code .= read_bit_lsb($in_fh, $buffer);
  344. if (length($dist_code) > $max_dist_code_len) {
  345. die "[!] Something went wrong: length of distance code `$dist_code` is > $max_dist_code_len.\n";
  346. }
  347. if (exists($dist_rev_dict->{$dist_code})) {
  348. last;
  349. }
  350. }
  351. my ($dist, $dist_bits) = @{$DISTANCE_SYMBOLS->[$dist_rev_dict->{$dist_code} + 1]};
  352. $dist += bits2int_lsb($in_fh, $dist_bits, $buffer) if ($dist_bits > 0);
  353. if ($dist == 1) {
  354. $$search_window .= substr($$search_window, -1) x $length;
  355. }
  356. elsif ($dist >= $length) { # non-overlapping matches
  357. $$search_window .= substr($$search_window, length($$search_window) - $dist, $length);
  358. }
  359. else { # overlapping matches
  360. foreach my $i (1 .. $length) {
  361. $$search_window .= substr($$search_window, length($$search_window) - $dist, 1);
  362. }
  363. }
  364. $data .= substr($$search_window, -$length);
  365. }
  366. $code = '';
  367. }
  368. }
  369. if ($code ne '') {
  370. die "[!] Something went wrong: code `$code` is not empty!\n";
  371. }
  372. return $data;
  373. }
  374. sub extract_block_type_1 ($in_fh, $buffer, $search_window) {
  375. state $rev_dict;
  376. state $dist_rev_dict;
  377. if (!defined($rev_dict)) {
  378. my @code_lengths = (0) x 288;
  379. foreach my $i (0 .. 143) {
  380. $code_lengths[$i] = 8;
  381. }
  382. foreach my $i (144 .. 255) {
  383. $code_lengths[$i] = 9;
  384. }
  385. foreach my $i (256 .. 279) {
  386. $code_lengths[$i] = 7;
  387. }
  388. foreach my $i (280 .. 287) {
  389. $code_lengths[$i] = 8;
  390. }
  391. (undef, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
  392. (undef, $dist_rev_dict) = huffman_from_code_lengths([(5) x 32]);
  393. }
  394. decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window);
  395. }
  396. sub decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $size) {
  397. my @lengths;
  398. my $code = '';
  399. while (1) {
  400. $code .= read_bit_lsb($in_fh, $buffer);
  401. if (length($code) > 7) {
  402. die "[!] Something went wrong: length of CL code `$code` is > 7.\n";
  403. }
  404. if (exists($CL_rev_dict->{$code})) {
  405. my $CL_symbol = $CL_rev_dict->{$code};
  406. if ($CL_symbol <= 15) {
  407. push @lengths, $CL_symbol;
  408. }
  409. elsif ($CL_symbol == 16) {
  410. push @lengths, ($lengths[-1]) x (3 + bits2int_lsb($in_fh, 2, $buffer));
  411. }
  412. elsif ($CL_symbol == 17) {
  413. push @lengths, (0) x (3 + bits2int_lsb($in_fh, 3, $buffer));
  414. }
  415. elsif ($CL_symbol == 18) {
  416. push @lengths, (0) x (11 + bits2int_lsb($in_fh, 7, $buffer));
  417. }
  418. else {
  419. die "Unknown CL symbol: $CL_symbol\n";
  420. }
  421. $code = '';
  422. last if (scalar(@lengths) >= $size);
  423. }
  424. }
  425. if (scalar(@lengths) != $size) {
  426. die "Something went wrong: size $size (expected) != ", scalar(@lengths);
  427. }
  428. if ($code ne '') {
  429. die "Something went wrong: code `$code` is not empty!";
  430. }
  431. return @lengths;
  432. }
  433. sub extract_block_type_2 ($in_fh, $buffer, $search_window) {
  434. # (5 bits) HLIT = (number of LL code entries present) - 257
  435. my $HLIT = bits2int_lsb($in_fh, 5, $buffer) + 257;
  436. # (5 bits) HDIST = (number of distance code entries present) - 1
  437. my $HDIST = bits2int_lsb($in_fh, 5, $buffer) + 1;
  438. # (4 bits) HCLEN = (number of CL code entries present) - 4
  439. my $HCLEN = bits2int_lsb($in_fh, 4, $buffer) + 4;
  440. say STDERR ":: Number of LL codes: $HLIT";
  441. say STDERR ":: Number of dist codes: $HDIST";
  442. say STDERR ":: Number of CL codes: $HCLEN";
  443. my @CL_code_lenghts = (0) x 19;
  444. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  445. foreach my $i (0 .. $HCLEN - 1) {
  446. $CL_code_lenghts[$CL_order[$i]] = bits2int_lsb($in_fh, 3, $buffer);
  447. }
  448. say STDERR ":: CL code lengths: @CL_code_lenghts";
  449. my (undef, $CL_rev_dict) = huffman_from_code_lengths(\@CL_code_lenghts);
  450. my @LL_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HLIT);
  451. my @dist_CL_lengths = decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HDIST);
  452. my (undef, $LL_rev_dict) = huffman_from_code_lengths(\@LL_CL_lengths);
  453. my (undef, $dist_rev_dict) = huffman_from_code_lengths(\@dist_CL_lengths);
  454. decode_huffman($in_fh, $buffer, $LL_rev_dict, $dist_rev_dict, $search_window);
  455. }
  456. sub my_gzip_decompress ($in_fh, $out_fh) {
  457. my $MAGIC = (getc($in_fh) // die "error") . (getc($in_fh) // die "error");
  458. if ($MAGIC ne pack('C*', 0x1f, 0x8b)) {
  459. die "Not a valid Gzip container!\n";
  460. }
  461. my $CM = getc($in_fh) // die "error"; # 0x08 = DEFLATE
  462. my $FLAGS = ord(getc($in_fh) // die "error"); # flags
  463. my $MTIME = join('', map { getc($in_fh) // die "error" } 1 .. 4); # modification time
  464. my $XFLAGS = getc($in_fh) // die "error"; # extra flags
  465. my $OS = getc($in_fh) // die "error"; # 0x03 = Unix
  466. if ($CM ne chr(0x08)) {
  467. die "Only DEFLATE compression method is supported (0x08)! Got: 0x", sprintf('%02x', ord($CM));
  468. }
  469. # Reference:
  470. # https://web.archive.org/web/20240221024029/https://forensics.wiki/gzip/
  471. my $has_filename = 0;
  472. my $has_comment = 0;
  473. my $has_header_checksum = 0;
  474. my $has_extra_fields = 0;
  475. if ($FLAGS & 0x08) {
  476. $has_filename = 1;
  477. }
  478. if ($FLAGS & 0x10) {
  479. $has_comment = 1;
  480. }
  481. if ($FLAGS & 0x02) {
  482. $has_header_checksum = 1;
  483. }
  484. if ($FLAGS & 0x04) {
  485. $has_extra_fields = 1;
  486. }
  487. if ($has_extra_fields) {
  488. my $size = bytes2int_lsb($in_fh, 2);
  489. read($in_fh, (my $extra_field_data), $size) // die "can't read extra field data: $!";
  490. say STDERR ":: Extra field data: $extra_field_data";
  491. }
  492. if ($has_filename) {
  493. my $filename = read_null_terminated($in_fh); # filename
  494. say STDERR ":: Filename: $filename";
  495. }
  496. if ($has_comment) {
  497. my $comment = read_null_terminated($in_fh); # comment
  498. say STDERR ":: Comment: $comment";
  499. }
  500. if ($has_header_checksum) {
  501. my $header_checksum = bytes2int_lsb($in_fh, 2);
  502. say STDERR ":: Header checksum: $header_checksum";
  503. }
  504. my $crc32 = 0;
  505. my $actual_length = 0;
  506. my $buffer = '';
  507. my $search_window = '';
  508. my $window_size = $Compression::Util::LZ_MAX_DIST;
  509. while (1) {
  510. my $is_last = read_bit_lsb($in_fh, \$buffer);
  511. my $block_type = bits2int_lsb($in_fh, 2, \$buffer);
  512. my $chunk = '';
  513. if ($block_type == 0) {
  514. say STDERR "\n:: Extracting block of type 0";
  515. $buffer = ''; # pad to a byte
  516. $chunk = extract_block_type_0($in_fh, \$buffer);
  517. $search_window .= $chunk;
  518. }
  519. elsif ($block_type == 1) {
  520. say STDERR "\n:: Extracting block of type 1";
  521. $chunk = extract_block_type_1($in_fh, \$buffer, \$search_window);
  522. }
  523. elsif ($block_type == 2) {
  524. say STDERR "\n:: Extracting block of type 2";
  525. $chunk = extract_block_type_2($in_fh, \$buffer, \$search_window);
  526. }
  527. else {
  528. die "[!] Unknown block of type: $block_type";
  529. }
  530. print $out_fh $chunk;
  531. $crc32 = crc32($chunk, $crc32);
  532. $actual_length += length($chunk);
  533. $search_window = substr($search_window, -$window_size) if (length($search_window) > 2 * $window_size);
  534. last if $is_last;
  535. }
  536. $buffer = ''; # discard any padding bits
  537. my $stored_crc32 = bits2int_lsb($in_fh, 32, \$buffer);
  538. my $actual_crc32 = $crc32;
  539. say STDERR '';
  540. if ($stored_crc32 != $actual_crc32) {
  541. print STDERR "[!] The CRC32 does not match: $actual_crc32 (actual) != $stored_crc32 (stored)\n";
  542. }
  543. else {
  544. print STDERR ":: CRC32 value: $actual_crc32\n";
  545. }
  546. my $stored_length = bits2int_lsb($in_fh, 32, \$buffer);
  547. if ($stored_length != $actual_length) {
  548. print STDERR "[!] The length does not match: $actual_length (actual) != $stored_length (stored)\n";
  549. }
  550. else {
  551. print STDERR ":: Total length: $actual_length\n";
  552. }
  553. if (eof($in_fh)) {
  554. print STDERR "\n:: Reached the end of the file.\n";
  555. }
  556. else {
  557. print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n";
  558. __SUB__->($in_fh, $out_fh);
  559. }
  560. }
  561. sub main {
  562. my %opt;
  563. getopts('ei:o:vhr', \%opt);
  564. $opt{h} && usage(0);
  565. $opt{v} && version();
  566. my ($input, $output) = @ARGV;
  567. $input //= $opt{i} // usage(2);
  568. $output //= $opt{o};
  569. my $ext = qr{\.${\FORMAT}\z}io;
  570. if ($opt{e} || $input =~ $ext) {
  571. if (not defined $output) {
  572. ($output = basename($input)) =~ s{$ext}{}
  573. || die "$0: no output file specified!\n";
  574. }
  575. if (not $opt{r} and -e $output) {
  576. print "'$output' already exists! -- Replace? [y/N] ";
  577. <STDIN> =~ /^y/i || exit 17;
  578. }
  579. open my $in_fh, '<:raw', $input
  580. or die "Can't open file <<$input>> for reading: $!";
  581. open my $out_fh, '>:raw', $output
  582. or die "Can't open file <<$output>> for writing: $!";
  583. my_gzip_decompress($in_fh, $out_fh)
  584. || die "$0: error: decompression failed!\n";
  585. }
  586. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  587. $output //= basename($input) . '.' . FORMAT;
  588. open my $in_fh, '<:raw', $input
  589. or die "Can't open file <<$input>> for reading: $!";
  590. open my $out_fh, '>:raw', $output
  591. or die "Can't open file <<$output>> for writing: $!";
  592. my_gzip_compress($in_fh, $out_fh)
  593. || die "$0: error: compression failed!\n";
  594. }
  595. else {
  596. warn "$0: don't know what to do...\n";
  597. usage(1);
  598. }
  599. }
  600. main();
  601. exit(0);