gzip-member.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. ;;; Disarchive
  2. ;;; Copyright © 2020, 2021 Timothy Sample <samplet@ngyro.com>
  3. ;;;
  4. ;;; This file is part of Disarchive.
  5. ;;;
  6. ;;; Disarchive is free software: you can redistribute it and/or modify
  7. ;;; it under the terms of the GNU General Public License as published by
  8. ;;; the Free Software Foundation, either version 3 of the License, or
  9. ;;; (at your option) any later version.
  10. ;;;
  11. ;;; Disarchive is distributed in the hope that it will be useful,
  12. ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. ;;; GNU General Public License for more details.
  15. ;;;
  16. ;;; You should have received a copy of the GNU General Public License
  17. ;;; along with Disarchive. If not, see <http://www.gnu.org/licenses/>.
  18. (define-module (disarchive assemblers gzip-member)
  19. #:use-module (disarchive assemblers)
  20. #:use-module (disarchive config)
  21. #:use-module (disarchive digests)
  22. #:use-module (disarchive disassemblers)
  23. #:use-module (disarchive formats gzip)
  24. #:use-module (disarchive logging)
  25. #:use-module (disarchive utils)
  26. #:use-module (gcrypt base64)
  27. #:use-module (gcrypt hash)
  28. #:use-module (ice-9 binary-ports)
  29. #:use-module (ice-9 match)
  30. #:use-module (ice-9 popen)
  31. #:use-module ((rnrs io ports) #:select (call-with-port))
  32. #:use-module (srfi srfi-1)
  33. #:use-module (srfi srfi-9 gnu)
  34. #:use-module (srfi srfi-26)
  35. #:export (<gzip-member>
  36. make-gzip-member
  37. gzip-member?
  38. gzip-member-name
  39. gzip-member-input
  40. gzip-member-header
  41. gzip-member-footer
  42. gzip-member-compressor
  43. gzip-member-digest
  44. serialize-gzip-member
  45. serialized-gzip-member?
  46. deserialize-gzip-member
  47. gzip-member-file?
  48. disassemble-gzip-member
  49. gzip-member-assembler
  50. gzip-member-disassembler))
  51. ;;; Commentary:
  52. ;;;
  53. ;;; This module provides procedures for taking apart and reassembling
  54. ;;; Gzip-compressed files. The idea is to store a small amount of
  55. ;;; metadata that allows recreating the Gzip file bit-for-bit given
  56. ;;; the uncompressed data.
  57. ;;;
  58. ;;; Code:
  59. ;; Data
  60. (define-immutable-record-type <gzip-member>
  61. (make-gzip-member name input header footer compressor digest)
  62. gzip-member?
  63. (name gzip-member-name)
  64. (input gzip-member-input)
  65. (header gzip-member-header)
  66. (footer gzip-member-footer)
  67. (compressor gzip-member-compressor set-gzip-member-compressor)
  68. (digest gzip-member-digest))
  69. (define (gzip-header->sexp header)
  70. (match-let ((($ <gzip-header> text? reserved-flags mtime extra-flags os
  71. extra-field filename comment crc) header))
  72. `(,@(if text? '((text? #t)) '())
  73. ,@(if (zero? reserved-flags) '() `((reserved-flags ,reserved-flags)))
  74. (mtime ,mtime)
  75. (extra-flags ,extra-flags)
  76. (os ,os)
  77. ,@(if extra-field `((extra-field ,(base64-encode extra-field))) '())
  78. ,@(if filename `((filename ,filename)) '())
  79. ,@(if comment `((comment ,comment)) '())
  80. ,@(if crc `((header-crc ,crc)) '()))))
  81. (define (gzip-footer->sexp footer)
  82. (match-let ((($ <gzip-footer> crc isize) footer))
  83. `((crc ,crc)
  84. (isize ,isize))))
  85. (define (serialize-gzip-member member)
  86. (match-let ((($ <gzip-member> name input header footer
  87. compressor digest) member))
  88. `(gzip-member
  89. (name ,name)
  90. (digest ,(digest->sexp digest))
  91. (header ,@(gzip-header->sexp header))
  92. (footer ,@(gzip-footer->sexp footer))
  93. (compressor ,compressor)
  94. (input ,(serialize-blueprint input)))))
  95. (define (assrq-ref arlist key)
  96. (and=> (assq-ref arlist key) car))
  97. (define (sexp->gzip-header sexp)
  98. (make-gzip-header
  99. (eq? (assrq-ref sexp 'text?) #t)
  100. (or (assrq-ref sexp 'reserved-flags) 0)
  101. (or (assrq-ref sexp 'mtime) 0)
  102. (or (assrq-ref sexp 'extra-flags) 0)
  103. (or (assrq-ref sexp 'os) 255)
  104. (and=> (assrq-ref sexp 'extra-field) base64-decode)
  105. (assrq-ref sexp 'filename)
  106. (assrq-ref sexp 'comment)
  107. (assrq-ref sexp 'header-crc)))
  108. (define (sexp->gzip-footer sexp)
  109. (make-gzip-footer
  110. (assrq-ref sexp 'crc)
  111. (assrq-ref sexp 'isize)))
  112. (define (serialized-gzip-member? sexp)
  113. (match sexp
  114. (('gzip-member _ ...) #t)
  115. (_ #f)))
  116. (define (deserialize-gzip-member sexp)
  117. (match sexp
  118. (('gzip-member
  119. ('name name)
  120. ('digest digest-sexp)
  121. ('header header-sexp ...)
  122. ('footer footer-sexp ...)
  123. ('compressor compressor)
  124. ('input input-sexp))
  125. (make-gzip-member
  126. name
  127. (deserialize-blueprint input-sexp)
  128. (sexp->gzip-header header-sexp)
  129. (sexp->gzip-footer footer-sexp)
  130. compressor
  131. (sexp->digest digest-sexp)))
  132. (_ #f)))
  133. ;; Assembly
  134. (define* (gnu-gzip speed rsync? input)
  135. (let* ((args (append '("--gnu")
  136. (if speed (list (format #f "-~a" speed)) '())
  137. (if rsync? '("--rsyncable") '())
  138. '("-c"))))
  139. (with-input-from-file input
  140. (lambda ()
  141. (apply open-pipe* OPEN_READ (%zgz) args)))))
  142. (define* (pristine-gnu-gzip speed rsync input)
  143. (let* ((args (append '("--gnu")
  144. (if speed (list (format #f "-~a" speed)) '())
  145. (if rsync (list rsync) '())
  146. '("-c"))))
  147. (with-input-from-file input
  148. (lambda ()
  149. (apply open-pipe* OPEN_READ (%zgz) args)))))
  150. (define* (zlib-gzip speed perl-style? input)
  151. ;; The order of the arguments matter! It looks like the speed has
  152. ;; to come after the quirk.
  153. (let* ((args (append (if perl-style? '("--quirk" "perl") '())
  154. (if speed (list (format #f "-~a" speed)) '())
  155. '("-c"))))
  156. (with-input-from-file input
  157. (lambda ()
  158. (apply open-pipe* OPEN_READ (%zgz) args)))))
  159. (define %compressors
  160. `((gnu-best . ,(cut gnu-gzip 9 #f <>))
  161. (gnu-best-rsync . ,(cut gnu-gzip 9 #t <>))
  162. (gnu . ,(cut gnu-gzip #f #f <>))
  163. (gnu-rsync . ,(cut gnu-gzip #f #t <>))
  164. (gnu-fast . ,(cut gnu-gzip 1 #f <>))
  165. (gnu-fast-rsync . ,(cut gnu-gzip 1 #t <>))
  166. (zlib-best . ,(cut zlib-gzip 9 #f <>))
  167. (zlib . ,(cut zlib-gzip #f #f <>))
  168. (zlib-fast . ,(cut zlib-gzip 1 #f <>))
  169. (zlib-best-perl . ,(cut zlib-gzip 9 #t <>))
  170. (zlib-perl . ,(cut zlib-gzip #f #t <>))
  171. (zlib-fast-perl . ,(cut zlib-gzip 1 #t <>))
  172. (gnu-best-rsync-1.4 . ,(cut pristine-gnu-gzip 9 "--new-rsyncable" <>))
  173. (gnu-rsync-1.4 . ,(cut pristine-gnu-gzip #f "--new-rsyncable" <>))
  174. (gnu-fast-rsync-1.4 . ,(cut pristine-gnu-gzip 1 "--new-rsyncable" <>))))
  175. (define (compressor-pipe compressor input)
  176. ((assq-ref %compressors compressor) input))
  177. (define (call-with-metadataless-compressor-pipe compressor input proc)
  178. "Run COMPRESSOR on INPUT and call PROC with its output port."
  179. (let ((raw-in (compressor-pipe compressor input)))
  180. (dynamic-wind
  181. noop
  182. (lambda ()
  183. (call-with-port (strip-gzip-metadata raw-in) proc))
  184. (lambda ()
  185. (let* ((status (close-pipe raw-in))
  186. (exit-val (status:exit-val status))
  187. (term-sig (status:term-sig status)))
  188. (unless (or (and exit-val (zero? exit-val))
  189. (and term-sig (= term-sig SIGPIPE)))
  190. (error "unexpected exit status" compressor)))))))
  191. (define (assemble-gzip-member member workspace)
  192. (match-let* ((($ <gzip-member> name input-blueprint header footer
  193. compressor digest) member)
  194. (input-digest (blueprint-digest input-blueprint))
  195. (input (digest->filename input-digest workspace))
  196. (output (digest->filename digest workspace)))
  197. (message "Assembling the Gzip file ~a" name)
  198. (mkdir-p (dirname output))
  199. (call-with-output-file output
  200. (lambda (out)
  201. (write-gzip-header out header)
  202. (call-with-metadataless-compressor-pipe compressor input
  203. (lambda (in)
  204. (dump-port-all in out)))
  205. (write-gzip-footer out footer)))))
  206. ;; Disassemblly
  207. (define (gzip-member-file? filename st)
  208. (and (eq? (stat:type st) 'regular)
  209. (call-with-input-file filename
  210. (lambda (port)
  211. (equal? (get-bytevector-n port 2) #vu8(#x1f #x8b))))))
  212. #;(
  213. ;; This is how to extract a single Gzip member. It is very slow
  214. ;; because it relies on a Scheme implementation of inflate.
  215. (define-crc crc-32)
  216. (define (inflate/crc-32 in out)
  217. (inflate in out crc-32-init crc-32-update crc-32-finish))
  218. (define* (extract-gzip-member in out #:optional
  219. (algorithm (hash-algorithm sha256)))
  220. "Extract one Gzip member from IN and write it to OUT, returning its
  221. metadata."
  222. (let* ((header (read-gzip-header in))
  223. (chash-port get-chash (open-hash-input-port algorithm in))
  224. (actual-crc size buf (inflate/crc-32 chash-port out))
  225. (footer (read-gzip-footer in buf)))
  226. (values header (make-digest algorithm (get-chash)) footer)))
  227. )
  228. (define (read-gzip-metadata port)
  229. "Extract Gzip metadata from PORT and return two values: a
  230. Gzip-header and a Gzip-footer. Note that PORT must be a file port
  231. that yields a single Gzip member."
  232. (let ((header (read-gzip-header port)))
  233. (seek port -8 SEEK_END)
  234. (values header (read-gzip-footer port))))
  235. (define (try-assembling-gzip-member member workspace)
  236. (assemble member workspace #:verify? #f)
  237. (let* ((digest (gzip-member-digest member))
  238. (out (digest->filename digest workspace))
  239. (actual-digest (file-digest out (digest-algorithm digest))))
  240. (equal? digest actual-digest)))
  241. (define (call-with-sigpipe thunk)
  242. "Call THUNK with the SIGPIPE handler set to SIG_DFL, restoring the
  243. handler afterwards."
  244. (let ((handler #f)
  245. (flags #f))
  246. (dynamic-wind
  247. (lambda ()
  248. (match-let (((handler* . flags*) (sigaction SIGPIPE)))
  249. (unless handler*
  250. (error "could not save SIGPIPE handler"))
  251. (set! handler handler*)
  252. (set! flags flags*))
  253. (sigaction SIGPIPE SIG_DFL))
  254. thunk
  255. (lambda ()
  256. (sigaction SIGPIPE handler flags)))))
  257. (define (file-compressor? inflated deflated compressor)
  258. "Check if COMPRESSOR was used on INFLATED to create DEFLATED."
  259. (define (port=? port1 port2)
  260. (let loop ()
  261. (define b1 (get-u8 port1))
  262. (define b2 (get-u8 port2))
  263. (cond
  264. ((and (eof-object? b1) (eof-object? b2)) #t)
  265. ((equal? b1 b2) (loop))
  266. (else #f))))
  267. (call-with-input-file deflated
  268. (lambda (raw-port1)
  269. (call-with-port (strip-gzip-metadata raw-port1)
  270. (lambda (port1)
  271. (call-with-sigpipe
  272. (lambda ()
  273. (call-with-metadataless-compressor-pipe compressor inflated
  274. (cut port=? port1 <>)))))))))
  275. (define (find-compressor inflated deflated)
  276. "Find the compressor used on INFLATED to create DEFLATED."
  277. (message "Trying up to ~a compressors" (length %compressors))
  278. (find (lambda (compressor)
  279. (start-message " ~a... " compressor)
  280. (if (file-compressor? inflated deflated compressor)
  281. (begin (message "yes!") #t)
  282. (begin (message "no") #f)))
  283. (map car %compressors)))
  284. (define* (disassemble-gzip-member filename #:optional
  285. (algorithm (hash-algorithm sha256))
  286. #:key (name (basename filename)))
  287. "Disassemble FILENAME into a Gzip-member blueprint object. The file
  288. at FILENAME must be a Gzip file containing a single member. If
  289. ALGORITHM is set, use it for computing digests."
  290. (message "Disassembling the Gzip file ~a" name)
  291. (call-with-temporary-output-file
  292. (lambda (tmpname tmp)
  293. (with-output-to-port tmp
  294. (lambda ()
  295. (message "Decompressing the Gzip file ~a" name)
  296. (invoke %gzip "-d" "-c" filename)))
  297. (close-port tmp)
  298. (let* ((compressor (or (find-compressor tmpname filename)
  299. (error "could not find Gzip compressor")))
  300. (input (disassemble tmpname algorithm
  301. #:name (basename name ".gz"))))
  302. (call-with-values (lambda () (call-with-input-file filename
  303. read-gzip-metadata))
  304. (lambda (header footer)
  305. (make-gzip-member name input header footer compressor
  306. (file-digest filename algorithm))))))))
  307. ;; Interfaces
  308. (define gzip-member-assembler
  309. (make-assembler gzip-member?
  310. gzip-member-name
  311. gzip-member-digest
  312. (compose list gzip-member-input)
  313. serialize-gzip-member
  314. serialized-gzip-member?
  315. deserialize-gzip-member
  316. assemble-gzip-member))
  317. (define gzip-member-disassembler
  318. (make-disassembler gzip-member-file?
  319. disassemble-gzip-member))