xz-file.scm 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. ;;; Disarchive
  2. ;;; Copyright © 2021 Timothy Sample <samplet@ngyro.com>
  3. ;;;
  4. ;;; This file is part of Disarchive.
  5. ;;;
  6. ;;; Disarchive is free software: you can redistribute it and/or modify
  7. ;;; it under the terms of the GNU General Public License as published by
  8. ;;; the Free Software Foundation, either version 3 of the License, or
  9. ;;; (at your option) any later version.
  10. ;;;
  11. ;;; Disarchive is distributed in the hope that it will be useful,
  12. ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. ;;; GNU General Public License for more details.
  15. ;;;
  16. ;;; You should have received a copy of the GNU General Public License
  17. ;;; along with Disarchive. If not, see <http://www.gnu.org/licenses/>.
  18. (define-module (disarchive assemblers xz-file)
  19. #:use-module (disarchive assemblers)
  20. #:use-module (disarchive config)
  21. #:use-module (disarchive digests)
  22. #:use-module (disarchive disassemblers)
  23. #:use-module (disarchive kinds xz)
  24. #:use-module (disarchive logging)
  25. #:use-module (disarchive utils)
  26. #:use-module (gcrypt hash)
  27. #:use-module (ice-9 binary-ports)
  28. #:use-module (ice-9 match)
  29. #:use-module (ice-9 popen)
  30. #:use-module (lzma)
  31. #:use-module (rnrs bytevectors)
  32. #:use-module (srfi srfi-1)
  33. #:use-module (srfi srfi-9 gnu)
  34. #:use-module (srfi srfi-26)
  35. #:use-module (srfi srfi-43)
  36. #:export (make-xz-file
  37. xz-file?
  38. xz-file-name
  39. xz-file-input
  40. xz-file-compressor
  41. xz-file-digest
  42. serialize-xz-file
  43. serialized-xz-file?
  44. deserialize-xz-file
  45. xz-file-file?
  46. disassemble-xz
  47. xz-file-assembler
  48. xz-file-disassembler))
  49. ;; Data
  50. (define (get-serialized-value key dflt fields)
  51. "Lookup KEY in the serialized fields FIELDS, returning DFLT if
  52. KEY is not found."
  53. (match (assq key fields)
  54. (#f dflt)
  55. ((key value) value)))
  56. (define-immutable-record-type <xz-block>
  57. (make-xz-block inflated-size deflated-size
  58. header-sizes? head-padding level extreme?)
  59. xz-block?
  60. (inflated-size xz-block-inflated-size)
  61. (deflated-size xz-block-deflated-size)
  62. (header-sizes? xz-block-header-sizes?)
  63. (head-padding xz-block-head-padding
  64. set-xz-block-head-padding)
  65. (level xz-block-level)
  66. (extreme? xz-block-extreme?))
  67. (define (xz-block-default-head-padding block)
  68. (let* ((inflated-size (xz-block-inflated-size block))
  69. (deflated-size (xz-block-deflated-size block))
  70. (header-sizes? (xz-block-header-sizes? block))
  71. (header-sizes-size (if header-sizes?
  72. (+ (xz-integer-length inflated-size)
  73. (xz-integer-length deflated-size))
  74. 0))
  75. (base (+ 1 1 4 header-sizes-size 3))
  76. (remainder (modulo base 4)))
  77. (if (zero? remainder) 0 (- 4 remainder))))
  78. (define (set-xz-block-default-head-padding block)
  79. (set-xz-block-head-padding block
  80. (xz-block-default-head-padding block)))
  81. (define (serialize-xz-block xzb)
  82. (match-let ((($ <xz-block> inflated-size deflated-size
  83. header-sizes? head-padding level extreme?) xzb))
  84. `(block
  85. (inflated-size ,inflated-size)
  86. (deflated-size ,deflated-size)
  87. ,@(if header-sizes? `((header-sizes? #t)) '())
  88. ,@(if (= (xz-block-head-padding xzb)
  89. (xz-block-default-head-padding xzb))
  90. '()
  91. `((head-padding ,(xz-block-head-padding xzb))))
  92. ,@(if (= level 6) '() `((level ,level)))
  93. ,@(if extreme? `((extreme? #t)) '()))))
  94. (define (deserialize-xz-block sexp)
  95. (match sexp
  96. (('block . fields)
  97. (let* ((i-size (get-serialized-value 'inflated-size #f fields))
  98. (d-size (get-serialized-value 'deflated-size #f fields))
  99. (xzb (make-xz-block
  100. (or i-size (error "XZ block is missing inflated size"))
  101. (or d-size (error "XZ block is missing deflated size"))
  102. (get-serialized-value 'header-sizes? #f fields)
  103. (get-serialized-value 'head-padding #f fields)
  104. (get-serialized-value 'level 6 fields)
  105. (get-serialized-value 'extreme? #f fields))))
  106. (if (xz-block-head-padding xzb)
  107. xzb
  108. (set-xz-block-default-head-padding xzb))))))
  109. (define-immutable-record-type <xz-stream-blueprint>
  110. (make-xz-stream-blueprint check blocks)
  111. xz-stream-blueprint?
  112. (check xz-stream-blueprint-check)
  113. (blocks xz-stream-blueprint-blocks))
  114. (define (serialize-xz-stream-blueprint xzsb)
  115. (match-let ((($ <xz-stream-blueprint> check blocks) xzsb))
  116. `(stream
  117. ,@(if (= check LZMA_CHECK_CRC64) '() `((check ,check)))
  118. ,@(if (null? blocks) '()
  119. `((blocks ,(map serialize-xz-block blocks)))))))
  120. (define (deserialize-xz-stream-blueprint sexp)
  121. (match sexp
  122. (('stream . fields)
  123. (make-xz-stream-blueprint
  124. (get-serialized-value 'check LZMA_CHECK_CRC64 fields)
  125. (map deserialize-xz-block
  126. (get-serialized-value 'blocks '() fields))))))
  127. (define-immutable-record-type <xz-file>
  128. (make-xz-file name input streams digest)
  129. xz-file?
  130. (name xz-file-name)
  131. (input xz-file-input)
  132. (streams xz-file-streams) ; list of <xz-stream-blueprint>
  133. (digest xz-file-digest))
  134. (define (serialize-xz-file xzf)
  135. (match-let ((($ <xz-file> name input streams digest) xzf))
  136. `(xz-file
  137. (name ,name)
  138. (digest ,(digest->sexp digest))
  139. (streams ,(map serialize-xz-stream-blueprint streams))
  140. (input ,(serialize-blueprint input)))))
  141. (define (serialized-xz-file? sexp)
  142. (match sexp
  143. (('xz-file _ ...) #t)
  144. (_ #f)))
  145. (define (deserialize-xz-file sexp)
  146. (match sexp
  147. (('xz-file
  148. ('name name)
  149. ('digest digest-sexp)
  150. ('streams streams)
  151. ('input input-sexp))
  152. (make-xz-file
  153. name
  154. (deserialize-blueprint input-sexp)
  155. (map deserialize-xz-stream-blueprint streams)
  156. (sexp->digest digest-sexp)))
  157. (_ #f)))
  158. ;; Helpers
  159. ;; This is lifted from the XZ source code.
  160. (define %dictionary-size-levels
  161. (map (lambda (x k) (cons (expt 2 x) k))
  162. '(18 20 21 22 22 23 23 24 25 26)
  163. (iota 10)))
  164. (define (check-size check)
  165. (cond
  166. ((= check LZMA_CHECK_NONE) 0)
  167. ((= check LZMA_CHECK_CRC32) 4)
  168. ((= check LZMA_CHECK_CRC64) 8)
  169. ((= check LZMA_CHECK_SHA256) 32)
  170. (else (error "Unknown XZ check type" check))))
  171. (define (call-with-truncated-port port count proc)
  172. (define remaining count)
  173. (define (read! bv start count)
  174. (let ((n (min remaining count)))
  175. (match (get-bytevector-n! port bv start n)
  176. ((? eof-object?) 0)
  177. (m (begin (set! remaining (- remaining m)) m)))))
  178. (call-with-port
  179. (make-custom-binary-input-port "truncated" read! #f #f
  180. (lambda () (close-port port)))
  181. proc))
  182. (define (call-with-input-file-part filename offset size proc)
  183. (call-with-input-file filename
  184. (lambda (port)
  185. (seek port offset SEEK_SET)
  186. (call-with-truncated-port port size proc))))
  187. (define (call-with-xz-input-block filename i-offset i-size
  188. level extreme? check proc)
  189. (define (skip-xz-block-header port)
  190. (define size (* (1+ (get-u8 port)) 4))
  191. (get-bytevector-n port (1- size))
  192. size)
  193. (call-with-input-file-part filename i-offset i-size
  194. (lambda (raw-in)
  195. (call-with-xz-input-port/compressed raw-in
  196. (lambda (xz-in)
  197. (get-bytevector-n xz-in 12)
  198. (skip-xz-block-header xz-in)
  199. (proc xz-in))
  200. #:level level
  201. #:extreme? extreme?
  202. #:check check))))
  203. ;; Assembly
  204. (define (level->xz-filter-flags level)
  205. "Convert LEVEL to a list XZ filter flags."
  206. (define (encode-dictionary-size size)
  207. (if (= size (1- (expt 2 32)))
  208. 40
  209. (let* ((exponent (integer-length size))
  210. (base (* (- exponent 13) 2)))
  211. (match (logcount size)
  212. (1 base)
  213. (2 (1+ base))
  214. (_ (error "Invalid dictionary size"))))))
  215. (let* ((size (any (match-lambda ((s . l) (and (= level l) s)))
  216. %dictionary-size-levels))
  217. (props (make-bytevector 1 (encode-dictionary-size size))))
  218. (list (make-xz-filter-flags #x21 props))))
  219. (define (xz-block-xz-block-header xzb)
  220. (let* ((reserved 0)
  221. (d-size (and (xz-block-header-sizes? xzb)
  222. (xz-block-deflated-size xzb)))
  223. (i-size (and (xz-block-header-sizes? xzb)
  224. (xz-block-inflated-size xzb)))
  225. (flags (level->xz-filter-flags (xz-block-level xzb)))
  226. (padding (xz-block-head-padding xzb)))
  227. (make-xz-block-header reserved d-size i-size
  228. flags padding #f)))
  229. (define (write-xz-block-header xzb port)
  230. (let ((bh (xz-block-xz-block-header xzb)))
  231. (put-bytevector port (encode-xz-block-header bh))))
  232. (define (assemble-xz-block xzb check inflated offset port)
  233. (let* ((i-size (xz-block-inflated-size xzb))
  234. (d-size (xz-block-deflated-size xzb))
  235. (level (xz-block-level xzb))
  236. (extreme? (xz-block-extreme? xzb))
  237. (remainder (modulo d-size 4))
  238. (padding (if (zero? remainder) 0 (- 4 remainder))))
  239. (write-xz-block-header xzb port)
  240. (call-with-xz-input-block inflated offset i-size level extreme? check
  241. (lambda (in)
  242. (dump-port-n in port (+ d-size padding (check-size check)))))))
  243. (define (xz-block->xz-index-record xzb check)
  244. (let* ((d-size (xz-block-deflated-size xzb))
  245. (bh (xz-block-xz-block-header xzb))
  246. (header-size (xz-block-header-size bh)))
  247. (make-xz-index-record (+ header-size d-size (check-size check))
  248. (xz-block-inflated-size xzb))))
  249. (define (assemble-xz-stream xzsb inflated offset port)
  250. (define check (xz-stream-blueprint-check xzsb))
  251. (define xzbs (xz-stream-blueprint-blocks xzsb))
  252. (let ((head (make-xz-stream-header check '(0 0) #f)))
  253. (put-bytevector port (encode-xz-stream-header head)))
  254. (let loop ((xzbs xzbs) (offset offset))
  255. (match xzbs
  256. (() *unspecified*)
  257. ((xzb . rest)
  258. (assemble-xz-block xzb check inflated offset port)
  259. (loop rest (+ offset (xz-block-inflated-size xzb))))))
  260. (let* ((idx (make-xz-index (map (lambda (xzb)
  261. (xz-block->xz-index-record xzb check))
  262. xzbs)
  263. #f))
  264. (foot (make-xz-stream-footer check '(0 0) (xz-index-size idx) #f)))
  265. (put-bytevector port (encode-xz-index idx))
  266. (put-bytevector port (encode-xz-stream-footer foot))))
  267. (define (assemble-xz-streams streams inflated port)
  268. (define (stream-size stream)
  269. (reduce + 0 (map xz-block-inflated-size
  270. (xz-stream-blueprint-blocks stream))))
  271. (let loop ((streams streams) (offset 0))
  272. (match streams
  273. (() *unspecified*)
  274. ((stream . rest)
  275. (assemble-xz-stream stream inflated offset port)
  276. (loop rest (+ offset (stream-size stream)))))))
  277. (define (assemble-xz-file xzf workspace)
  278. (match-let* ((($ <xz-file> name input-blueprint streams digest) xzf)
  279. (input-digest (blueprint-digest input-blueprint))
  280. (input (digest->filename input-digest workspace))
  281. (output (digest->filename digest workspace)))
  282. (message "Assembling the XZ file ~a" name)
  283. (mkdir-p (dirname output))
  284. (call-with-output-file output
  285. (lambda (out)
  286. (assemble-xz-streams streams input out)))))
  287. ;; Disassemblly
  288. (define (xz-file-file? filename st)
  289. (and (eq? (stat:type st) 'regular)
  290. (call-with-input-file filename
  291. (lambda (port)
  292. (equal? (get-bytevector-n port 6)
  293. #vu8(#xfd #x37 #x7a #x58 #x5a #x00))))))
  294. (define (xz-filters->levels filters)
  295. "Find a list of candidate compression levels based on the XZ filter
  296. flags FILTERS."
  297. (define (decode-dictionary-size bits)
  298. (if (= bits 40)
  299. (1- (expt 2 32))
  300. (ash (logior 2 (logand bits 1))
  301. (+ (quotient bits 2) 11))))
  302. (let ((filter (last filters)))
  303. (if (= (xz-filter-flags-id filter) #x21)
  304. (let* ((props (xz-filter-flags-properties filter))
  305. (rawds (bit-extract (bytevector-u8-ref props 0) 0 6))
  306. (ds (decode-dictionary-size rawds)))
  307. (filter-map (match-lambda
  308. ((size . level) (and (= ds size) level)))
  309. %dictionary-size-levels))
  310. '())))
  311. (define (disassemble-block block d-offset d-size i-offset i-size
  312. deflated inflated)
  313. "Disassable the XZ block header (<xz-block-header>) BLOCK into an XZ
  314. block (<xz-block>). The block must start at D-OFFSET in the file
  315. named DEFLATED, and be D-SIZE bytes long. It also must be the
  316. compressed counterpart to the I-SIZE bytes starting at I-OFFSET in the
  317. file named INFLATED."
  318. (define* (port=? port1 port2 #:optional count)
  319. (let loop ((k 0))
  320. (define b1 (get-u8 port1))
  321. (define b2 (get-u8 port2))
  322. (cond
  323. ((or (and count (>= k count))
  324. (and (eof-object? b1) (eof-object? b2))) #t)
  325. ((equal? b1 b2) (loop (1+ k)))
  326. (else #f))))
  327. (define (block-compressor? level extreme?)
  328. (call-with-input-file deflated
  329. (lambda (in1)
  330. (seek in1 d-offset SEEK_SET)
  331. (seek in1 (xz-block-header-size block) SEEK_CUR)
  332. ;; Note that the CHECK argument doesn't matter since we don't
  333. ;; compare the checksums.
  334. (call-with-xz-input-block
  335. inflated i-offset i-size level extreme? LZMA_CHECK_CRC64
  336. (lambda (in2)
  337. (port=? in1 in2 d-size))))))
  338. (define header-sizes? (and (xz-block-header-compressed-size block)
  339. (xz-block-header-uncompressed-size block)
  340. #t))
  341. (message "Disassembling XZ block at ~d (~d bytes)" d-offset d-size)
  342. (message "In the inflated file, this is ~d and ~d" i-offset i-size)
  343. (let ((levels (xz-filters->levels (xz-block-header-filters block))))
  344. (message "Trying up to ~a compressors" (* (length levels) 2))
  345. (or (any (match-lambda
  346. ((level . extreme?)
  347. (start-message " Level ~a~a... " level
  348. (if extreme? " extreme!" ""))
  349. (if (block-compressor? level extreme?)
  350. (begin (message "yes!")
  351. (make-xz-block i-size d-size header-sizes?
  352. (xz-block-header-padding block)
  353. level extreme?))
  354. (begin (message "no") #f))))
  355. (append-map (lambda (x) `((,x . #f) (,x . #t))) levels))
  356. (error "Could not find XZ compressor"))))
  357. (define (disassemble-stream strm d-offset i-offset deflated inflated)
  358. "Disassemble the XZ stream STRM into a list of XZ
  359. blocks (<xz-block>). The stream must start at D-OFFSET in the file
  360. named DEFLATED, and must be the compressed counterpart to the bytes
  361. starting at I-OFFSET in the file named INFLATED."
  362. (define check (xz-stream-header-check-type (xz-stream-header strm)))
  363. (let loop ((blocks (xz-stream-blocks strm))
  364. (records (xz-index-records (xz-stream-index strm)))
  365. (d-offset (+ d-offset 12)) ; skip the stream header
  366. (i-offset i-offset)
  367. (acc '()))
  368. (match blocks
  369. (() (match records
  370. (() (make-xz-stream-blueprint check (reverse acc)))
  371. (_ (error "more XZ index records than blocks"))))
  372. ((block . blocks-rest)
  373. (match records
  374. (() (error "more XZ blocks than index records"))
  375. ((record . records-rest)
  376. (let ((d-size (- (xz-index-record-unpadded-size record)
  377. (xz-block-header-size block) (check-size check)))
  378. (d-size* (xz-index-record-block-size record))
  379. (i-size (xz-index-record-uncompressed-size record)))
  380. (loop blocks-rest records-rest
  381. (+ d-offset d-size*) (+ i-offset i-size)
  382. (cons (disassemble-block block d-offset d-size
  383. i-offset i-size
  384. deflated inflated)
  385. acc)))))))))
  386. (define (disassemble-streams deflated inflated)
  387. "Disassemble the file named DEFLATED into a list of lists of XZ
  388. blocks (<xz-block>). The file named INFLATED must be uncompressed
  389. counterpart of DEFLATED."
  390. (let loop ((streams (call-with-input-file deflated read-xz-streams))
  391. (d-offset 0)
  392. (i-offset 0)
  393. (acc '()))
  394. (match streams
  395. (() (reverse acc))
  396. ((strm . rest)
  397. (message "Disassembling XZ stream at ~d" d-offset)
  398. (loop rest
  399. (+ d-offset (xz-stream-size strm))
  400. (+ i-offset (xz-stream-uncompressed-size strm))
  401. (cons (disassemble-stream strm d-offset i-offset
  402. deflated inflated)
  403. acc))))))
  404. (define* (disassemble-xz-file filename #:optional
  405. (algorithm (hash-algorithm sha256))
  406. #:key (name (basename filename)))
  407. "Disassemble FILENAME into a XZ file blueprint object. If ALGORITHM
  408. is set, use it for computing digests."
  409. (message "Disassembling the XZ file ~a" name)
  410. (call-with-temporary-output-file
  411. (lambda (tmpname tmp)
  412. (with-output-to-port tmp
  413. (lambda ()
  414. (message "Decompressing the XZ file ~a" name)
  415. (invoke %xz "-d" "-c" filename)))
  416. (close-port tmp)
  417. (let* ((streams (disassemble-streams filename tmpname))
  418. (input (disassemble tmpname algorithm
  419. #:name (basename name ".xz"))))
  420. (make-xz-file name input streams
  421. (file-digest filename algorithm))))))
  422. ;; Interfaces
  423. (define xz-file-assembler
  424. (make-assembler xz-file?
  425. xz-file-name
  426. xz-file-digest
  427. (compose list xz-file-input)
  428. serialize-xz-file
  429. serialized-xz-file?
  430. deserialize-xz-file
  431. assemble-xz-file))
  432. (define xz-file-disassembler
  433. (make-disassembler xz-file-file?
  434. disassemble-xz-file))