git-hash.scm 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. ;;; Disarchive
  2. ;;; Copyright © 2020 Timothy Sample <samplet@ngyro.com>
  3. ;;;
  4. ;;; This file is part of Disarchive.
  5. ;;;
  6. ;;; Disarchive is free software: you can redistribute it and/or modify
  7. ;;; it under the terms of the GNU General Public License as published by
  8. ;;; the Free Software Foundation, either version 3 of the License, or
  9. ;;; (at your option) any later version.
  10. ;;;
  11. ;;; Disarchive is distributed in the hope that it will be useful,
  12. ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. ;;; GNU General Public License for more details.
  15. ;;;
  16. ;;; You should have received a copy of the GNU General Public License
  17. ;;; along with Disarchive. If not, see <http://www.gnu.org/licenses/>.
  18. (define-module (disarchive git-hash)
  19. #:use-module (disarchive utils)
  20. #:use-module (gcrypt hash)
  21. #:use-module (ice-9 binary-ports)
  22. #:use-module (ice-9 ftw)
  23. #:use-module (ice-9 match)
  24. #:use-module (rnrs bytevectors)
  25. #:use-module (srfi srfi-1)
  26. #:use-module (srfi srfi-26)
  27. #:use-module (srfi srfi-71)
  28. #:export (git-hash-file
  29. git-hash-directory))
  30. ;;; Commentary:
  31. ;;;
  32. ;;; This module provides functions for hashing files and directories in
  33. ;;; the style of Git. The hope is that these hashes can be used to find
  34. ;;; data in the Software Heritage archive. Hence, the standard of
  35. ;;; correctness is that the results match Software Heritage.
  36. ;;;
  37. ;;; Code:
  38. (define (write-git-hash-header port type size)
  39. "Construct a Git hash header from TYPE and SIZE, and write it to
  40. PORT."
  41. (display type port)
  42. (display #\space port)
  43. (display size port)
  44. (display #\nul port))
  45. (define* (git-hash-blob bv #:optional
  46. (algorithm (hash-algorithm sha1)))
  47. "Compute the Git hash of BV. If ALGORITHM is set, compute hashes
  48. using ALGORITHM. Otherwise, use SHA-1."
  49. (let ((out get-hash (open-hash-port algorithm)))
  50. (write-git-hash-header out "blob" (bytevector-length bv))
  51. (put-bytevector out bv)
  52. (force-output out)
  53. (get-hash)))
  54. (define* (git-hash-file* filename st #:optional
  55. (algorithm (hash-algorithm sha1)))
  56. "Compute the Git hash of FILENAME (a regular file). The ST
  57. parameter must be the object returned by '(stat FILENAME)'. If
  58. ALGORITHM is set, compute hashes using ALGORITHM. Otherwise, use
  59. SHA-1."
  60. (let ((out get-hash (open-hash-port algorithm)))
  61. (write-git-hash-header out "blob" (number->string (stat:size st)))
  62. (call-with-input-file filename (cut dump-port-all <> out))
  63. (force-output out)
  64. (get-hash)))
  65. (define* (git-hash-file filename #:optional
  66. (algorithm (hash-algorithm sha1)))
  67. "Compute the Git hash of FILENAME (a regular file). If ALGORITHM is
  68. set, compute hashes using ALGORITHM. Otherwise, use SHA-1."
  69. (git-hash-file* filename (stat filename) algorithm))
  70. (define (make-tree-node mode name hash)
  71. "Serialize the bytevectors MODE, NAME, and HASH into a Git tree node."
  72. (let* ((name-offset (+ (bytevector-length mode) 1))
  73. (hash-offset (+ name-offset (bytevector-length name) 1))
  74. (node (make-bytevector (+ hash-offset (bytevector-length hash)))))
  75. (bytevector-copy! mode 0 node 0 (bytevector-length mode))
  76. (bytevector-u8-set! node (1- name-offset) #x20)
  77. (bytevector-copy! name 0 node name-offset (bytevector-length name))
  78. (bytevector-u8-set! node (1- hash-offset) 0)
  79. (bytevector-copy! hash 0 node hash-offset (bytevector-length hash))
  80. node))
  81. (define (%read-tree-node filename algorithm select?)
  82. "Read the file at FILENAME and turn it into a Git tree node. The file
  83. may be a regular file, directory, or symlink. Hashes will be computed
  84. using ALGORITHM."
  85. (let ((st (lstat filename))
  86. (name (basename filename)))
  87. (and (select? filename st)
  88. (case (stat:type st)
  89. ((regular)
  90. (make-tree-node (if (zero? (bit-extract (stat:perms st) 6 7))
  91. (string->utf8 "100644")
  92. (string->utf8 "100755"))
  93. (string->utf8 name)
  94. (git-hash-file* filename st algorithm)))
  95. ((directory)
  96. (make-tree-node (string->utf8 "40000")
  97. (string->utf8 name)
  98. (git-hash-directory filename algorithm)))
  99. ((symlink)
  100. (make-tree-node (string->utf8 "120000")
  101. (string->utf8 name)
  102. (git-hash-blob (string->utf8 (readlink filename))
  103. algorithm)))))))
  104. ;; XXX: Guile 3 seems to fail when optimizing the call from
  105. ;; 'git-hash-directory' to 'read-tree-node'. It decides not to use
  106. ;; 'read-tree-node' and just calls 'git-hash-directory' (recursively)
  107. ;; instead. Since 'read-tree-node' does a lot of useful work,
  108. ;; everything breaks when Guile 3 does this. The following indirection
  109. ;; tricks the compiler into doing the right thing.
  110. (define read-tree-node (car (list %read-tree-node)))
  111. (define (tree-node-directory? node)
  112. "Check if NODE is a directory tree node."
  113. (let loop ((k 0) (digits '(#x34 #x30 #x30 #x30 #x30)))
  114. (match digits
  115. (() #t)
  116. ((digit . rest)
  117. (and (= (bytevector-u8-ref node k) digit) (loop (1+ k) rest))))))
  118. (define (tree-node-name-index node)
  119. "Get the index of the name field of NODE."
  120. (let loop ((k 0))
  121. (if (= (bytevector-u8-ref node k) #x20)
  122. (1+ k)
  123. (loop (1+ k)))))
  124. (define (tree-node-hash-index node)
  125. "Get the index of the hash field of NODE."
  126. (let loop ((k 0))
  127. (if (= (bytevector-u8-ref node k) 0)
  128. (1+ k)
  129. (loop (1+ k)))))
  130. (define (display-tree-node node)
  131. "Write a representation of NODE to the current output port."
  132. (let* ((name-index (tree-node-name-index node))
  133. (mode (make-bytevector (1- name-index)))
  134. (hash-index (tree-node-hash-index node))
  135. (name (make-bytevector (- hash-index name-index 1)))
  136. (hash (make-bytevector (- (bytevector-length node) hash-index))))
  137. (bytevector-copy! node 0 mode 0 (bytevector-length mode))
  138. (bytevector-copy! node name-index name 0 (bytevector-length name))
  139. (bytevector-copy! node hash-index hash 0 (bytevector-length hash))
  140. (display (utf8->string mode))
  141. (display " ")
  142. (display (utf8->string name))
  143. (display " ")
  144. (display ((@ (guix base16) bytevector->base16-string) hash))
  145. (newline)))
  146. (define (tree-node<? n1 n2)
  147. "Check if the name of N1 comes before N2 when sorting
  148. lexicographically."
  149. (let loop ((k1 (tree-node-name-index n1)) (k2 (tree-node-name-index n2)))
  150. (let* ((b1 (bytevector-u8-ref n1 k1))
  151. (b1* (if (and (zero? b1) (tree-node-directory? n1)) #x2f b1))
  152. (b2 (bytevector-u8-ref n2 k2))
  153. (b2* (if (and (zero? b2) (tree-node-directory? n2)) #x2f b2)))
  154. (cond
  155. ((< b1* b2*) #t)
  156. ((> b1* b2*) #f)
  157. (else (and (> b1 0)
  158. (> b2 0)
  159. (loop (1+ k1) (1+ k2))))))))
  160. (define* (git-hash-directory directory #:optional
  161. (algorithm (hash-algorithm sha1))
  162. #:key (select? (const #t)))
  163. "Compute the Git-style hash of DIRECTORY. If ALGORITHM is set,
  164. compute hashes using ALGORITHM. Otherwise, use SHA-1. Note that by
  165. default, the result will include empty directories, which Git itself
  166. would ignore. However, you can control exactly which files are
  167. included by specifying a SELECT? predicate that takes two arguments, a
  168. filename and a stat object."
  169. (let* ((filenames (remove (cut member <> '("." "..")) (scandir directory)))
  170. (filenames* (map (cut string-append directory "/" <>) filenames))
  171. (nodes (sort (filter-map (cut read-tree-node <> algorithm select?)
  172. filenames*)
  173. tree-node<?))
  174. (len (fold + 0 (map bytevector-length nodes))))
  175. ;;(format #t "--==-- ~a --==--~%" directory)
  176. ;;(for-each display-tree-node nodes)
  177. (let ((out get-hash (open-hash-port algorithm)))
  178. (write-git-hash-header out "tree" len)
  179. (for-each (cut put-bytevector out <>) nodes)
  180. (force-output out)
  181. (get-hash))))