summaryrefslogtreecommitdiff
path: root/guix/store/deduplication.scm
diff options
context:
space:
mode:
Diffstat (limited to 'guix/store/deduplication.scm')
-rw-r--r--guix/store/deduplication.scm69
1 files changed, 60 insertions, 9 deletions
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
index cd9660174c..370df4a74c 100644
--- a/guix/store/deduplication.scm
+++ b/guix/store/deduplication.scm
@@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
-;;; Copyright © 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -22,12 +22,13 @@
(define-module (guix store deduplication)
#:use-module (gcrypt hash)
- #:use-module (guix build utils)
+ #:use-module ((guix build utils) #:hide (dump-port))
#:use-module (guix build syscalls)
#:use-module (guix base32)
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-34)
#:use-module (srfi srfi-35)
+ #:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
#:use-module (ice-9 ftw)
#:use-module (ice-9 match)
@@ -37,6 +38,31 @@
dump-file/deduplicate
copy-file/deduplicate))
+;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
+;; parameter.
+(define* (dump-port in out
+ #:optional len
+ #:key (buffer-size 16384))
+ "Read LEN bytes from IN (or as much as possible if LEN is #f) and write it
+to OUT, using chunks of BUFFER-SIZE bytes."
+ (define buffer
+ (make-bytevector buffer-size))
+
+ (let loop ((total 0)
+ (bytes (get-bytevector-n! in buffer 0
+ (if len
+ (min len buffer-size)
+ buffer-size))))
+ (or (eof-object? bytes)
+ (and len (= total len))
+ (let ((total (+ total bytes)))
+ (put-bytevector out buffer 0 bytes)
+ (loop total
+ (get-bytevector-n! in buffer 0
+ (if len
+ (min (- len total) buffer-size)
+ buffer-size)))))))
+
(define (nar-sha256 file)
"Gives the sha256 hash of a file and the size of the file in nar form."
(let-values (((port get-hash) (open-sha256-port)))
@@ -127,11 +153,27 @@ Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
(unless (= EMLINK (system-error-errno args))
(apply throw args)))))))
+(define %deduplication-minimum-size
+ ;; Size below which files are not deduplicated. This avoids adding too many
+ ;; entries to '.links', which would slow down 'removeUnusedLinks' while
+ ;; saving little space. Keep in sync with optimize-store.cc.
+ 8192)
+
(define* (deduplicate path hash #:key (store (%store-directory)))
"Check if a store item with sha256 hash HASH already exists. If so,
replace PATH with a hardlink to the already-existing one. If not, register
PATH so that future duplicates can hardlink to it. PATH is assumed to be
under STORE."
+ ;; Lightweight promises.
+ (define-syntax-rule (delay exp)
+ (let ((value #f))
+ (lambda ()
+ (unless value
+ (set! value exp))
+ value)))
+ (define-syntax-rule (force promise)
+ (promise))
+
(define links-directory
(string-append store "/.links"))
@@ -144,13 +186,18 @@ under STORE."
((file . properties)
(unless (member file '("." ".."))
(let* ((file (string-append path "/" file))
+ (st (delay (lstat file)))
(type (match (assoc-ref properties 'type)
((or 'unknown #f)
- (stat:type (lstat file)))
+ (stat:type (force st)))
(type type))))
- (loop file type
- (and (not (eq? 'directory type))
- (nar-sha256 file)))))))
+ (when (or (eq? 'directory type)
+ (and (eq? 'regular type)
+ (>= (stat:size (force st))
+ %deduplication-minimum-size)))
+ (loop file type
+ (and (not (eq? 'directory type))
+ (nar-sha256 file))))))))
(scandir* path))
(let ((link-file (string-append links-directory "/"
(bytevector->nix-base32-string hash))))
@@ -222,9 +269,9 @@ OUTPUT as it goes."
This procedure is suitable as a #:dump-file argument to 'restore-file'. When
used that way, it deduplicates files on the fly as they are restored, thereby
-removing the need to a deduplication pass that would re-read all the files
+removing the need for a deduplication pass that would re-read all the files
down the road."
- (define hash
+ (define (dump-and-compute-hash)
(call-with-output-file file
(lambda (output)
(let-values (((hash-port get-hash)
@@ -236,7 +283,11 @@ down the road."
(close-port hash-port)
(get-hash)))))
- (deduplicate file hash #:store store))
+ (if (>= size %deduplication-minimum-size)
+ (deduplicate file (dump-and-compute-hash) #:store store)
+ (call-with-output-file file
+ (lambda (output)
+ (dump-port input output size)))))
(define* (copy-file/deduplicate source target
#:key (store (%store-directory)))