diff options
Diffstat (limited to 'guix/scripts/offload.scm')
-rw-r--r-- | guix/scripts/offload.scm | 365 |
1 files changed, 284 insertions, 81 deletions
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 00a145e5e9..4d2f78f711 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -23,7 +23,7 @@ #:use-module (guix derivations) #:use-module (guix nar) #:use-module (guix utils) - #:use-module ((guix build utils) #:select (which)) + #:use-module ((guix build utils) #:select (which mkdir-p)) #:use-module (guix ui) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) @@ -122,38 +122,40 @@ determined." (leave (_ "failed to load machine file '~a': ~s~%") file args)))))) -(define (open-ssh-gateway machine) - "Initiate an SSH connection gateway to MACHINE, and return the PID of the -running lsh gateway upon success, or #f on failure." - (catch 'system-error - (lambda () - (let* ((port (open-pipe* OPEN_READ %lsh-command - "-l" (build-machine-user machine) - "-i" (build-machine-private-key machine) - ;; XXX: With lsh 2.1, passing '--write-pid' - ;; last causes the PID not to be printed. - "--write-pid" "--gateway" "--background" "-z" - (build-machine-name machine))) - (line (read-line port)) - (status (close-pipe port))) - (if (zero? status) - (let ((pid (string->number line))) - (if (integer? pid) - pid - (begin - (warning (_ "'~a' did not write its PID on stdout: ~s~%") - %lsh-command line) - #f))) - (begin - (warning (_ "failed to initiate SSH connection to '~a':\ - '~a' exited with ~a~%") - (build-machine-name machine) - %lsh-command - (status:exit-val status)) - #f)))) - (lambda args - (leave (_ "failed to execute '~a': ~a~%") - %lsh-command (strerror (system-error-errno args)))))) +;;; FIXME: The idea was to open the connection to MACHINE once for all, but +;;; lshg is currently non-functional. +;; (define (open-ssh-gateway machine) +;; "Initiate an SSH connection gateway to MACHINE, and return the PID of the +;; running lsh gateway upon success, or #f on failure." +;; (catch 'system-error +;; (lambda () +;; (let* ((port (open-pipe* OPEN_READ %lsh-command +;; "-l" (build-machine-user machine) +;; "-i" (build-machine-private-key machine) +;; ;; XXX: With lsh 2.1, passing '--write-pid' +;; ;; last causes the PID not to be printed. +;; "--write-pid" "--gateway" "--background" "-z" +;; (build-machine-name machine))) +;; (line (read-line port)) +;; (status (close-pipe port))) +;; (if (zero? status) +;; (let ((pid (string->number line))) +;; (if (integer? pid) +;; pid +;; (begin +;; (warning (_ "'~a' did not write its PID on stdout: ~s~%") +;; %lsh-command line) +;; #f))) +;; (begin +;; (warning (_ "failed to initiate SSH connection to '~a':\ +;; '~a' exited with ~a~%") +;; (build-machine-name machine) +;; %lsh-command +;; (status:exit-val status)) +;; #f)))) +;; (lambda args +;; (leave (_ "failed to execute '~a': ~a~%") +;; %lsh-command (strerror (system-error-errno args)))))) (define (remote-pipe machine mode command) "Run COMMAND on MACHINE, assuming an lsh gateway has been set up." @@ -161,6 +163,10 @@ running lsh gateway upon success, or #f on failure." (lambda () (apply open-pipe* mode %lshg-command "-l" (build-machine-user machine) "-z" + + ;; XXX: Remove '-i' when %LSHG-COMMAND really is lshg. + "-i" (build-machine-private-key machine) + (build-machine-name machine) command)) (lambda args @@ -168,9 +174,89 @@ running lsh gateway upon success, or #f on failure." %lshg-command (strerror (system-error-errno args))) #f))) + +;;; +;;; Synchronization. +;;; + +(define (lock-file file) + "Wait and acquire an exclusive lock on FILE. Return an open port." + (mkdir-p (dirname file)) + (let ((port (open-file file "w0"))) + (fcntl-flock port 'write-lock) + port)) + +(define (unlock-file lock) + "Unlock LOCK." + (fcntl-flock lock 'unlock) + (close-port lock) + #t) + +(define-syntax-rule (with-file-lock file exp ...) + "Wait to acquire a lock on FILE and evaluate EXP in that context." + (let ((port (lock-file file))) + (dynamic-wind + (lambda () + #t) + (lambda () + exp ...) + (lambda () + (unlock-file port))))) + +(define-syntax-rule (with-machine-lock machine hint exp ...) + "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that +context." + (with-file-lock (machine-lock-file machine hint) + exp ...)) + + +(define (machine-slot-file machine slot) + "Return the file name of MACHINE's file for SLOT." + ;; For each machine we have a bunch of files representing each build slot. + ;; When choosing a build machine, we attempt to get an exclusive lock on one + ;; of these; if we fail, that means all the build slots are already taken. + ;; Inspired by Nix's build-remote.pl. + (string-append (string-append %state-directory "/offload/" + (build-machine-name machine) + "/" (number->string slot)))) + +(define (acquire-build-slot machine) + "Attempt to acquire a build slot on MACHINE. Return the port representing +the slot, or #f if none is available. + +This mechanism allows us to set a hard limit on the number of simultaneous +connections allowed to MACHINE." + (mkdir-p (dirname (machine-slot-file machine 0))) + (with-machine-lock machine 'slots + (any (lambda (slot) + (let ((port (open-file (machine-slot-file machine slot) + "w0"))) + (catch 'flock-error + (lambda () + (fcntl-flock port 'write-lock #:wait? #f) + ;; Got it! + (format (current-error-port) + "process ~a acquired build slot '~a'~%" + (getpid) (port-filename port)) + port) + (lambda args + ;; PORT is already locked by another process. + (close-port port) + #f)))) + (iota (build-machine-parallel-builds machine))))) + +(define (release-build-slot slot) + "Release SLOT, a build slot as returned as by 'acquire-build-slot'." + (close-port slot)) + + +;;; +;;; Offloading. +;;; + (define* (offload drv machine #:key print-build-trace? (max-silent-time 3600) - (build-timeout 7200) (log-port (current-output-port))) + build-timeout (log-port (current-output-port))) "Perform DRV on MACHINE, assuming DRV and its prerequisites are available there, and write the build log to LOG-PORT. Return the exit status." (format (current-error-port) "offloading '~a' to '~a'...~%" @@ -181,9 +267,12 @@ there, and write the build log to LOG-PORT. Return the exit status." ;; FIXME: Protect DRV from garbage collection on MACHINE. (let ((pipe (remote-pipe machine OPEN_READ `("guix" "build" - ;; FIXME: more options ,(format #f "--max-silent-time=~a" max-silent-time) + ,@(if build-timeout + (list (format #f "--timeout=~a" + build-timeout)) + '()) ,(derivation-file-name drv))))) (let loop ((line (read-line pipe))) (unless (eof-object? line) @@ -193,6 +282,43 @@ there, and write the build log to LOG-PORT. Return the exit status." (close-pipe pipe))) +(define* (transfer-and-offload drv machine + #:key + (inputs '()) + (outputs '()) + (max-silent-time 3600) + build-timeout + print-build-trace?) + "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of +INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from +MACHINE." + ;; Acquire MACHINE's exclusive lock to serialize file transfers + ;; to/from MACHINE in the presence of several 'offload' hook + ;; instance. + (when (with-machine-lock machine 'bandwidth + (send-files (cons (derivation-file-name drv) inputs) + machine)) + (let ((status (offload drv machine + #:print-build-trace? print-build-trace? + #:max-silent-time max-silent-time + #:build-timeout build-timeout))) + (if (zero? status) + (begin + ;; Likewise (see above.) + (with-machine-lock machine 'bandwidth + (retrieve-files outputs machine)) + (format (current-error-port) + "done with offloaded '~a'~%" + (derivation-file-name drv))) + (begin + (format (current-error-port) + "derivation '~a' offloaded to '~a' failed \ +with exit code ~a~%" + (derivation-file-name drv) + (build-machine-name machine) + (status:exit-val status)) + (primitive-exit (status:exit-val status))))))) + (define (send-files files machine) "Send the subset of FILES that's missing to MACHINE's store. Return #t on success, #f otherwise." @@ -256,6 +382,11 @@ success, #f otherwise." (zero? (close-pipe pipe))))))) + +;;; +;;; Scheduling. +;;; + (define (machine-matches? machine requirements) "Return #t if MACHINE matches REQUIREMENTS." (and (string=? (build-requirements-system requirements) @@ -268,57 +399,124 @@ success, #f otherwise." "Return #t if M1 is faster than M2." (> (build-machine-speed m1) (build-machine-speed m2))) -(define (choose-build-machine requirements machines) - "Return the best machine among MACHINES fulfilling REQUIREMENTS, or #f." - ;; FIXME: Take machine load into account, and/or shuffle MACHINES. - (let ((machines (sort (filter (cut machine-matches? <> requirements) - machines) - machine-faster?))) - (match machines - ((head . _) - head) - (_ #f)))) +(define (machine-load machine) + "Return the load of MACHINE, divided by the number of parallel builds +allowed on MACHINE." + (let* ((pipe (remote-pipe machine OPEN_READ `("cat" "/proc/loadavg"))) + (line (read-line pipe))) + (close-pipe pipe) + (if (eof-object? line) + 1. + (match (string-tokenize line) + ((one five fifteen . _) + (let* ((raw (string->number five)) + (jobs (build-machine-parallel-builds machine)) + (normalized (/ raw jobs))) + (format (current-error-port) "load on machine '~a' is ~s\ + (normalized: ~s)~%" + (build-machine-name machine) raw normalized) + normalized)) + (_ + 1.))))) + +(define (machine-less-loaded? m1 m2) + "Return #t if the load on M1 is lower than that on M2." + (< (machine-load m1) (machine-load m2))) + +(define (machine-less-loaded-or-faster? m1 m2) + "Return #t if M1 is either less loaded or faster than M2." + (or (machine-less-loaded? m1 m2) + (machine-faster? m1 m2))) + +(define (machine-lock-file machine hint) + "Return the name of MACHINE's lock file for HINT." + (string-append %state-directory "/offload/" + (build-machine-name machine) + "." (symbol->string hint) ".lock")) + +(define (machine-choice-lock-file) + "Return the name of the file used as a lock when choosing a build machine." + (string-append %state-directory "/offload/machine-choice.lock")) + + +(define %slots + ;; List of acquired build slots (open ports). + '()) + +(define (choose-build-machine machines) + "Return the best machine among MACHINES, or #f." + + ;; Proceed like this: + ;; 1. Acquire the global machine-choice lock. + ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out + ;; those machines for which we failed. + ;; 3. Choose the best machine among those that are left. + ;; 4. Release the previously-acquired build slots of the other machines. + ;; 5. Release the global machine-choice lock. + + (with-file-lock (machine-choice-lock-file) + (define machines+slots + (filter-map (lambda (machine) + (let ((slot (acquire-build-slot machine))) + (and slot (list machine slot)))) + machines)) + + (define (undecorate pred) + (match-lambda + ((machine slot) + (and (pred machine) + (list machine slot))))) + + (let ((machines+slots (sort machines+slots + (undecorate machine-less-loaded-or-faster?)))) + (match machines+slots + (((best slot) (others slots) ...) + ;; Release slots from the uninteresting machines. + (for-each release-build-slot slots) + + ;; Return the best machine unless it's already overloaded. + (if (< (machine-load best) 2.) + (begin + ;; Prevent SLOT from being GC'd. + (set! %slots (cons slot %slots)) + best) + (begin + (release-build-slot slot) + #f))) + (() #f))))) (define* (process-request wants-local? system drv features #:key print-build-trace? (max-silent-time 3600) - (build-timeout 7200)) + build-timeout) "Process a request to build DRV." - (let* ((local? (and wants-local? (string=? system (%current-system)))) - (reqs (build-requirements - (system system) - (features features))) - (machine (choose-build-machine reqs (build-machines)))) - (if machine - (match (open-ssh-gateway machine) - ((? integer? pid) - (display "# accept\n") - (let ((inputs (string-tokenize (read-line))) - (outputs (string-tokenize (read-line)))) - (when (send-files (cons (derivation-file-name drv) inputs) - machine) - (let ((status (offload drv machine - #:print-build-trace? print-build-trace? - #:max-silent-time max-silent-time - #:build-timeout build-timeout))) - (kill pid SIGTERM) - (if (zero? status) - (begin - (retrieve-files outputs machine) - (format (current-error-port) - "done with offloaded '~a'~%" - (derivation-file-name drv))) - (begin - (format (current-error-port) - "derivation '~a' offloaded to '~a' failed \ -with exit code ~a~%" - (derivation-file-name drv) - (build-machine-name machine) - (status:exit-val status)) - (primitive-exit (status:exit-val status)))))))) - (#f - (display "# decline\n"))) - (display "# decline\n")))) + (let* ((local? (and wants-local? (string=? system (%current-system)))) + (reqs (build-requirements + (system system) + (features features))) + (candidates (filter (cut machine-matches? <> reqs) + (build-machines)))) + (match candidates + (() + ;; We'll never be able to match REQS. + (display "# decline\n")) + ((_ ...) + (let ((machine (choose-build-machine candidates))) + (if machine + (begin + ;; Offload DRV to MACHINE. + (display "# accept\n") + (let ((inputs (string-tokenize (read-line))) + (outputs (string-tokenize (read-line)))) + (transfer-and-offload drv machine + #:inputs inputs + #:outputs outputs + #:max-silent-time max-silent-time + #:build-timeout build-timeout + #:print-build-trace? print-build-trace?))) + + ;; Not now, all the machines are busy. + (display "# postpone\n"))))))) (define-syntax-rule (with-nar-error-handling body ...) "Execute BODY with any &nar-error suitably reported to the user." @@ -388,4 +586,9 @@ This tool is meant to be used internally by 'guix-daemon'.\n")) (x (leave (_ "invalid arguments: ~{~s ~}~%") x)))) +;;; Local Variables: +;;; eval: (put 'with-machine-lock 'scheme-indent-function 2) +;;; eval: (put 'with-file-lock 'scheme-indent-function 1) +;;; End: + ;;; offload.scm ends here |