From 0f6b9e9828dfc269bfc4eade771efed1753e8c62 Mon Sep 17 00:00:00 2001 From: Ricardo Wurmus Date: Thu, 3 Dec 2015 16:12:09 +0100 Subject: import: cran: Parse DESCRIPTION instead of HTML. * guix/import/cran.scm (description->alist, listify, beautify-description, description->package): New procedures. (table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove proceduces. (latest-release): Use parsed DESCRIPTION instead of SXML. * tests/cran.scm: Rewrite to match importer. --- guix/import/cran.scm | 265 +++++++++++++++++++++++++-------------------------- 1 file changed, 130 insertions(+), 135 deletions(-) (limited to 'guix/import/cran.scm') diff --git a/guix/import/cran.scm b/guix/import/cran.scm index 43dc2c80b6..845ecb5832 100644 --- a/guix/import/cran.scm +++ b/guix/import/cran.scm @@ -20,26 +20,26 @@ (define-module (guix import cran) #:use-module (ice-9 match) #:use-module (ice-9 regex) + #:use-module ((ice-9 rdelim) #:select (read-string)) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) - #:use-module (sxml simple) - #:use-module (sxml match) - #:use-module (sxml xpath) #:use-module (guix http-client) #:use-module (guix hash) #:use-module (guix store) #:use-module (guix base32) #:use-module ((guix download) #:select (download-to-store)) #:use-module (guix import utils) + #:use-module ((guix build-system r) #:select (cran-uri)) #:use-module (guix upstream) #:use-module (guix packages) + #:use-module (gnu packages) #:export (cran->guix-package %cran-updater)) ;;; Commentary: ;;; ;;; Generate a package declaration template for the latest version of an R -;;; package on CRAN, using the HTML description downloaded from +;;; package on CRAN, using the DESCRIPTION file downloaded from ;;; cran.r-project.org. ;;; ;;; Code: @@ -67,6 +67,31 @@ ((lst ...) `(list ,@(map string->license lst))) (_ #f))) + +(define (description->alist description) + "Convert a DESCRIPTION string into an alist." + (let ((lines (string-split description #\newline)) + (parse (lambda (line acc) + (if (string-null? line) acc + ;; Keys usually start with a capital letter and end with + ;; ":". There are some exceptions, unfortunately (such + ;; as "biocViews"). There are no blanks in a key. + (if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line) + ;; New key/value pair + (let* ((pos (string-index line #\:)) + (key (string-take line pos)) + (value (string-drop line (+ 1 pos)))) + (cons (cons key + (string-trim-both value)) + acc)) + ;; This is a continuation of the previous pair + (match-let ((((key . value) . rest) acc)) + (cons (cons key (string-join + (list value + (string-trim-both line)))) + rest))))))) + (fold parse '() lines))) + (define (format-inputs names) "Generate a sorted list of package inputs from a list of package NAMES." (map (lambda (name) @@ -82,125 +107,94 @@ package definition." ((package-inputs ...) `((,type (,'quasiquote ,(format-inputs package-inputs))))))) -(define (table-datum tree label) - "Extract the datum node following a LABEL in the sxml table TREE. Only the -first cell of a table row is considered a label cell." - ((node-pos 1) - ((sxpath `(xhtml:tr - (xhtml:td 1) ; only first cell can contain label - (equal? ,label) - ,(node-parent tree) ; go up to label cell - ,(node-parent tree) ; go up to matching row - (xhtml:td 2))) ; select second cell - tree))) - (define %cran-url "http://cran.r-project.org/web/packages/") (define (cran-fetch name) - "Return an sxml representation of the CRAN page for the R package NAME, -or #f on failure. NAME is case-sensitive." + "Return an alist of the contents of the DESCRIPTION file for the R package +NAME, or #f on failure. NAME is case-sensitive." ;; This API always returns the latest release of the module. - (let ((cran-url (string-append %cran-url name "/"))) - (false-if-exception - (xml->sxml (http-fetch cran-url) - #:trim-whitespace? #t - #:namespaces '((xhtml . "http://www.w3.org/1999/xhtml")) - #:default-entity-handler - (lambda (port name) - (case name - ((nbsp) " ") - ((ge) ">=") - ((gt) ">") - ((lt) "<") - (else - (format (current-warning-port) - "~a:~a:~a: undefined entitity: ~a\n" - cran-url (port-line port) (port-column port) - name) - (symbol->string name)))))))) - -(define (downloads->url downloads) - "Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the -download URL." - (string-append "mirror://cran/" - ;; Remove double dots, because we want an - ;; absolute path. - (regexp-substitute/global - #f "\\.\\./" - (string-join ((sxpath '((xhtml:a 1) @ href *text*)) - (table-datum downloads " Package source: "))) - 'pre 'post))) - -(define (nodes->text nodeset) - "Return the concatenation of the text nodes among NODESET." - (string-join ((sxpath '(// *text*)) nodeset) " ")) - -(define (cran-sxml->sexp sxml) - "Return the `package' s-expression for a CRAN package from the SXML -representation of the package page." + (let ((url (string-append %cran-url name "/DESCRIPTION"))) + (description->alist (read-string (http-fetch url))))) + +(define (listify meta field) + "Look up FIELD in the alist META. If FIELD contains a comma-separated +string, turn it into a list and strip off parenthetic expressions. Return the +empty list when the FIELD cannot be found." + (let ((value (assoc-ref meta field))) + (if (not value) + '() + ;; Strip off parentheses + (let ((items (string-split (regexp-substitute/global + #f "( *\\([^\\)]+\\)) *" + value 'pre 'post) + #\,))) + ;; When there is whitespace inside of items it is probably because + ;; this was not an actual list to begin with. + (remove (cut string-any char-set:whitespace <>) + (map string-trim-both items)))))) + +(define (beautify-description description) + "Improve the package DESCRIPTION by turning a beginning sentence fragment +into a proper sentence and by using two spaces between sentences." + (let ((cleaned (if (string-prefix? "A " description) + (string-append "This package provides a" + (substring description 1)) + description))) + ;; Use double spacing between sentences + (regexp-substitute/global #f "\\. \\b" + cleaned 'pre ". " 'post))) + +(define (description->package meta) + "Return the `package' s-expression for a CRAN package from the alist META, +which was derived from the R package's DESCRIPTION file." (define (guix-name name) (if (string-prefix? "r-" name) (string-downcase name) (string-append "r-" (string-downcase name)))) - (sxml-match-let* - (((*TOP* (xhtml:html - ,head - (xhtml:body - (xhtml:h2 ,name-and-synopsis) - (xhtml:p ,description) - ,summary - (xhtml:h4 "Downloads:") ,downloads - . ,rest))) - sxml)) - (let* ((name (match:prefix (string-match ": " name-and-synopsis))) - (synopsis (match:suffix (string-match ": " name-and-synopsis))) - (version (nodes->text (table-datum summary "Version:"))) - (license ((compose string->license nodes->text) - (table-datum summary "License:"))) - (home-page (nodes->text ((sxpath '((xhtml:a 1))) - (table-datum summary "URL:")))) - (source-url (downloads->url downloads)) - (tarball (with-store store (download-to-store store source-url))) - (sysdepends (map match:substring - (list-matches - "[^ ]+" - ;; Strip off comma and parenthetical - ;; expressions. - (regexp-substitute/global - #f "(,|\\([^\\)]+\\))" - (nodes->text (table-datum summary - "SystemRequirements:")) - 'pre 'post)))) - (imports (map guix-name - ((sxpath '(// xhtml:a *text*)) - (table-datum summary "Imports:"))))) - `(package - (name ,(guix-name name)) - (version ,version) - (source (origin - (method url-fetch) - (uri (cran-uri ,name version)) - (sha256 - (base32 - ,(bytevector->nix-base32-string (file-sha256 tarball)))))) - (build-system r-build-system) - ,@(maybe-inputs sysdepends) - ,@(maybe-inputs imports 'propagated-inputs) - (home-page ,(if (string-null? home-page) - (string-append %cran-url name) - home-page)) - (synopsis ,synopsis) - ;; Use double spacing - (description ,(regexp-substitute/global #f "\\. \\b" description - 'pre ". " 'post)) - (license ,license))))) + (let* ((name (assoc-ref meta "Package")) + (synopsis (assoc-ref meta "Title")) + (version (assoc-ref meta "Version")) + (license (string->license (assoc-ref meta "License"))) + ;; Some packages have multiple home pages. Some have none. + (home-page (match (listify meta "URL") + ((url rest ...) url) + (_ (string-append %cran-url name)))) + (source-url (match (cran-uri name version) + ((url rest ...) url) + (_ #f))) + (tarball (with-store store (download-to-store store source-url))) + (sysdepends (map string-downcase (listify meta "SystemRequirements"))) + (propagate (map guix-name (lset-union equal? + (listify meta "Imports") + (listify meta "LinkingTo") + (delete "R" + (listify meta "Depends")))))) + `(package + (name ,(guix-name name)) + (version ,version) + (source (origin + (method url-fetch) + (uri (cran-uri ,name version)) + (sha256 + (base32 + ,(bytevector->nix-base32-string (file-sha256 tarball)))))) + (properties ,`(,'quasiquote ((,'upstream-name . ,name)))) + (build-system r-build-system) + ,@(maybe-inputs sysdepends) + ,@(maybe-inputs propagate 'propagated-inputs) + (home-page ,(if (string-null? home-page) + (string-append %cran-url name) + home-page)) + (synopsis ,synopsis) + (description ,(beautify-description (assoc-ref meta "Description"))) + (license ,license)))) (define (cran->guix-package package-name) "Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the `package' s-expression corresponding to that package, or #f on failure." (let ((module-meta (cran-fetch package-name))) - (and=> module-meta cran-sxml->sexp))) + (and=> module-meta description->package))) ;;; @@ -209,32 +203,33 @@ representation of the package page." (define (latest-release package) "Return an for the latest release of PACKAGE." - (define name - (if (string-prefix? "r-" package) - (string-drop package 2) - package)) - - (define sxml - (cran-fetch name)) - - (and sxml - (sxml-match-let* - (((*TOP* (xhtml:html - ,head - (xhtml:body - (xhtml:h2 ,name-and-synopsis) - (xhtml:p ,description) - ,summary - (xhtml:h4 "Downloads:") ,downloads - . ,rest))) - sxml)) - (let ((version (nodes->text (table-datum summary "Version:"))) - (url (downloads->url downloads))) - ;; CRAN does not provide signatures. - (upstream-source - (package package) - (version version) - (urls (list url))))))) + + (define (package->cran-name package) + (match (package-source package) + ((? origin? origin) + (match (origin-uri origin) + ((url rest ...) + (let ((end (string-rindex url #\_)) + (start (string-rindex url #\/))) + ;; The URL ends on + ;; (string-append "/" name "_" version ".tar.gz") + (substring url start end))) + (_ #f))) + (_ #f))) + + (define cran-name + (package->cran-name (specification->package package))) + + (define meta + (cran-fetch cran-name)) + + (and meta + (let ((version (assoc-ref meta "Version"))) + ;; CRAN does not provide signatures. + (upstream-source + (package package) + (version version) + (urls (cran-uri cran-name version)))))) (define (cran-package? package) "Return true if PACKAGE is an R package from CRAN." -- cgit v1.2.3