summaryrefslogtreecommitdiff
path: root/gnu/packages/python-web.scm
diff options
context:
space:
mode:
author Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-05-02 00:39:09 -0400
committer Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-05-31 14:52:33 -0400
commit 32ffbb16e8cbb9c5416274320a56885c45a88ebf (patch)
tree 6bf234e8d7ec788a3668f06cdfba30a03aabac02 /gnu/packages/python-web.scm
parent 77afe03cf9e90f597571b4181f620da0997e84af (diff)
download guix-patches-32ffbb16e8cbb9c5416274320a56885c45a88ebf.tar
guix-patches-32ffbb16e8cbb9c5416274320a56885c45a88ebf.tar.gz
gnu: Add python-html-text.
* gnu/packages/python-web.scm (python-html-text): New variable.
Diffstat (limited to 'gnu/packages/python-web.scm')
-rw-r--r-- gnu/packages/python-web.scm 22
1 files changed, 22 insertions, 0 deletions
diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index cb52db3bbd..00fe560a36 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -7387,3 +7387,25 @@ mining to monitoring and automated testing.")
Contrary to the standard Python @code{json} library, it understands js-style
comments. Trailing comma is also supported.")
(license license:expat)))
+
+(define-public python-html-text
+  (package
+    (name "python-html-text")
+    (version "0.5.2")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "html_text" version)) ;underscore, unlike the package name
+       (sha256
+        (base32 "1v9x171l3bmyayc1144nrkn9410lp4lhlrrjii54j7b5f2xipmmg"))))
+    (build-system python-build-system)
+    (native-inputs (list python-pytest))
+    (propagated-inputs (list python-lxml))
+    (home-page "https://github.com/TeamHG-Memex/html-text")
+    (synopsis "Extract text from HTML")
+    (description "HTML to Text is a Python library for extracting text from
+HTML.  Contrary to other solutions such as LXML or Beautiful Soup, the text
+extracted with @code{html_text} does not contain elements such as JavaScript
+or inline styles not normally visible to users.  It also normalizes white
+space characters in a smarter, more visually pleasing style.")
+    (license license:expat)))