# Package definition for trafilatura, built from the PyPI sdist with
# setuptools as the PEP 517 build backend.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.9.0";
  pyproject = true;

  # Not built for interpreters older than Python 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-5oM9KauKE+2FOTfXyR5oaLxi774QIUrCsQZDbdI9FBI=";
  };

  # Two source tweaks before building:
  #  1. remove the `trafilatura_gui` entry point from setup.py, since the GUI
  #     CLI is not supported in this packaging;
  #  2. make the CLI tests invoke the installed executable by its absolute
  #     path under $out instead of the bare name `trafilatura`.
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py \
      --replace-fail "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  # These tests require an internet connection, so they are skipped.
  disabledTests = [
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_fetch"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  # `lib` is referenced explicitly rather than opened with a broad `with lib;`.
  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ jokatzke ];
    mainProgram = "trafilatura";
  };
}