# Package definition for trafilatura, built from the PyPI source distribution
# with buildPythonPackage. All arguments below are expected to be supplied by
# the surrounding Python package set.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

# `rec` is needed because `meta.changelog` refers back to `version`.
buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.12.2";
  pyproject = true;

  # Do not build for Python interpreters older than 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-TJyxQ09+E+8LFstE7h1E6EUj7HJolAuVWcN05+/8mpY=";
  };

  # Patch out the GUI CLI entry point because it is not supported in this
  # packaging, and point the test suite at the installed trafilatura binary
  # instead of relying on a bare "trafilatura" command name.
  # `--replace-fail` aborts the build if a pattern is no longer present, so a
  # future upstream change to these lines is noticed instead of silently
  # skipped.
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py \
      --replace-fail 'trafilatura_bin = "trafilatura"' \
                     'trafilatura_bin = "${placeholder "out"}/bin/trafilatura"'
  '';

  build-system = [ setuptools ];

  # Runtime Python dependencies.
  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  # The test suite is run through pytest during the check phase.
  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Disable tests that require an internet connection
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_feeds_helpers"
    "test_fetch"
    "test_is_live_page"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  # Smoke test: the top-level module must be importable after installation.
  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ jokatzke ];
    # Executable name used by `nix run` / `lib.getExe`.
    mainProgram = "trafilatura";
  };
}