# Package definition for trafilatura, built from the PyPI source distribution.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

# `rec` lets `src` and `meta.changelog` refer to `pname` / `version` below.
buildPythonPackage rec {
  pname = "trafilatura";
  version = "2.0.0";
  pyproject = true;

  # Not offered for Python interpreters older than 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-zrcJSm7Ml+cv6nPH26NnFMXFtXe2Rw5FINyok3BtYkc=";
  };

  # The CLI tests invoke the program by bare name; point them at the
  # executable in this derivation's output instead. `--replace-fail` aborts
  # the build if the expected line is no longer present in the test file.
  postPatch = ''
    # nixify path to the trafilatura binary in the test suite
    substituteInPlace tests/cli_tests.py \
      --replace-fail 'trafilatura_bin = "trafilatura"' \
      'trafilatura_bin = "${placeholder "out"}/bin/trafilatura"'
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Disable tests that require an internet connection
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_feeds_helpers"
    "test_fetch"
    "test_input_type"
    "test_is_live_page"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  # Smoke test: the top-level module must be importable after installation.
  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ jokatzke ];
    mainProgram = "trafilatura";
  };
}