# Package expression for trafilatura: a Python package and command-line tool
# designed to gather text on the Web.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build backend
  setuptools,

  # runtime dependencies
  certifi,
  charset-normalizer,
  courlan,
  htmldate,
  justext,
  lxml,
  urllib3,

  # test runner
  pytestCheckHook,
}:

let
  pname = "trafilatura";
  version = "2.0.0";

  # Git tag of the release; used for both the source fetch and the changelog URL.
  tag = "v${version}";

  # Tests disabled because of the errors reported in
  # https://github.com/adbar/trafilatura/issues/805
  knownFailingTests = [
    # TypeError: argument of type 'NoneType' is not iterable
    "test_external"
    "test_extract"

    # AttributeError: 'NoneType' object has no attribute 'find'
    "test_table_processing"
  ];

  # Tests that require an internet connection.
  networkTests = [
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_feeds_helpers"
    "test_fetch"
    "test_input_type"
    "test_is_live_page"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];
in
buildPythonPackage {
  inherit pname version;
  pyproject = true;

  src = fetchFromGitHub {
    inherit tag;
    owner = "adbar";
    repo = "trafilatura";
    hash = "sha256-Cf1W3JEGSMkVmRZVTXYsXzZK/Nt/aDG890Sf0/0OZAA=";
  };

  # The CLI tests invoke the program by bare name; rewrite that to the
  # absolute path of the executable in this package's "out" output.
  # --replace-fail makes the build fail if the expected line is not found.
  postPatch = ''
    # nixify path to the trafilatura binary in the test suite
    substituteInPlace tests/cli_tests.py \
    --replace-fail 'trafilatura_bin = "trafilatura"' \
    'trafilatura_bin = "${placeholder "out"}/bin/trafilatura"'
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  pythonImportsCheck = [ "trafilatura" ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Concatenation keeps the same order as a single flat list would have.
  disabledTests = knownFailingTests ++ networkTests;

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/${tag}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.jokatzke ];
    mainProgram = "trafilatura";
  };
}