# Package definition for trafilatura, built from the upstream GitHub sources
# as a pyproject-based Python package using setuptools.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build backend
  setuptools,

  # runtime dependencies
  certifi,
  charset-normalizer,
  courlan,
  htmldate,
  justext,
  lxml,
  urllib3,

  # test tooling
  pytestCheckHook,
}:

let
  version = "2.0.0";

  # Git tag fetched for this release; also used to build the changelog link.
  tag = "v${version}";
in
buildPythonPackage {
  pname = "trafilatura";
  inherit version;
  pyproject = true;

  src = fetchFromGitHub {
    owner = "adbar";
    repo = "trafilatura";
    inherit tag;
    hash = "sha256-Cf1W3JEGSMkVmRZVTXYsXzZK/Nt/aDG890Sf0/0OZAA=";
  };

  # The CLI tests hard-code the bare executable name; rewrite it to the
  # executable under this package's own output path.
  postPatch = ''
    # nixify path to the trafilatura binary in the test suite
    substituteInPlace tests/cli_tests.py \
      --replace-fail 'trafilatura_bin = "trafilatura"' \
                     'trafilatura_bin = "${placeholder "out"}/bin/trafilatura"'
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Fail with "TypeError: argument of type 'NoneType' is not iterable";
    # reported upstream: https://github.com/adbar/trafilatura/issues/805
    "test_external"
    "test_extract"

    # Fails with "AttributeError: 'NoneType' object has no attribute 'find'";
    # reported upstream: https://github.com/adbar/trafilatura/issues/805
    "test_table_processing"

    # Need an internet connection
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_feeds_helpers"
    "test_fetch"
    "test_input_type"
    "test_is_live_page"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    mainProgram = "trafilatura";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/${tag}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}