# Package definition for the `trafilatura` Python library and its
# command-line tool, built from the PyPI source distribution.
#
# All arguments are expected to be supplied by the caller (the Python
# package set): the builder, the fetcher, the runtime dependencies and the
# test hook.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

let
  # Bound once here and inherited below, so the fetcher and the changelog
  # URL always agree with the derivation's own name and version.
  pname = "trafilatura";
  version = "2.0.0";
in
buildPythonPackage {
  inherit pname version;
  pyproject = true;

  # Marked as disabled for interpreters older than Python 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-zrcJSm7Ml+cv6nPH26NnFMXFtXe2Rw5FINyok3BtYkc=";
  };

  # The CLI tests refer to the executable by its bare name; rewrite that
  # assignment to the absolute path under this derivation's own output.
  # `--replace-fail` aborts the build if the expected line is not found.
  postPatch = ''
    # nixify path to the trafilatura binary in the test suite
    substituteInPlace tests/cli_tests.py \
      --replace-fail 'trafilatura_bin = "trafilatura"' \
      'trafilatura_bin = "${placeholder "out"}/bin/trafilatura"'
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  # These tests need network access, so they are excluded from the check run.
  disabledTests = [
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_feeds_helpers"
    "test_fetch"
    "test_input_type"
    "test_is_live_page"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  # Smoke test: the top-level module must be importable after installation.
  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.jokatzke ];
    mainProgram = "trafilatura";
  };
}