at 24.11-pre 75 lines 1.7 kB view raw
# Package expression for the "trafilatura" Python library and CLI.
#
# Arguments are supplied by the caller (the Python package set): `lib` for
# metadata helpers, the builder/fetcher functions, the runtime dependencies,
# and the test hook.
{
  lib,
  buildPythonPackage,
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,
}:

# `rec` lets `src` and `meta.changelog` refer to `pname` / `version` below.
buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.9.0";
  pyproject = true;

  # Not built for Python interpreters older than 3.9.
  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-5oM9KauKE+2FOTfXyR5oaLxi774QIUrCsQZDbdI9FBI=";
  };

  # Patch out gui cli because it is not supported in this packaging and
  # nixify path to the trafilatura binary in the test suite.
  # `$out` is not a Nix interpolation (no braces), so it is passed through
  # literally and expanded by the shell when the patch phase runs.
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py \
      --replace-fail "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  build-system = [ setuptools ];

  dependencies = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Disable tests that require an internet connection
    "test_cli_pipeline"
    "test_crawl_page"
    "test_download"
    "test_fetch"
    "test_meta_redirections"
    "test_probing"
    "test_queue"
    "test_redirection"
    "test_whole"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  # Attributes are referenced through `lib.` explicitly instead of a blanket
  # `with lib;`, so only the maintainer list uses a (narrow) `with` scope.
  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ jokatzke ];
    mainProgram = "trafilatura";
  };
}