Merge pull request #244656 from jokatzke/trafilatura

python3Packages.trafilatura: init at 1.6.3

authored by Sandro and committed by GitHub bc48aabf aeddcf46

+279
+6
maintainers/maintainer-list.nix
··· 9162 fingerprint = "7249 70E6 A661 D84E 8B47 678A 0590 93B1 A278 BCD0"; 9163 }]; 9164 }; 9165 joko = { 9166 email = "ioannis.koutras@gmail.com"; 9167 github = "jokogr";
··· 9162 fingerprint = "7249 70E6 A661 D84E 8B47 678A 0590 93B1 A278 BCD0"; 9163 }]; 9164 }; 9165 + jokatzke = { 9166 + email = "jokatzke@fastmail.com"; 9167 + github = "jokatzke"; 9168 + githubId = 46931073; 9169 + name = "Jonas Katzke"; 9170 + }; 9171 joko = { 9172 email = "ioannis.koutras@gmail.com"; 9173 github = "jokogr";
+54
pkgs/development/python-modules/courlan/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , fetchPypi 4 + , langcodes 5 + , pytestCheckHook 6 + , tld 7 + , urllib3 8 + , pythonOlder 9 + }: 10 + 11 + buildPythonPackage rec { 12 + pname = "courlan"; 13 + version = "0.9.5"; 14 + format = "setuptools"; 15 + 16 + disabled = pythonOlder "3.6"; 17 + 18 + src = fetchPypi { 19 + inherit pname version; 20 + hash = "sha256-ONw1suO/H11RbQDVGsEuveVD40F8a+b2oic8D8W1s1M="; 21 + }; 22 + 23 + propagatedBuildInputs = [ 24 + langcodes 25 + tld 26 + urllib3 27 + ]; 28 + 29 + nativeCheckInputs = [ 30 + pytestCheckHook 31 + ]; 32 + 33 + # disable tests that require an internet connection 34 + disabledTests = [ 35 + "test_urlcheck" 36 + ]; 37 + 38 + # nixify path to the courlan binary in the test suite 39 + postPatch = '' 40 + substituteInPlace tests/unit_tests.py \ 41 + --replace "\"courlan --help\"" "\"$out/bin/courlan --help\"" \ 42 + --replace "courlan_bin = \"courlan\"" "courlan_bin = \"$out/bin/courlan\"" 43 + ''; 44 + 45 + pythonImportsCheck = [ "courlan" ]; 46 + 47 + meta = with lib; { 48 + description = "Clean, filter and sample URLs to optimize data collection"; 49 + homepage = "https://github.com/adbar/courlan"; 50 + changelog = "https://github.com/adbar/courlan/blob/v${version}/HISTORY.md"; 51 + license = licenses.gpl3Plus; 52 + maintainers = with maintainers; [ jokatzke ]; 53 + }; 54 + }
+56
pkgs/development/python-modules/htmldate/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , fetchPypi 4 + , pythonOlder 5 + , charset-normalizer 6 + , dateparser 7 + , lxml 8 + , pytestCheckHook 9 + , python-dateutil 10 + , urllib3 11 + , backports-datetime-fromisoformat 12 + }: 13 + 14 + buildPythonPackage rec { 15 + pname = "htmldate"; 16 + version = "1.6.0"; 17 + format = "setuptools"; 18 + 19 + disabled = pythonOlder "3.6"; 20 + 21 + src = fetchPypi { 22 + inherit pname version; 23 + hash = "sha256-WCfI9iahaACinlfoGIo9MtCwjKTHvWYlN7c7u/IsRaY="; 24 + }; 25 + 26 + propagatedBuildInputs = [ 27 + charset-normalizer 28 + dateparser 29 + lxml 30 + python-dateutil 31 + urllib3 32 + ] ++ lib.optionals (pythonOlder "3.7") [ 33 + backports-datetime-fromisoformat 34 + ]; 35 + 36 + nativeCheckInputs = [ 37 + pytestCheckHook 38 + ]; 39 + 40 + # disable tests that require an internet connection 41 + disabledTests = [ 42 + "test_input" 43 + "test_cli" 44 + "test_download" 45 + ]; 46 + 47 + pythonImportsCheck = [ "htmldate" ]; 48 + 49 + meta = with lib; { 50 + description = "Fast and robust extraction of original and updated publication dates from URLs and web pages"; 51 + homepage = "https://htmldate.readthedocs.io"; 52 + changelog = "https://github.com/adbar/htmldate/blob/v${version}/CHANGELOG.md"; 53 + license = licenses.gpl3Plus; 54 + maintainers = with maintainers; [ jokatzke ]; 55 + }; 56 + }
+43
pkgs/development/python-modules/justext/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , fetchFromGitHub 4 + , pytestCheckHook 5 + , lxml 6 + }: 7 + 8 + buildPythonPackage rec { 9 + pname = "justext"; 10 + version = "3.0.0"; 11 + format = "setuptools"; 12 + 13 + src = fetchFromGitHub { 14 + owner = "miso-belica"; 15 + repo = "jusText"; 16 + rev = "refs/tags/v${version}"; 17 + hash = "sha256-WNxDoM5666tEHS9pMl5dOoig4S7dSYaCLZq71tehWqw="; 18 + }; 19 + 20 + propagatedBuildInputs = [ 21 + lxml 22 + ]; 23 + 24 + nativeCheckInputs = [ 25 + pytestCheckHook 26 + ]; 27 + 28 + # patch out coverage report 29 + postPatch = '' 30 + substituteInPlace setup.cfg \ 31 + --replace " --cov=justext --cov-report=term-missing --no-cov-on-fail" "" 32 + ''; 33 + 34 + pythonImportsCheck = [ "justext" ]; 35 + 36 + meta = with lib; { 37 + description = "Heuristic based boilerplate removal tool"; 38 + homepage = "https://github.com/miso-belica/jusText"; 39 + changelog = "https://github.com/miso-belica/jusText/blob/v${version}/CHANGELOG.rst"; 40 + license = licenses.bsd2; 41 + maintainers = with maintainers; [ jokatzke ]; 42 + }; 43 + }
+43
pkgs/development/python-modules/py3langid/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , fetchPypi 4 + , pythonOlder 5 + , numpy 6 + , pytestCheckHook 7 + }: 8 + 9 + buildPythonPackage rec { 10 + pname = "py3langid"; 11 + version = "0.2.2"; 12 + format = "setuptools"; 13 + 14 + disabled = pythonOlder "3.6"; 15 + 16 + src = fetchPypi { 17 + inherit pname version; 18 + hash = "sha256-tN4B2tfnAfKdIWoJNeheCWzIZ1kD0j6oRFsrtfCQuW8="; 19 + }; 20 + 21 + propagatedBuildInputs = [ 22 + numpy 23 + ]; 24 + 25 + nativeCheckInputs = [ 26 + pytestCheckHook 27 + ]; 28 + 29 + # nixify path to the langid binary in the test suite 30 + postPatch = '' 31 + substituteInPlace tests/test_langid.py --replace "'langid'" "'$out/bin/langid'" 32 + ''; 33 + 34 + pythonImportsCheck = [ "py3langid" ]; 35 + 36 + meta = with lib; { 37 + description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times"; 38 + homepage = "https://github.com/adbar/py3langid"; 39 + changelog = "https://github.com/adbar/py3langid/blob/v${version}/HISTORY.rst"; 40 + license = licenses.bsd3; 41 + maintainers = with maintainers; [ jokatzke ]; 42 + }; 43 + }
+67
pkgs/development/python-modules/trafilatura/default.nix
···
··· 1 + { lib 2 + , buildPythonPackage 3 + , fetchPypi 4 + , pytestCheckHook 5 + , pythonOlder 6 + , certifi 7 + , charset-normalizer 8 + , courlan 9 + , htmldate 10 + , justext 11 + , lxml 12 + , urllib3 13 + }: 14 + 15 + buildPythonPackage rec { 16 + pname = "trafilatura"; 17 + version = "1.6.3"; 18 + format = "setuptools"; 19 + 20 + disabled = pythonOlder "3.6"; 21 + 22 + src = fetchPypi { 23 + inherit pname version; 24 + hash = "sha256-Zx3W4AAOEBxLzo1w9ECLy3n8vyJ17iVZHv4z4sihYA0="; 25 + }; 26 + 27 + propagatedBuildInputs = [ 28 + certifi 29 + charset-normalizer 30 + courlan 31 + htmldate 32 + justext 33 + lxml 34 + urllib3 35 + ]; 36 + 37 + nativeCheckInputs = [ pytestCheckHook ]; 38 + 39 + # disable tests that require an internet connection 40 + disabledTests = [ 41 + "test_download" 42 + "test_fetch" 43 + "test_redirection" 44 + "test_meta_redirections" 45 + "test_crawl_page" 46 + "test_whole" 47 + "test_probing" 48 + "test_cli_pipeline" 49 + ]; 50 + 51 + # patch out gui cli because it is not supported in this packaging 52 + # nixify path to the trafilatura binary in the test suite 53 + postPatch = '' 54 + substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' "" 55 + substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'" 56 + ''; 57 + 58 + pythonImportsCheck = [ "trafilatura" ]; 59 + 60 + meta = with lib; { 61 + description = "Python package and command-line tool designed to gather text on the Web"; 62 + homepage = "https://trafilatura.readthedocs.io"; 63 + changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md"; 64 + license = licenses.gpl3Plus; 65 + maintainers = with maintainers; [ jokatzke ]; 66 + }; 67 + }
+10
pkgs/top-level/python-packages.nix
··· 2397 qemu = pkgs.qemu; 2398 }; 2399 2400 cov-core = callPackage ../development/python-modules/cov-core { }; 2401 2402 coverage = callPackage ../development/python-modules/coverage { }; ··· 5280 5281 html5-parser = callPackage ../development/python-modules/html5-parser { }; 5282 5283 htmllaundry = callPackage ../development/python-modules/htmllaundry { }; 5284 5285 htmllistparse = callPackage ../development/python-modules/htmllistparse { }; ··· 6057 6058 justbytes = callPackage ../development/python-modules/justbytes { }; 6059 6060 justnimbus = callPackage ../development/python-modules/justnimbus { }; 6061 6062 jwcrypto = callPackage ../development/python-modules/jwcrypto { }; ··· 10084 py3buddy = toPythonModule (callPackage ../development/python-modules/py3buddy { }); 10085 10086 py3exiv2 = callPackage ../development/python-modules/py3exiv2 { }; 10087 10088 py3nvml = callPackage ../development/python-modules/py3nvml { }; 10089 ··· 14667 tracing = callPackage ../development/python-modules/tracing { }; 14668 14669 trackpy = callPackage ../development/python-modules/trackpy { }; 14670 14671 trailrunner = callPackage ../development/python-modules/trailrunner {}; 14672
··· 2397 qemu = pkgs.qemu; 2398 }; 2399 2400 + courlan = callPackage ../development/python-modules/courlan { }; 2401 + 2402 cov-core = callPackage ../development/python-modules/cov-core { }; 2403 2404 coverage = callPackage ../development/python-modules/coverage { }; ··· 5282 5283 html5-parser = callPackage ../development/python-modules/html5-parser { }; 5284 5285 + htmldate = callPackage ../development/python-modules/htmldate { }; 5286 + 5287 htmllaundry = callPackage ../development/python-modules/htmllaundry { }; 5288 5289 htmllistparse = callPackage ../development/python-modules/htmllistparse { }; ··· 6061 6062 justbytes = callPackage ../development/python-modules/justbytes { }; 6063 6064 + justext = callPackage ../development/python-modules/justext { }; 6065 + 6066 justnimbus = callPackage ../development/python-modules/justnimbus { }; 6067 6068 jwcrypto = callPackage ../development/python-modules/jwcrypto { }; ··· 10090 py3buddy = toPythonModule (callPackage ../development/python-modules/py3buddy { }); 10091 10092 py3exiv2 = callPackage ../development/python-modules/py3exiv2 { }; 10093 + 10094 + py3langid = callPackage ../development/python-modules/py3langid { }; 10095 10096 py3nvml = callPackage ../development/python-modules/py3nvml { }; 10097 ··· 14675 tracing = callPackage ../development/python-modules/tracing { }; 14676 14677 trackpy = callPackage ../development/python-modules/trackpy { }; 14678 + 14679 + trafilatura = callPackage ../development/python-modules/trafilatura { }; 14680 14681 trailrunner = callPackage ../development/python-modules/trailrunner {}; 14682