nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
# ReadabiliPy: Python package plus CLI for extracting content from HTML.
#
# The Python package is built with setuptools. The npm dependencies of the
# JavaScript code shipped under readabilipy/javascript are built as a separate
# derivation (`javascript`) and symlinked into the source tree before the
# Python build runs.
{
  lib,
  beautifulsoup4,
  buildPythonPackage,
  buildNpmPackage,
  fetchFromGitHub,
  html5lib,
  lxml,
  nodejs,
  pytestCheckHook,
  regex,
  setuptools,
  testers,
  readabilipy,
}:

buildPythonPackage rec {
  pname = "readabilipy";
  version = "0.3.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "alan-turing-institute";
    repo = "ReadabiliPy";
    tag = "v${version}";
    hash = "sha256-FYdSbq3rm6fBHm5fDRAB0airX9fNcUGs1wHN4i6mnG0=";
  };

  patches = [
    # Fix test failures with Python 3.13.6
    # https://github.com/alan-turing-institute/ReadabiliPy/pull/116
    ./python3.13.6-compatibility.patch
  ];

  # npm dependencies for the JavaScript sources in readabilipy/javascript.
  # Passed as a derivation attribute so that it is available to the build
  # as the `$javascript` environment variable (used in postPatch below).
  javascript = buildNpmPackage {
    pname = "readabilipy-javascript";
    inherit version src;

    sourceRoot = "${src.name}/readabilipy/javascript";
    npmDepsHash = "sha256-1yp80TwRbE/NcMa0qrml0TlSZJ6zwSTmj+zDjBejko8=";

    # Use the lockfile kept next to this expression; npmDepsHash is computed
    # against it.
    postPatch = ''
      cp ${./package-lock.json} package-lock.json
    '';

    # Only the installed node_modules tree is needed; there is no build step.
    dontNpmBuild = true;
  };

  build-system = [ setuptools ];

  dependencies = [
    beautifulsoup4
    html5lib
    lxml
    regex
  ];

  postPatch = ''
    # Make the prebuilt node_modules visible where the Python code looks for it.
    ln -s $javascript/lib/node_modules/ReadabiliPy/node_modules readabilipy/javascript/node_modules
    # Overwrite MANIFEST.in so everything below readabilipy/javascript is listed
    # for inclusion in the built package.
    echo "recursive-include readabilipy/javascript *" >MANIFEST.in
  '';

  # Put node on the PATH of the installed executables. Handing the arguments
  # to the Python wrapper (instead of calling wrapProgram by hand in
  # postInstall) means each executable is wrapped once rather than twice.
  makeWrapperArgs = [
    "--prefix"
    "PATH"
    ":"
    (lib.makeBinPath [ nodejs ])
  ];

  nativeCheckInputs = [
    pytestCheckHook
    nodejs
  ];

  pythonImportsCheck = [ "readabilipy" ];

  disabledTestPaths = [
    # Exclude benchmarks
    "tests/test_benchmarking.py"
  ];

  disabledTests = [
    # IndexError: list index out of range
    "test_html_blacklist"
    "test_prune_div_with_one_empty_span"
    "test_prune_div_with_one_whitespace_paragraph"
    "test_empty_page"
    "test_contentless_page"
    "test_extract_title"
    "test_iframe_containing_tags"
    "test_iframe_with_source"
  ];

  passthru = {
    # The expected string also asserts that the CLI reports Readability.js
    # support, i.e. that the wrapped executable can find node.
    tests.version = testers.testVersion {
      package = readabilipy;
      command = "readabilipy --version";
      version = "${version} (Readability.js supported: yes)";
    };
  };

  meta = {
    description = "HTML content extractor";
    homepage = "https://github.com/alan-turing-institute/ReadabiliPy";
    changelog = "https://github.com/alan-turing-institute/ReadabiliPy/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ fab ];
    mainProgram = "readabilipy";
  };
}