1{
2 lib,
3 stdenv,
4 botocore,
5 buildPythonPackage,
6 cryptography,
7 cssselect,
8 fetchPypi,
9 fetchpatch,
10 glibcLocales,
11 installShellFiles,
12 itemadapter,
13 itemloaders,
14 jmespath,
15 lxml,
16 packaging,
17 parsel,
18 pexpect,
19 protego,
20 pydispatcher,
21 pyopenssl,
22 pytestCheckHook,
23 pythonOlder,
24 queuelib,
25 service-identity,
26 setuptools,
27 sybil,
28 testfixtures,
29 tldextract,
30 twisted,
31 w3lib,
32 zope-interface,
33}:
34
buildPythonPackage rec {
  pname = "scrapy";
  version = "2.11.1";
  pyproject = true;

  disabled = pythonOlder "3.8";

  src = fetchPypi {
    inherit version;
    # PyPI publishes the sdist under the capitalized project name.
    pname = "Scrapy";
    hash = "sha256-czoDnHQj5StpvygQtTMgk9TkKoSEYDWcB7Auz/j3Pr4=";
  };

  patches = [
    # https://github.com/scrapy/scrapy/pull/6316
    # fix test_get_func_args. remove on next update
    (fetchpatch {
      name = "test_get_func_args.patch";
      url = "https://github.com/scrapy/scrapy/commit/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9.patch";
      hash = "sha256-POlmsuW4SD9baKwZieKfmlp2vtdlb7aKQ62VOmNXsr0=";
    })
  ];

  nativeBuildInputs = [
    installShellFiles
    setuptools
  ];

  propagatedBuildInputs = [
    cryptography
    cssselect
    itemadapter
    itemloaders
    lxml
    packaging
    parsel
    protego
    pydispatcher
    pyopenssl
    queuelib
    service-identity
    tldextract
    twisted
    w3lib
    zope-interface
  ];

  nativeCheckInputs = [
    botocore
    glibcLocales
    jmespath
    pexpect
    pytestCheckHook
    sybil
    testfixtures
  ];

  # Several tests expect a UTF-8 locale; glibcLocales above provides it.
  LC_ALL = "en_US.UTF-8";

  disabledTestPaths = [
    # NOTE(review): reasons for the first three disables are not recorded here —
    # re-check whether they still fail on the next version bump.
    "tests/test_proxy_connect.py"
    "tests/test_utils_display.py"
    "tests/test_command_check.py"
    # Don't test the documentation
    "docs"
  ];

  disabledTests =
    [
      # It's unclear if the failures are related to libxml2, https://github.com/NixOS/nixpkgs/pull/123890
      "test_nested_css"
      "test_nested_xpath"
      "test_flavor_detection"
      "test_follow_whitespace"
      # Requires network access
      "AnonymousFTPTestCase"
      "FTPFeedStorageTest"
      "FeedExportTest"
      "test_custom_asyncio_loop_enabled_true"
      "test_custom_loop_asyncio"
      "test_custom_loop_asyncio_deferred_signal"
      "FileFeedStoragePreFeedOptionsTest" # https://github.com/scrapy/scrapy/issues/5157
      "test_persist"
      "test_timeout_download_from_spider_nodata_rcvd"
      "test_timeout_download_from_spider_server_hangs"
      "test_unbounded_response"
      "CookiesMiddlewareTest"
      # Depends on uvloop
      "test_asyncio_enabled_reactor_different_loop"
      "test_asyncio_enabled_reactor_same_loop"
      # Fails with AssertionError
      "test_peek_fifo"
      "test_peek_one_element"
      "test_peek_lifo"
      "test_callback_kwargs"
      # Test fails on Hydra
      "test_start_requests_laziness"
    ]
    ++ lib.optionals stdenv.isDarwin [
      "test_xmliter_encoding"
      "test_download"
      "test_reactor_default_twisted_reactor_select"
      "URIParamsSettingTest"
      "URIParamsFeedOptionTest"
      # flaky on darwin-aarch64
      "test_fixed_delay"
      # "test_start_requests_laziness" was previously repeated here; it is
      # already disabled unconditionally above, so the duplicate was dropped.
    ];

  # Install the man page and shell completions shipped in the sdist's extras/.
  postInstall = ''
    installManPage extras/scrapy.1
    installShellCompletion --cmd scrapy \
      --zsh extras/scrapy_zsh_completion \
      --bash extras/scrapy_bash_completion
  '';

  pythonImportsCheck = [ "scrapy" ];

  # Some tests bind to localhost; allow that inside the darwin sandbox.
  __darwinAllowLocalNetworking = true;

  meta = with lib; {
    description = "High-level web crawling and web scraping framework";
    mainProgram = "scrapy";
    longDescription = ''
      Scrapy is a fast high-level web crawling and web scraping framework, used to crawl
      websites and extract structured data from their pages. It can be used for a wide
      range of purposes, from data mining to monitoring and automated testing.
    '';
    homepage = "https://scrapy.org/";
    changelog = "https://github.com/scrapy/scrapy/raw/${version}/docs/news.rst";
    license = licenses.bsd3;
    maintainers = with maintainers; [ vinnymeller ];
  };
}