1{ lib
2, stdenv
3, botocore
4, buildPythonPackage
5, cryptography
6, cssselect
7, fetchPypi
8, fetchpatch
9, glibcLocales
10, installShellFiles
11, itemadapter
12, itemloaders
13, jmespath
14, lxml
15, packaging
16, parsel
17, pexpect
18, protego
19, pydispatcher
20, pyopenssl
21, pytestCheckHook
22, pythonOlder
23, queuelib
24, service-identity
25, sybil
26, testfixtures
27, tldextract
28, twisted
29, w3lib
30, zope_interface
31}:
32
buildPythonPackage rec {
  pname = "scrapy";
  version = "2.11.0";
  format = "setuptools";

  disabled = pythonOlder "3.8";

  src = fetchPypi {
    inherit version;
    # The sdist is published on PyPI under the capitalized name.
    pname = "Scrapy";
    hash = "sha256-PL3tzgw/DgSC1hvi10WGg758188UsO5q37rduA9bNqU=";
  };

  patches = [
    # Fix compatibility with Twisted>=23.8. Remove with the next release.
    (fetchpatch {
      url = "https://github.com/scrapy/scrapy/commit/aa95ada42cdf570f840f55c463375f8a81b303f8.patch";
      hash = "sha256-LuhA5BqtjSUgkotplvUCtvGNYOTrl0MJRCXiSBMDFzY=";
      excludes = [
        "tests/CrawlerProcess/sleeping.py"
        "tests/test_crawler.py"
      ];
    })
  ];

  nativeBuildInputs = [
    installShellFiles
  ];

  propagatedBuildInputs = [
    cryptography
    cssselect
    itemadapter
    itemloaders
    lxml
    packaging
    parsel
    protego
    pydispatcher
    pyopenssl
    queuelib
    service-identity
    tldextract
    twisted
    w3lib
    zope_interface
  ];

  nativeCheckInputs = [
    botocore
    glibcLocales
    jmespath
    pexpect
    pytestCheckHook
    sybil
    testfixtures
  ];

  # glibcLocales is in nativeCheckInputs; presumably some tests need a
  # UTF-8 locale — verify before removing.
  LC_ALL = "en_US.UTF-8";

  disabledTestPaths = [
    "tests/test_proxy_connect.py"
    "tests/test_utils_display.py"
    "tests/test_command_check.py"
    # Don't test the documentation
    "docs"
  ];

  disabledTests = [
    # It's unclear if the failures are related to libxml2, https://github.com/NixOS/nixpkgs/pull/123890
    "test_nested_css"
    "test_nested_xpath"
    "test_flavor_detection"
    "test_follow_whitespace"
    # Requires network access
    "AnonymousFTPTestCase"
    "FTPFeedStorageTest"
    "FeedExportTest"
    "test_custom_asyncio_loop_enabled_true"
    "test_custom_loop_asyncio"
    "test_custom_loop_asyncio_deferred_signal"
    "FileFeedStoragePreFeedOptionsTest" # https://github.com/scrapy/scrapy/issues/5157
    "test_persist"
    "test_timeout_download_from_spider_nodata_rcvd"
    "test_timeout_download_from_spider_server_hangs"
    "test_unbounded_response"
    "CookiesMiddlewareTest"
    # Depends on uvloop
    "test_asyncio_enabled_reactor_different_loop"
    "test_asyncio_enabled_reactor_same_loop"
    # Fails with AssertionError
    "test_peek_fifo"
    "test_peek_one_element"
    "test_peek_lifo"
    "test_callback_kwargs"
    # Test fails on Hydra
    "test_start_requests_laziness"
  ] ++ lib.optionals stdenv.isDarwin [
    "test_xmliter_encoding"
    "test_download"
    "test_reactor_default_twisted_reactor_select"
    "URIParamsSettingTest"
    "URIParamsFeedOptionTest"
    # flaky on darwin-aarch64
    "test_fixed_delay"
    "test_start_requests_laziness"
  ];

  # The sdist ships a man page and shell completions under extras/; install them.
  postInstall = ''
    installManPage extras/scrapy.1
    installShellCompletion --cmd scrapy \
      --zsh extras/scrapy_zsh_completion \
      --bash extras/scrapy_bash_completion
  '';

  pythonImportsCheck = [
    "scrapy"
  ];

  __darwinAllowLocalNetworking = true;

  meta = {
    description = "High-level web crawling and web scraping framework";
    longDescription = ''
      Scrapy is a fast high-level web crawling and web scraping framework, used to crawl
      websites and extract structured data from their pages. It can be used for a wide
      range of purposes, from data mining to monitoring and automated testing.
    '';
    homepage = "https://scrapy.org/";
    changelog = "https://github.com/scrapy/scrapy/raw/${version}/docs/news.rst";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ marsam ];
  };
}