1{
2 lib,
3 stdenv,
4 buildPythonPackage,
5 fetchFromGitHub,
6
7 # build-system
8 cython,
9 distlib,
10 grpcio-tools,
11 jinja2,
12 jsonpickle,
13 jsonschema,
14 mypy-protobuf,
15 redis,
16 setuptools,
17 yapf,
18
19 # dependencies
20 crcmod,
21 dill,
22 fastavro,
23 fasteners,
24 grpcio,
25 hdfs,
26 httplib2,
27 numpy,
28 objsize,
29 orjson,
30 proto-plus,
31 protobuf,
32 pyarrow,
33 pydot,
34 pymongo,
35 python-dateutil,
36 pytz,
37 regex,
38 requests,
39 typing-extensions,
40 zstandard,
41
42 # tests
43 python,
44 docstring-parser,
45 freezegun,
46 hypothesis,
47 mock,
48 pandas,
49 parameterized,
50 psycopg2,
51 pyhamcrest,
52 pytest-xdist,
53 pytestCheckHook,
54 pyyaml,
55 requests-mock,
56 scikit-learn,
57 sqlalchemy,
58 tenacity,
59 testcontainers,
60 pythonAtLeast,
61}:
62
63buildPythonPackage rec {
64 pname = "apache-beam";
65 version = "2.65.0";
66 pyproject = true;
67
68 src = fetchFromGitHub {
69 owner = "apache";
70 repo = "beam";
71 tag = "v${version}";
72 hash = "sha256-vDW0PVNep+egIZBe4t8IPwLgsQDmoO4rrA4wUoAHzfg=";
73 };
74
75 pythonRelaxDeps = [
76 "grpcio"
77 "jsonpickle"
78
79 # As of apache-beam v2.55.1, the requirement is cloudpickle~=2.2.1, but
80 # the current (2024-04-20) nixpkgs's cloudpickle version is 3.0.0.
81 "cloudpickle"
82
83 # See https://github.com/NixOS/nixpkgs/issues/156957
84 "dill"
85
86 "protobuf"
87
88 # As of apache-beam v2.45.0, the requirement is pyarrow<10.0.0,>=0.15.1, but
89 # the current (2023-02-22) nixpkgs's pyarrow version is 11.0.0.
90 "pyarrow"
91
92 "pydot"
93 ];
94
95 sourceRoot = "${src.name}/sdks/python";
96
97 build-system = [
98 cython
99 distlib
100 grpcio-tools
101 jinja2
102 jsonpickle
103 jsonschema
104 mypy-protobuf
105 redis
106 setuptools
107 yapf
108 ];
109
110 dependencies = [
111 crcmod
112 dill
113 fastavro
114 fasteners
115 grpcio
116 hdfs
117 httplib2
118 numpy
119 objsize
120 orjson
121 proto-plus
122 protobuf
123 pyarrow
124 pydot
125 pymongo
126 python-dateutil
127 pytz
128 regex
129 requests
130 typing-extensions
131 zstandard
132 ];
133
134 enableParallelBuilding = true;
135
136 postPatch = ''
137 substituteInPlace pyproject.toml \
138 --replace-fail "distlib==0.3.7" "distlib" \
139 --replace-fail "yapf==0.29.0" "yapf" \
140 --replace-fail "grpcio-tools==1.62.1" "grpcio-tools" \
141 --replace-fail "mypy-protobuf==3.5.0" "mypy-protobuf" \
142 --replace-fail "numpy>=1.14.3,<2.3.0" "numpy"
143
144 substituteInPlace setup.py \
145 --replace-fail " copy_tests_from_docs()" ""
146 '';
147
148 __darwinAllowLocalNetworking = true;
149
150 pythonImportsCheck = [ "apache_beam" ];
151
152 nativeCheckInputs = [
153 docstring-parser
154 freezegun
155 hypothesis
156 mock
157 pandas
158 parameterized
159 psycopg2
160 pyhamcrest
161 pytest-xdist
162 pytestCheckHook
163 pyyaml
164 requests-mock
165 scikit-learn
166 sqlalchemy
167 tenacity
168 testcontainers
169 ];
170
171 # Make sure we're running the tests for the actually installed
172 # package, so that cython's .so files are available.
173 preCheck = ''
174 cd $out/${python.sitePackages}
175 '';
176
177 disabledTestPaths =
178 [
179 # Fails with
180 # _______ ERROR collecting apache_beam/io/external/xlang_jdbcio_it_test.py _______
181 # apache_beam/io/external/xlang_jdbcio_it_test.py:80: in <module>
182 # class CrossLanguageJdbcIOTest(unittest.TestCase):
183 # apache_beam/io/external/xlang_jdbcio_it_test.py:99: in CrossLanguageJdbcIOTest
184 # container_init: Callable[[], Union[PostgresContainer, MySqlContainer]],
185 # E NameError: name 'MySqlContainer' is not defined
186 #
187 "apache_beam/io/external/xlang_jdbcio_it_test.py"
188
189 # These tests depend on the availability of specific server backends.
190 "apache_beam/runners/portability/flink_runner_test.py"
191 "apache_beam/runners/portability/samza_runner_test.py"
192 "apache_beam/runners/portability/spark_runner_test.py"
193
194 # Fails starting from dill 0.3.6 because it tries to pickle pytest globals:
195 # https://github.com/uqfoundation/dill/issues/482#issuecomment-1139017499.
196 "apache_beam/transforms/window_test.py"
197
198 # See https://github.com/apache/beam/issues/25390.
199 "apache_beam/coders/slow_coders_test.py"
200 "apache_beam/dataframe/pandas_doctests_test.py"
201 "apache_beam/typehints/typed_pipeline_test.py"
202 "apache_beam/coders/fast_coders_test.py"
203 "apache_beam/dataframe/schemas_test.py"
204
205 # Fails with TypeError: cannot pickle 'EncodedFile' instances
206 # Upstream issue https://github.com/apache/beam/issues/33889
207 "apache_beam/options/pipeline_options_validator_test.py"
208 "apache_beam/yaml/main_test.py"
209 "apache_beam/yaml/programming_guide_test.py"
210 "apache_beam/yaml/readme_test.py"
211 "apache_beam/yaml/yaml_combine_test.py"
212 "apache_beam/yaml/yaml_enrichment_test.py"
213 "apache_beam/yaml/yaml_io_test.py"
214 "apache_beam/yaml/yaml_join_test.py"
215 "apache_beam/yaml/yaml_mapping_test.py"
216 "apache_beam/yaml/yaml_ml_test.py"
217 "apache_beam/yaml/yaml_provider_unit_test.py"
218
219 # FIXME All of these fail due to a single error: AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
220 # Upstream issue https://github.com/apache/beam/issues/33854
221 "apache_beam/coders/row_coder_test.py"
222 "apache_beam/examples/avro_nyc_trips_test.py"
223 "apache_beam/examples/complete/autocomplete_test.py"
224 "apache_beam/examples/complete/estimate_pi_test.py"
225 "apache_beam/examples/complete/game/game_stats_test.py"
226 "apache_beam/examples/complete/game/hourly_team_score_test.py"
227 "apache_beam/examples/complete/game/leader_board_test.py"
228 "apache_beam/examples/complete/game/user_score_test.py"
229 "apache_beam/examples/complete/tfidf_test.py"
230 "apache_beam/examples/complete/top_wikipedia_sessions_test.py"
231 "apache_beam/examples/cookbook/bigquery_side_input_test.py"
232 "apache_beam/examples/cookbook/bigquery_tornadoes_test.py"
233 "apache_beam/examples/cookbook/coders_test.py"
234 "apache_beam/examples/cookbook/combiners_test.py"
235 "apache_beam/examples/cookbook/custom_ptransform_test.py"
236 "apache_beam/examples/cookbook/filters_test.py"
237 "apache_beam/examples/matrix_power_test.py"
238 "apache_beam/examples/snippets/snippets_test.py"
239 "apache_beam/examples/snippets/transforms/aggregation/approximatequantiles_test.py"
240 "apache_beam/examples/snippets/transforms/aggregation/approximateunique_test.py"
241 "apache_beam/examples/snippets/transforms/aggregation/batchelements_test.py"
242 "apache_beam/examples/snippets/transforms/aggregation/cogroupbykey_test.py"
243 "apache_beam/examples/snippets/transforms/aggregation/combineglobally_test.py"
244 "apache_beam/examples/snippets/transforms/aggregation/combineperkey_test.py"
245 "apache_beam/examples/snippets/transforms/aggregation/combinevalues_test.py"
246 "apache_beam/examples/snippets/transforms/aggregation/count_test.py"
247 "apache_beam/examples/snippets/transforms/aggregation/distinct_test.py"
248 "apache_beam/examples/snippets/transforms/aggregation/groupbykey_test.py"
249 "apache_beam/examples/snippets/transforms/aggregation/groupintobatches_test.py"
250 "apache_beam/examples/snippets/transforms/aggregation/latest_test.py"
251 "apache_beam/examples/snippets/transforms/aggregation/max_test.py"
252 "apache_beam/examples/snippets/transforms/aggregation/mean_test.py"
253 "apache_beam/examples/snippets/transforms/aggregation/min_test.py"
254 "apache_beam/examples/snippets/transforms/aggregation/sample_test.py"
255 "apache_beam/examples/snippets/transforms/aggregation/sum_test.py"
256 "apache_beam/examples/snippets/transforms/aggregation/tolist_test.py"
257 "apache_beam/examples/snippets/transforms/aggregation/top_test.py"
258 "apache_beam/examples/snippets/transforms/elementwise/filter_test.py"
259 "apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py"
260 "apache_beam/examples/snippets/transforms/elementwise/keys_test.py"
261 "apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py"
262 "apache_beam/examples/snippets/transforms/elementwise/map_test.py"
263 "apache_beam/examples/snippets/transforms/elementwise/pardo_test.py"
264 "apache_beam/examples/snippets/transforms/elementwise/partition_test.py"
265 "apache_beam/examples/snippets/transforms/elementwise/regex_test.py"
266 "apache_beam/examples/snippets/transforms/elementwise/tostring_test.py"
267 "apache_beam/examples/snippets/transforms/elementwise/values_test.py"
268 "apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py"
269 "apache_beam/examples/snippets/transforms/other/create_test.py"
270 "apache_beam/examples/snippets/transforms/other/flatten_test.py"
271 "apache_beam/examples/snippets/transforms/other/window_test.py"
272 "apache_beam/examples/snippets/util_test.py"
273 "apache_beam/io/avroio_test.py"
274 "apache_beam/io/concat_source_test.py"
275 "apache_beam/io/filebasedsink_test.py"
276 "apache_beam/io/filebasedsource_test.py"
277 "apache_beam/io/fileio_test.py"
278 "apache_beam/io/mongodbio_test.py"
279 "apache_beam/io/parquetio_test.py"
280 "apache_beam/io/sources_test.py"
281 "apache_beam/io/textio_test.py"
282 "apache_beam/io/tfrecordio_test.py"
283 "apache_beam/metrics/metric_test.py"
284 "apache_beam/ml/inference/base_test.py"
285 "apache_beam/ml/inference/sklearn_inference_test.py"
286 "apache_beam/ml/inference/utils_test.py"
287 "apache_beam/ml/rag/chunking/base_test.py"
288 "apache_beam/ml/rag/ingestion/base_test.py"
289 "apache_beam/pipeline_test.py"
290 "apache_beam/runners/direct/direct_runner_test.py"
291 "apache_beam/runners/direct/sdf_direct_runner_test.py"
292 "apache_beam/runners/interactive/interactive_beam_test.py"
293 "apache_beam/runners/interactive/interactive_runner_test.py"
294 "apache_beam/runners/interactive/non_interactive_runner_test.py"
295 "apache_beam/runners/interactive/recording_manager_test.py"
296 "apache_beam/runners/portability/fn_api_runner/translations_test.py"
297 "apache_beam/runners/portability/fn_api_runner/trigger_manager_test.py"
298 "apache_beam/runners/portability/stager_test.py"
299 "apache_beam/testing/synthetic_pipeline_test.py"
300 "apache_beam/testing/test_stream_test.py"
301 "apache_beam/testing/util_test.py"
302 "apache_beam/transforms/combiners_test.py"
303 "apache_beam/transforms/core_test.py"
304 "apache_beam/transforms/create_test.py"
305 "apache_beam/transforms/deduplicate_test.py"
306 "apache_beam/transforms/periodicsequence_test.py"
307 "apache_beam/transforms/ptransform_test.py"
308 "apache_beam/transforms/sideinputs_test.py"
309 "apache_beam/transforms/stats_test.py"
310 "apache_beam/transforms/transforms_keyword_only_args_test.py"
311 "apache_beam/transforms/trigger_test.py"
312 "apache_beam/transforms/userstate_test.py"
313 "apache_beam/transforms/util_test.py"
314 "apache_beam/transforms/write_ptransform_test.py"
315
316 # FIXME AttributeError: 'Namespace' object has no attribute 'test_pipeline_options'
317 # Upstream issue https://github.com/apache/beam/issues/33853
318 "apache_beam/runners/portability/prism_runner_test.py"
319
320 # FIXME ValueError: Unable to run pipeline with requirement: unsupported_requirement
321 # Upstream issue https://github.com/apache/beam/issues/33853
322 "apache_beam/yaml/yaml_transform_scope_test.py"
323 "apache_beam/yaml/yaml_transform_test.py"
324 "apache_beam/yaml/yaml_transform_unit_test.py"
325 "apache_beam/yaml/yaml_udf_test.py"
326 "apache_beam/dataframe/frames_test.py"
327
328 # FIXME Those tests do not terminate due to a grpc error (threading issue)
329 # grpc_status:14, grpc_message:"Cancelling all calls"}"
330 # Upstream issue https://github.com/apache/beam/issues/33851
331 "apache_beam/runners/portability/portable_runner_test.py"
332 ]
333 ++ lib.optionals (pythonAtLeast "3.13") [
334 # > instruction = ofs_table[pc]
335 # E KeyError: 18
336 "apache_beam/typehints/trivial_inference_test.py"
337 ];
338
339 disabledTests =
340 lib.optionals stdenv.hostPlatform.isDarwin [
341 # PermissionError: [Errno 13] Permission denied: '/tmp/...'
342 "test_cache_manager_uses_local_ib_cache_root"
343 "test_describe_all_recordings"
344 "test_find_out_correct_user_pipeline"
345 "test_get_cache_manager_creates_cache_manager_if_absent"
346 "test_streaming_cache_uses_local_ib_cache_root"
347 "test_track_user_pipeline_cleanup_non_inspectable_pipeline"
348 ]
349 ++ lib.optionals (pythonAtLeast "3.12") [
350 # TypeError: Could not determine schema for type hint Any.
351 "test_batching_beam_row_input"
352 "test_auto_convert"
353 "test_unbatching_series"
354 "test_batching_beam_row_to_dataframe"
355
356 # AssertionError: Any != <class 'int'>
357 "test_pycallable_map"
358 "testAlwaysReturnsEarly"
359
360 # TypeError: Expected Iterator in return type annotation
361 "test_get_output_batch_type"
362 ];
363
364 meta = {
365 description = "Unified model for defining both batch and streaming data-parallel processing pipelines";
366 homepage = "https://beam.apache.org/";
367 changelog = "https://github.com/apache/beam/blob/release-${version}/CHANGES.md";
368 license = lib.licenses.asl20;
369 maintainers = with lib.maintainers; [ ndl ];
370 };
371}