1{
2 lib,
3 stdenv,
4 buildPythonPackage,
5 fetchFromGitHub,
6
7 # build-system
8 cython,
9 distlib,
10 grpcio-tools,
11 jinja2,
12 jsonpickle,
13 jsonschema,
14 mypy-protobuf,
15 redis,
16 setuptools,
17 yapf,
18
19 # dependencies
20 crcmod,
21 dill,
22 fastavro,
23 fasteners,
24 grpcio,
25 hdfs,
26 httplib2,
27 numpy,
28 objsize,
29 orjson,
30 proto-plus,
31 protobuf,
32 pyarrow,
33 pydot,
34 pymongo,
35 python-dateutil,
36 pytz,
37 regex,
38 requests,
39 typing-extensions,
40 zstandard,
41
42 # tests
43 python,
44 docstring-parser,
45 freezegun,
46 hypothesis,
47 mock,
48 pandas,
49 parameterized,
50 psycopg2,
51 pyhamcrest,
52 pytest-xdist,
53 pytestCheckHook,
54 pyyaml,
55 requests-mock,
56 scikit-learn,
57 sqlalchemy,
58 tenacity,
59 testcontainers,
60 pythonAtLeast,
61}:
62
63buildPythonPackage rec {
64 pname = "apache-beam";
65 version = "2.66.0";
66 pyproject = true;
67
68 src = fetchFromGitHub {
69 owner = "apache";
70 repo = "beam";
71 tag = "v${version}";
72 hash = "sha256-nRofy9pvhO5SUvkIk73ViFm1gPWxEhj1rAUeCVYIpYs=";
73 };
74
75 pythonRelaxDeps = [
76 "grpcio"
77 "jsonpickle"
78
79 # As of apache-beam v2.55.1, the requirement is cloudpickle~=2.2.1, but
80 # the current (2024-04-20) nixpkgs's cloudpickle version is 3.0.0.
81 "cloudpickle"
82
83 # See https://github.com/NixOS/nixpkgs/issues/156957
84 "dill"
85
86 "numpy"
87
88 "protobuf"
89
90 # As of apache-beam v2.45.0, the requirement is pyarrow<10.0.0,>=0.15.1, but
91 # the current (2023-02-22) nixpkgs's pyarrow version is 11.0.0.
92 "pyarrow"
93
94 "pydot"
95 "redis"
96 ];
97
98 sourceRoot = "${src.name}/sdks/python";
99
100 build-system = [
101 cython
102 distlib
103 grpcio-tools
104 jinja2
105 jsonpickle
106 jsonschema
107 mypy-protobuf
108 redis
109 setuptools
110 yapf
111 ];
112
113 dependencies = [
114 crcmod
115 dill
116 fastavro
117 fasteners
118 grpcio
119 hdfs
120 httplib2
121 numpy
122 objsize
123 orjson
124 proto-plus
125 protobuf
126 pyarrow
127 pydot
128 pymongo
129 python-dateutil
130 pytz
131 regex
132 requests
133 typing-extensions
134 zstandard
135 ];
136
137 enableParallelBuilding = true;
138
139 postPatch = ''
140 substituteInPlace pyproject.toml \
141 --replace-fail "distlib==0.3.7" "distlib" \
142 --replace-fail "yapf==0.29.0" "yapf" \
143 --replace-fail "grpcio-tools==1.62.1" "grpcio-tools" \
144 --replace-fail "mypy-protobuf==3.5.0" "mypy-protobuf" \
145 --replace-fail "numpy>=1.14.3,<2.3.0" "numpy"
146
147 substituteInPlace setup.py \
148 --replace-fail " copy_tests_from_docs()" ""
149 '';
150
151 __darwinAllowLocalNetworking = true;
152
153 pythonImportsCheck = [ "apache_beam" ];
154
155 nativeCheckInputs = [
156 docstring-parser
157 freezegun
158 hypothesis
159 mock
160 pandas
161 parameterized
162 psycopg2
163 pyhamcrest
164 pytest-xdist
165 pytestCheckHook
166 pyyaml
167 requests-mock
168 scikit-learn
169 sqlalchemy
170 tenacity
171 testcontainers
172 ];
173
174 # Make sure we're running the tests for the actually installed
175 # package, so that cython's .so files are available.
176 preCheck = ''
177 cd $out/${python.sitePackages}
178 '';
179
180 disabledTestPaths = [
181 # Fails with
182 # _______ ERROR collecting apache_beam/io/external/xlang_jdbcio_it_test.py _______
183 # apache_beam/io/external/xlang_jdbcio_it_test.py:80: in <module>
184 # class CrossLanguageJdbcIOTest(unittest.TestCase):
185 # apache_beam/io/external/xlang_jdbcio_it_test.py:99: in CrossLanguageJdbcIOTest
186 # container_init: Callable[[], Union[PostgresContainer, MySqlContainer]],
187 # E NameError: name 'MySqlContainer' is not defined
188 #
189 "apache_beam/io/external/xlang_jdbcio_it_test.py"
190
191 # These tests depend on the availability of specific server backends.
192 "apache_beam/runners/portability/flink_runner_test.py"
193 "apache_beam/runners/portability/samza_runner_test.py"
194 "apache_beam/runners/portability/spark_runner_test.py"
195
196 # Fails starting from dill 0.3.6 because it tries to pickle pytest globals:
197 # https://github.com/uqfoundation/dill/issues/482#issuecomment-1139017499.
198 "apache_beam/transforms/window_test.py"
199
200 # See https://github.com/apache/beam/issues/25390.
201 "apache_beam/coders/slow_coders_test.py"
202 "apache_beam/dataframe/pandas_doctests_test.py"
203 "apache_beam/typehints/typed_pipeline_test.py"
204 "apache_beam/coders/fast_coders_test.py"
205 "apache_beam/dataframe/schemas_test.py"
206
207 # Fails with TypeError: cannot pickle 'EncodedFile' instances
208 # Upstream issue https://github.com/apache/beam/issues/33889
209 "apache_beam/options/pipeline_options_validator_test.py"
210 "apache_beam/yaml/main_test.py"
211 "apache_beam/yaml/programming_guide_test.py"
212 "apache_beam/yaml/readme_test.py"
213 "apache_beam/yaml/yaml_combine_test.py"
214 "apache_beam/yaml/yaml_enrichment_test.py"
215 "apache_beam/yaml/yaml_io_test.py"
216 "apache_beam/yaml/yaml_join_test.py"
217 "apache_beam/yaml/yaml_mapping_test.py"
218 "apache_beam/yaml/yaml_ml_test.py"
219 "apache_beam/yaml/yaml_provider_unit_test.py"
220
221 # FIXME All of these fail due to a single error - AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
222 # Upstream issue https://github.com/apache/beam/issues/33854
223 "apache_beam/coders/row_coder_test.py"
224 "apache_beam/examples/avro_nyc_trips_test.py"
225 "apache_beam/examples/complete/autocomplete_test.py"
226 "apache_beam/examples/complete/estimate_pi_test.py"
227 "apache_beam/examples/complete/game/game_stats_test.py"
228 "apache_beam/examples/complete/game/hourly_team_score_test.py"
229 "apache_beam/examples/complete/game/leader_board_test.py"
230 "apache_beam/examples/complete/game/user_score_test.py"
231 "apache_beam/examples/complete/tfidf_test.py"
232 "apache_beam/examples/complete/top_wikipedia_sessions_test.py"
233 "apache_beam/examples/cookbook/bigquery_side_input_test.py"
234 "apache_beam/examples/cookbook/bigquery_tornadoes_test.py"
235 "apache_beam/examples/cookbook/coders_test.py"
236 "apache_beam/examples/cookbook/combiners_test.py"
237 "apache_beam/examples/cookbook/custom_ptransform_test.py"
238 "apache_beam/examples/cookbook/filters_test.py"
239 "apache_beam/examples/matrix_power_test.py"
240 "apache_beam/examples/snippets/snippets_test.py"
241 "apache_beam/examples/snippets/transforms/aggregation/approximatequantiles_test.py"
242 "apache_beam/examples/snippets/transforms/aggregation/approximateunique_test.py"
243 "apache_beam/examples/snippets/transforms/aggregation/batchelements_test.py"
244 "apache_beam/examples/snippets/transforms/aggregation/cogroupbykey_test.py"
245 "apache_beam/examples/snippets/transforms/aggregation/combineglobally_test.py"
246 "apache_beam/examples/snippets/transforms/aggregation/combineperkey_test.py"
247 "apache_beam/examples/snippets/transforms/aggregation/combinevalues_test.py"
248 "apache_beam/examples/snippets/transforms/aggregation/count_test.py"
249 "apache_beam/examples/snippets/transforms/aggregation/distinct_test.py"
250 "apache_beam/examples/snippets/transforms/aggregation/groupbykey_test.py"
251 "apache_beam/examples/snippets/transforms/aggregation/groupintobatches_test.py"
252 "apache_beam/examples/snippets/transforms/aggregation/latest_test.py"
253 "apache_beam/examples/snippets/transforms/aggregation/max_test.py"
254 "apache_beam/examples/snippets/transforms/aggregation/mean_test.py"
255 "apache_beam/examples/snippets/transforms/aggregation/min_test.py"
256 "apache_beam/examples/snippets/transforms/aggregation/sample_test.py"
257 "apache_beam/examples/snippets/transforms/aggregation/sum_test.py"
258 "apache_beam/examples/snippets/transforms/aggregation/tolist_test.py"
259 "apache_beam/examples/snippets/transforms/aggregation/top_test.py"
260 "apache_beam/examples/snippets/transforms/elementwise/filter_test.py"
261 "apache_beam/examples/snippets/transforms/elementwise/flatmap_test.py"
262 "apache_beam/examples/snippets/transforms/elementwise/keys_test.py"
263 "apache_beam/examples/snippets/transforms/elementwise/kvswap_test.py"
264 "apache_beam/examples/snippets/transforms/elementwise/map_test.py"
265 "apache_beam/examples/snippets/transforms/elementwise/pardo_test.py"
266 "apache_beam/examples/snippets/transforms/elementwise/partition_test.py"
267 "apache_beam/examples/snippets/transforms/elementwise/regex_test.py"
268 "apache_beam/examples/snippets/transforms/elementwise/tostring_test.py"
269 "apache_beam/examples/snippets/transforms/elementwise/values_test.py"
270 "apache_beam/examples/snippets/transforms/elementwise/withtimestamps_test.py"
271 "apache_beam/examples/snippets/transforms/other/create_test.py"
272 "apache_beam/examples/snippets/transforms/other/flatten_test.py"
273 "apache_beam/examples/snippets/transforms/other/window_test.py"
274 "apache_beam/examples/snippets/util_test.py"
275 "apache_beam/io/avroio_test.py"
276 "apache_beam/io/concat_source_test.py"
277 "apache_beam/io/filebasedsink_test.py"
278 "apache_beam/io/filebasedsource_test.py"
279 "apache_beam/io/fileio_test.py"
280 "apache_beam/io/mongodbio_test.py"
281 "apache_beam/io/parquetio_test.py"
282 "apache_beam/io/sources_test.py"
283 "apache_beam/io/textio_test.py"
284 "apache_beam/io/tfrecordio_test.py"
285 "apache_beam/metrics/metric_test.py"
286 "apache_beam/ml/inference/base_test.py"
287 "apache_beam/ml/inference/sklearn_inference_test.py"
288 "apache_beam/ml/inference/utils_test.py"
289 "apache_beam/ml/rag/chunking/base_test.py"
290 "apache_beam/ml/rag/ingestion/base_test.py"
291 "apache_beam/pipeline_test.py"
292 "apache_beam/runners/direct/direct_runner_test.py"
293 "apache_beam/runners/direct/sdf_direct_runner_test.py"
294 "apache_beam/runners/interactive/interactive_beam_test.py"
295 "apache_beam/runners/interactive/interactive_runner_test.py"
296 "apache_beam/runners/interactive/non_interactive_runner_test.py"
297 "apache_beam/runners/interactive/recording_manager_test.py"
298 "apache_beam/runners/portability/fn_api_runner/translations_test.py"
299 "apache_beam/runners/portability/fn_api_runner/trigger_manager_test.py"
300 "apache_beam/runners/portability/stager_test.py"
301 "apache_beam/testing/synthetic_pipeline_test.py"
302 "apache_beam/testing/test_stream_test.py"
303 "apache_beam/testing/util_test.py"
304 "apache_beam/transforms/combiners_test.py"
305 "apache_beam/transforms/core_test.py"
306 "apache_beam/transforms/create_test.py"
307 "apache_beam/transforms/deduplicate_test.py"
308 "apache_beam/transforms/periodicsequence_test.py"
309 "apache_beam/transforms/ptransform_test.py"
310 "apache_beam/transforms/sideinputs_test.py"
311 "apache_beam/transforms/stats_test.py"
312 "apache_beam/transforms/transforms_keyword_only_args_test.py"
313 "apache_beam/transforms/trigger_test.py"
314 "apache_beam/transforms/userstate_test.py"
315 "apache_beam/transforms/util_test.py"
316 "apache_beam/transforms/write_ptransform_test.py"
317
318 # FIXME AttributeError: 'Namespace' object has no attribute 'test_pipeline_options'
319 # Upstream issue https://github.com/apache/beam/issues/33853
320 "apache_beam/runners/portability/prism_runner_test.py"
321
322 # FIXME ValueError: Unable to run pipeline with requirement: unsupported_requirement
323 # Upstream issue https://github.com/apache/beam/issues/33853
324 "apache_beam/yaml/yaml_transform_scope_test.py"
325 "apache_beam/yaml/yaml_transform_test.py"
326 "apache_beam/yaml/yaml_transform_unit_test.py"
327 "apache_beam/yaml/yaml_udf_test.py"
328 "apache_beam/dataframe/frames_test.py"
329
330 # FIXME Those tests do not terminate due to a grpc error (threading issue)
331 # grpc_status:14, grpc_message:"Cancelling all calls"}"
332 # Upstream issue https://github.com/apache/beam/issues/33851
333 "apache_beam/runners/portability/portable_runner_test.py"
334 ]
335 ++ lib.optionals (pythonAtLeast "3.13") [
336 # > instruction = ofs_table[pc]
337 # E KeyError: 18
338 "apache_beam/typehints/trivial_inference_test.py"
339 ];
340
341 disabledTests = [
342 # IndexError: list index out of range
343 "test_only_sample_exceptions"
344
345 # AssertionError: False is not true
346 "test_samples_all_with_both_experiments"
347 ]
348 ++ lib.optionals stdenv.hostPlatform.isDarwin [
349 # PermissionError: [Errno 13] Permission denied: '/tmp/...'
350 "test_cache_manager_uses_local_ib_cache_root"
351 "test_describe_all_recordings"
352 "test_find_out_correct_user_pipeline"
353 "test_get_cache_manager_creates_cache_manager_if_absent"
354 "test_streaming_cache_uses_local_ib_cache_root"
355 "test_track_user_pipeline_cleanup_non_inspectable_pipeline"
356 ]
357 ++ lib.optionals (pythonAtLeast "3.12") [
358 # TypeError: Could not determine schema for type hint Any.
359 "test_batching_beam_row_input"
360 "test_auto_convert"
361 "test_unbatching_series"
362 "test_batching_beam_row_to_dataframe"
363
364 # AssertionError: Any != <class 'int'>
365 "test_pycallable_map"
366 "testAlwaysReturnsEarly"
367
368 # TypeError: Expected Iterator in return type annotation
369 "test_get_output_batch_type"
370 ];
371
372 meta = {
373 description = "Unified model for defining both batch and streaming data-parallel processing pipelines";
374 homepage = "https://beam.apache.org/";
375 changelog = "https://github.com/apache/beam/blob/release-${version}/CHANGES.md"; # release branches are named release-<version>; src.tag carries a "v" prefix and must not be used here
376 license = lib.licenses.asl20;
377 maintainers = with lib.maintainers; [ ndl ];
378 };
379}