# Package expression for the Apache Beam Python SDK ("apache-beam").
#
# The function takes its Python dependencies and nixpkgs helpers as arguments
# and returns the result of `buildPythonPackage`. The SDK lives in the
# `sdks/python` subdirectory of the upstream repository, and the test suite is
# run against the installed package (see `preCheck`).
{
  buildPythonPackage,
  cloudpickle,
  crcmod,
  cython,
  dill,
  fastavro,
  fasteners,
  fetchFromGitHub,
  fetchpatch,
  freezegun,
  grpcio,
  grpcio-tools,
  hdfs,
  httplib2,
  hypothesis,
  lib,
  mock,
  mypy-protobuf,
  numpy,
  objsize,
  orjson,
  pandas,
  parameterized,
  proto-plus,
  protobuf,
  psycopg2,
  pyarrow,
  pydot,
  pyhamcrest,
  pymongo,
  pytest-xdist,
  pytestCheckHook,
  python,
  python-dateutil,
  pythonRelaxDepsHook,
  pytz,
  pyyaml,
  regex,
  requests,
  requests-mock,
  scikit-learn,
  setuptools,
  sqlalchemy,
  tenacity,
  testcontainers,
  typing-extensions,
  zstandard,
}:

buildPythonPackage rec {
  pname = "apache-beam";
  version = "2.54.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "apache";
    repo = "beam";
    rev = "refs/tags/v${version}";
    hash = "sha256-DcqYBPAS+yUqTJLUem8+2OqRUzb6DoBOeRkMjmvuvws=";
  };

  patches = [
    (fetchpatch {
      # https://github.com/apache/beam/pull/24143
      name = "fix-for-dill-0.3.6.patch";
      url = "https://github.com/apache/beam/commit/7e014435b816015d21cc07f3f6c80809f3d8023d.patch";
      hash = "sha256-iUmnzrItTFM98w3mpadzrmtI3t0fucpSujAg/6qxCGk=";
      # The patch paths are relative to the repository root, while the build
      # runs inside `sdks/python` (see `sourceRoot`), hence two more
      # components are stripped than the default.
      stripLen = 2;
    })
  ];

  # NOTE(review): the rationale comments below were written against
  # apache-beam v2.45.0, while `version` is now 2.54.0 - re-check which of
  # these relaxations are still required.
  pythonRelaxDeps = [
    # See https://github.com/NixOS/nixpkgs/issues/156957
    "dill"
    "numpy"
    "pymongo"

    # See https://github.com/NixOS/nixpkgs/issues/193613
    "protobuf"

    # As of apache-beam v2.45.0, the requirement is httplib2>=0.8,<0.21.0, but
    # the current (2023-02-08) nixpkgs's httplib2 version is 0.21.0. This can be
    # removed once beam is upgraded since the current requirement on master is
    # for httplib2>=0.8,<0.22.0.
    "httplib2"

    # As of apache-beam v2.45.0, the requirement is pyarrow<10.0.0,>=0.15.1, but
    # the current (2023-02-22) nixpkgs's pyarrow version is 11.0.0.
    "pyarrow"
  ];

  # Only the Python SDK subdirectory of the upstream repository is built.
  sourceRoot = "${src.name}/sdks/python";

  nativeBuildInputs = [
    cython
    grpcio-tools
    mypy-protobuf
    pythonRelaxDepsHook
    setuptools
  ];

  propagatedBuildInputs = [
    cloudpickle
    crcmod
    dill
    fastavro
    fasteners
    grpcio
    hdfs
    httplib2
    numpy
    objsize
    orjson
    proto-plus
    protobuf
    pyarrow
    pydot
    pymongo
    python-dateutil
    pytz
    regex
    requests
    typing-extensions
    zstandard
  ];

  enableParallelBuilding = true;

  pythonImportsCheck = [ "apache_beam" ];

  # Test-time tooling (including the pytest hook) belongs in
  # `nativeCheckInputs`, following the nixpkgs convention for Python packages.
  nativeCheckInputs = [
    freezegun
    hypothesis
    mock
    pandas
    parameterized
    psycopg2
    pyhamcrest
    pytestCheckHook
    pytest-xdist
    pyyaml
    requests-mock
    scikit-learn
    sqlalchemy
    tenacity
    testcontainers
  ];

  # Make sure we're running the tests for the actually installed
  # package, so that cython's .so files are available.
  preCheck = "cd $out/${python.sitePackages}";

  disabledTestPaths = [
    # Fails with
    #     _______ ERROR collecting apache_beam/io/external/xlang_jdbcio_it_test.py _______
    #     apache_beam/io/external/xlang_jdbcio_it_test.py:80: in <module>
    #         class CrossLanguageJdbcIOTest(unittest.TestCase):
    #     apache_beam/io/external/xlang_jdbcio_it_test.py:99: in CrossLanguageJdbcIOTest
    #         container_init: Callable[[], Union[PostgresContainer, MySqlContainer]],
    #     E   NameError: name 'MySqlContainer' is not defined
    #
    "apache_beam/io/external/xlang_jdbcio_it_test.py"

    # These tests depend on the availability of specific server backends.
    "apache_beam/runners/portability/flink_runner_test.py"
    "apache_beam/runners/portability/samza_runner_test.py"
    "apache_beam/runners/portability/spark_runner_test.py"

    # Fails starting from dill 0.3.6 because it tries to pickle pytest globals:
    # https://github.com/uqfoundation/dill/issues/482#issuecomment-1139017499.
    "apache_beam/transforms/window_test.py"

    # See https://github.com/apache/beam/issues/25390.
    "apache_beam/coders/slow_coders_test.py"
    "apache_beam/dataframe/pandas_doctests_test.py"
    "apache_beam/typehints/typed_pipeline_test.py"
    "apache_beam/coders/fast_coders_test.py"
    "apache_beam/dataframe/schemas_test.py"
  ];

  disabledTests = [
    # The reasons for the failures of these tests are unclear.
    # They reproduce in Docker with Ubuntu 22.04
    # (= they're not `nixpkgs`-specific) but given that upstream uses
    # quite elaborate testing infra with containers and multiple
    # different runners - I don't expect them to help debugging these
    # when running via our (= custom from their PoV) testing infra.
    "test_with_main_session"
    # AssertionErrors
    "test_unified_repr"
    "testDictComprehension"
    "testDictComprehensionSimple"
    "testGenerator"
    "testGeneratorComprehension"
    "testListComprehension"
    "testNoneReturn"
    "testSet"
    "testTupleListComprehension"
    "test_newtype"
    "test_pardo_type_inference"
    "test_get_output_batch_type"
    "test_pformat_namedtuple_with_unnamed_fields"
    "test_row_coder_fail_early_bad_schema"
    # See https://github.com/apache/beam/issues/26004.
    "test_batch_encode_decode"
  ];

  meta = {
    description = "Unified model for defining both batch and streaming data-parallel processing pipelines";
    homepage = "https://beam.apache.org/";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ ndl ];
    # Marked broken whenever the pandas in scope is version 2 or newer, see
    # https://github.com/apache/beam/issues/27221
    broken = lib.versionAtLeast pandas.version "2";
  };
}