# pyiceberg — Python library for Apache Iceberg tables.
# Upstream repo is apache/iceberg-python; the importable module is
# `pyiceberg` (see pythonImportsCheck below).
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  poetry-core,
  setuptools,

  # dependencies
  cachetools,
  click,
  fsspec,
  mmh3,
  pydantic,
  pyparsing,
  ray,
  requests,
  rich,
  sortedcontainers,
  strictyaml,
  tenacity,
  zstandard,

  # optional-dependencies
  adlfs,
  # getdaft (intentionally commented out — presumably not packaged; the
  # `daft` extra below is left empty for the same reason)
  # getdaft,
  duckdb,
  pyarrow,
  boto3,
  gcsfs,
  mypy-boto3-glue,
  thrift,
  pandas,
  s3fs,
  python-snappy,
  psycopg2-binary,
  sqlalchemy,

  # tests
  azure-core,
  azure-storage-blob,
  fastavro,
  moto,
  pyspark,
  pytestCheckHook,
  pytest-lazy-fixture,
  pytest-mock,
  pytest-timeout,
  requests-mock,
  pythonOlder,
}:

buildPythonPackage rec {
  pname = "iceberg-python";
  version = "0.8.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "apache";
    repo = "iceberg-python";
    # Upstream tags releases as "pyiceberg-<version>", not "v<version>".
    tag = "pyiceberg-${version}";
    hash = "sha256-L3YlOtzJv9R4TLeJGzfMQ+0nYtQEsqmgNZpW9B6vVAI=";
  };

  # Only applied on Python < 3.12.
  patches = lib.optionals (pythonOlder "3.12") [
    # Build script fails to build the cython extension on python 3.11 (no issues with python 3.12):
    # distutils.errors.DistutilsSetupError: each element of 'ext_modules' option must be an Extension instance or 2-tuple
    # This error vanishes if Cython and setuptools imports are swapped
    # https://stackoverflow.com/a/53356077/11196710
    ./reorder-imports-in-build-script.patch
  ];

  build-system = [
    cython
    poetry-core
    setuptools
  ];

  # Prevents the cython build to fail silently
  env.CIBUILDWHEEL = "1";

  dependencies = [
    cachetools
    click
    fsspec
    mmh3
    pydantic
    pyparsing
    ray
    requests
    rich
    sortedcontainers
    strictyaml
    tenacity
    zstandard
  ];

  # Mirrors the upstream project's extras; each attribute name matches a
  # pyiceberg extra (e.g. `pyiceberg[glue]`).
  optional-dependencies = {
    adlfs = [
      adlfs
    ];
    daft = [
      # getdaft
    ];
    duckdb = [
      duckdb
      pyarrow
    ];
    dynamodb = [
      boto3
    ];
    gcsfs = [
      gcsfs
    ];
    glue = [
      boto3
      mypy-boto3-glue
    ];
    hive = [
      thrift
    ];
    pandas = [
      pandas
      pyarrow
    ];
    pyarrow = [
      pyarrow
    ];
    ray = [
      pandas
      pyarrow
      ray
    ];
    s3fs = [
      s3fs
    ];
    snappy = [
      python-snappy
    ];
    sql-postgres = [
      psycopg2-binary
      sqlalchemy
    ];
    sql-sqlite = [
      sqlalchemy
    ];
    zstandard = [
      zstandard
    ];
  };

  pythonImportsCheck = [
    "pyiceberg"
    # Compiled avro decoder (cython) — also verifies that the cython
    # extension was actually built (see env.CIBUILDWHEEL above).
    "pyiceberg.avro.decoder_fast"
  ];

  nativeCheckInputs = [
    azure-core
    azure-storage-blob
    boto3
    fastavro
    moto
    mypy-boto3-glue
    pandas
    pyarrow
    pyspark
    pytest-lazy-fixture
    pytest-mock
    pytest-timeout
    pytestCheckHook
    requests-mock
    s3fs
    sqlalchemy
    thrift
    # moto's `server` extra is appended as well — presumably the test
    # suite starts a standalone moto server to mock AWS endpoints.
  ] ++ moto.optional-dependencies.server;

  disabledTestPaths = [
    # Several errors:
    # - FileNotFoundError: [Errno 2] No such file or directory: '/nix/store/...-python3.12-pyspark-3.5.3/lib/python3.12/site-packages/pyspark/./bin/spark-submit'
    # - requests.exceptions.ConnectionError: HTTPConnectionPool(host='localhost', port=8181): Max retries exceeded with url: /v1/config
    # - thrift.transport.TTransport.TTransportException: Could not connect to any of [('127.0.0.1', 9083)]
    "tests/integration"
  ];

  disabledTests = [
    # botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL
    "test_checking_if_a_file_exists"
    "test_closing_a_file"
    "test_fsspec_file_tell"
    "test_fsspec_getting_length_of_file"
    "test_fsspec_pickle_round_trip_s3"
    "test_fsspec_raise_on_opening_file_not_found"
    "test_fsspec_read_specified_bytes_for_file"
    "test_fsspec_write_and_read_file"
    "test_writing_avro_file"

    # Require gcsfs, which is available as an optional dependency but is
    # not included in nativeCheckInputs
    "test_fsspec_converting_an_outputfile_to_an_inputfile_gcs"
    "test_fsspec_new_input_file_gcs"
    "test_fsspec_new_output_file_gcs"
    "test_fsspec_pickle_roundtrip_gcs"

    # Timeout (network access)
    "test_fsspec_converting_an_outputfile_to_an_inputfile_adls"
    "test_fsspec_new_abfss_output_file_adls"
    "test_fsspec_new_input_file_adls"
    # NOTE: "aldfs" is the upstream test's own spelling — do not "fix" it
    "test_fsspec_pickle_round_trip_aldfs"

    # TypeError: pyarrow.lib.large_list() takes no keyword argument
    # From tests/io/test_pyarrow_stats.py:
    "test_bounds"
    "test_column_metrics_mode"
    "test_column_sizes"
    "test_metrics_mode_counts"
    "test_metrics_mode_full"
    "test_metrics_mode_non_default_trunc"
    "test_metrics_mode_none"
    "test_null_and_nan_counts"
    "test_offsets"
    "test_read_missing_statistics"
    "test_record_count"
    "test_value_counts"
    "test_write_and_read_stats_schema"
    # From tests/io/test_pyarrow.py:
    "test_list_type_to_pyarrow"
    "test_projection_add_column"
    "test_projection_list_of_structs"
    "test_read_list"
    "test_schema_compatible_missing_nullable_field_nested"
    "test_schema_compatible_nested"
    "test_schema_mismatch_missing_required_field_nested"
    "test_schema_to_pyarrow_schema_exclude_field_ids"
    "test_schema_to_pyarrow_schema_include_field_ids"
    # From tests/io/test_pyarrow_visitor.py
    "test_round_schema_conversion_nested"

    # Hangs forever (from tests/io/test_pyarrow.py)
    "test_getting_length_of_file_gcs"
  ];

  meta = {
    description = "Python library for programmatic access to Apache Iceberg";
    homepage = "https://github.com/apache/iceberg-python";
    changelog = "https://github.com/apache/iceberg-python/releases/tag/pyiceberg-${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
  };
}