nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
{
  lib,
  stdenv,
  buildPythonPackage,
  python,
  pythonAtLeast,
  arrow-cpp,
  cffi,
  cloudpickle,
  cmake,
  cython,
  fsspec,
  hypothesis,
  numpy,
  pandas,
  pytestCheckHook,
  pytest-lazy-fixture,
  pkg-config,
  setuptools,
  setuptools-scm,
  oldest-supported-numpy,
}:

let
  # pyarrow's setup.py reads the PYARROW_WITH_* feature toggles as 0/1 values,
  # so booleans must be converted before being exported as env vars.
  zero_or_one = cond: if cond then 1 else 0;
in

buildPythonPackage rec {
  pname = "pyarrow";
  # The Python bindings are built from the same source tree as the C++
  # library and must match its version exactly.
  inherit (arrow-cpp) version src;
  pyproject = true;

  # Only the python/ subdirectory of the Arrow monorepo is built here.
  sourceRoot = "${src.name}/python";

  nativeBuildInputs = [
    cmake
    cython
    pkg-config
    setuptools
    setuptools-scm
    oldest-supported-numpy
  ];

  buildInputs = [ arrow-cpp ];

  propagatedBuildInputs = [
    cffi
    numpy
  ];

  # All test-time dependencies live in nativeCheckInputs; the previous split
  # between checkInputs (a deprecated alias) and nativeCheckInputs was
  # inconsistent.
  nativeCheckInputs = [
    cloudpickle
    fsspec
    hypothesis
    pandas
    pytestCheckHook
    pytest-lazy-fixture
  ];

  PYARROW_BUILD_TYPE = "release";

  # Feature flags, kept in sync with what the arrow-cpp build enabled.
  PYARROW_WITH_DATASET = zero_or_one true;
  PYARROW_WITH_FLIGHT = zero_or_one arrow-cpp.enableFlight;
  PYARROW_WITH_HDFS = zero_or_one true;
  PYARROW_WITH_PARQUET = zero_or_one true;
  PYARROW_WITH_PARQUET_ENCRYPTION = zero_or_one true;
  PYARROW_WITH_S3 = zero_or_one arrow-cpp.enableS3;
  PYARROW_WITH_GCS = zero_or_one arrow-cpp.enableGcs;
  # Headers come from the separate arrow-cpp derivation, not a vendored copy.
  PYARROW_BUNDLE_ARROW_CPP_HEADERS = zero_or_one false;

  # Make the installed extension modules find libarrow at runtime.
  PYARROW_CMAKE_OPTIONS = [ "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib" ];

  ARROW_HOME = arrow-cpp;
  PARQUET_HOME = arrow-cpp;

  ARROW_TEST_DATA = lib.optionalString doCheck arrow-cpp.ARROW_TEST_DATA;
  doCheck = true;

  # setup.py drives cmake itself; the generic cmake configure hook must not run.
  dontUseCmakeConfigure = true;

  __darwinAllowLocalNetworking = true;

  preBuild = ''
    export PYARROW_PARALLEL=$NIX_BUILD_CORES
  '';

  postInstall = ''
    # copy the pyarrow C++ header files to the appropriate location
    pyarrow_include="$out/${python.sitePackages}/pyarrow/include"
    mkdir -p "$pyarrow_include/arrow/python"
    find "$PWD/pyarrow/src/arrow" -type f -name '*.h' -exec cp {} "$pyarrow_include/arrow/python" \;
  '';

  disabledTestPaths = [
    # These tests require access to s3 via the internet.
    "pyarrow/tests/test_fs.py::test_resolve_s3_region"
    "pyarrow/tests/test_fs.py::test_s3_finalize"
    "pyarrow/tests/test_fs.py::test_s3_finalize_region_resolver"
    "pyarrow/tests/test_fs.py::test_s3_real_aws"
    "pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection"
    "pyarrow/tests/test_fs.py::test_s3_options"
    # Flaky test
    "pyarrow/tests/test_flight.py::test_roundtrip_errors"
    "pyarrow/tests/test_pandas.py::test_threaded_pandas_import"
    # Flaky test, works locally but not on Hydra.
    "pyarrow/tests/test_csv.py::TestThreadedCSVTableRead::test_cancellation"
    # expects arrow-cpp headers to be bundled.
    "pyarrow/tests/test_cpp_internals.py::test_pyarrow_include"
    # Searches for TZDATA in /usr.
    "pyarrow/tests/test_orc.py::test_example_using_json"
    # AssertionError: assert 'Europe/Monaco' == 'Europe/Paris'
    "pyarrow/tests/test_types.py::test_dateutil_tzinfo_to_string"
    # These fail with xxx_fixture not found.
    # xxx = unary_func, unary_agg_func, varargs_agg_func
    "pyarrow/tests/test_substrait.py::test_udf_via_substrait"
    "pyarrow/tests/test_substrait.py::test_scalar_aggregate_udf_basic"
    "pyarrow/tests/test_substrait.py::test_hash_aggregate_udf_basic"
    "pyarrow/tests/test_udf.py::test_hash_agg_basic"
    "pyarrow/tests/test_udf.py::test_hash_agg_empty"
    "pyarrow/tests/test_udf.py::test_input_lifetime"
    "pyarrow/tests/test_udf.py::test_scalar_agg_basic"
    "pyarrow/tests/test_udf.py::test_scalar_agg_empty"
    "pyarrow/tests/test_udf.py::test_scalar_agg_varargs"
    "pyarrow/tests/test_udf.py::test_scalar_input"
    "pyarrow/tests/test_udf.py::test_scalar_udf_context"
    "pyarrow/tests/test_udf.py::test_udf_array_unary"
  ]
  ++ lib.optionals stdenv.hostPlatform.isDarwin [
    # Requires loopback networking.
    "pyarrow/tests/test_ipc.py::test_socket_"
    "pyarrow/tests/test_flight.py::test_never_sends_data"
    "pyarrow/tests/test_flight.py::test_large_descriptor"
    "pyarrow/tests/test_flight.py::test_large_metadata_client"
    "pyarrow/tests/test_flight.py::test_none_action_side_effect"
    # Fails to compile.
    "pyarrow/tests/test_cython.py::test_cython_api"
  ]
  ++ lib.optionals (pythonAtLeast "3.11") [
    # Repr output is printing number instead of enum name so these tests fail
    "pyarrow/tests/test_fs.py::test_get_file_info"
  ]
  ++ lib.optionals stdenv.hostPlatform.isLinux [
    # This test requires local networking.
    "pyarrow/tests/test_fs.py::test_filesystem_from_uri_gcs"
  ];

  disabledTests = [ "GcsFileSystem" ];

  # Run the test suite against the installed package: strip the build tree
  # down to the tests so the installed pyarrow (not the source tree) is
  # imported, and re-wire conftest.py to match the new layout.
  preCheck = ''
    export PARQUET_TEST_DATA="${arrow-cpp.PARQUET_TEST_DATA}"
    shopt -s extglob
    rm -r pyarrow/!(conftest.py|tests)
    mv pyarrow/conftest.py pyarrow/tests/parent_conftest.py
    substituteInPlace pyarrow/tests/conftest.py --replace-fail ..conftest .parent_conftest
  ''
  + lib.optionalString stdenv.hostPlatform.isDarwin ''
    # OSError: [Errno 24] Too many open files
    ulimit -n 1024
  '';

  pythonImportsCheck = [
    "pyarrow"
  ]
  ++ map (module: "pyarrow.${module}") [
    "compute"
    "csv"
    "dataset"
    "feather"
    "flight"
    "fs"
    "json"
    "orc"
    "parquet"
  ];

  meta = {
    description = "Cross-language development platform for in-memory data";
    homepage = "https://arrow.apache.org/";
    license = lib.licenses.asl20;
    platforms = lib.platforms.unix;
    maintainers = with lib.maintainers; [
      veprbl
      cpcloud
    ];
  };
}