1{
2 lib,
3 stdenv,
4 buildPythonPackage,
5 python,
6 pythonAtLeast,
7 pythonOlder,
8 arrow-cpp,
9 cffi,
10 cloudpickle,
11 cmake,
12 cython_0,
13 fsspec,
14 hypothesis,
15 numpy,
16 pandas,
17 pytestCheckHook,
18 pytest-lazy-fixture,
19 pkg-config,
20 setuptools,
21 setuptools-scm,
22 oldest-supported-numpy,
23}:
24
let
  # Translate a boolean feature switch into the integer form used by the
  # PYARROW_* build settings: 1 when the switch is on, 0 when it is off.
  zero_or_one = enabled: if !enabled then 0 else 1;
in
28
# Python bindings for Apache Arrow. The package is built from the `python/`
# subdirectory of the arrow-cpp source and links against the arrow-cpp package.
buildPythonPackage rec {
  pname = "pyarrow";
  # Version and source come straight from arrow-cpp, so the two cannot drift.
  inherit (arrow-cpp) version src;
  pyproject = true;

  disabled = pythonOlder "3.7";

  # Only the Python part of the Arrow source tree is built here.
  sourceRoot = "apache-arrow-${version}/python";

  nativeBuildInputs = [
    cmake
    cython_0
    pkg-config
    setuptools
    setuptools-scm
    oldest-supported-numpy
  ];

  buildInputs = [ arrow-cpp ];

  propagatedBuildInputs = [
    cffi
    numpy
  ];

  checkInputs = [
    cloudpickle
    fsspec
  ];

  nativeCheckInputs = [
    hypothesis
    pandas
    pytestCheckHook
    pytest-lazy-fixture
  ];

  # The attributes below are exported into the build environment.
  PYARROW_BUILD_TYPE = "release";

  # Feature toggles (1 = on, 0 = off). Flight, S3 and GCS follow whatever
  # arrow-cpp itself was built with; the others are fixed.
  PYARROW_WITH_DATASET = zero_or_one true;
  PYARROW_WITH_FLIGHT = zero_or_one arrow-cpp.enableFlight;
  PYARROW_WITH_HDFS = zero_or_one true;
  PYARROW_WITH_PARQUET = zero_or_one true;
  PYARROW_WITH_PARQUET_ENCRYPTION = zero_or_one true;
  PYARROW_WITH_S3 = zero_or_one arrow-cpp.enableS3;
  PYARROW_WITH_GCS = zero_or_one arrow-cpp.enableGcs;
  PYARROW_BUNDLE_ARROW_CPP_HEADERS = zero_or_one false;

  # Point the install RPATH at the arrow-cpp libraries.
  PYARROW_CMAKE_OPTIONS = [ "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib" ];

  ARROW_HOME = arrow-cpp;
  PARQUET_HOME = arrow-cpp;

  # The test data location is only set when the test suite is enabled.
  ARROW_TEST_DATA = lib.optionalString doCheck arrow-cpp.ARROW_TEST_DATA;

  doCheck = true;

  # cmake is needed as a tool only; its setup hook must not run a configure
  # phase for this package.
  dontUseCmakeConfigure = true;

  __darwinAllowLocalNetworking = true;

  preBuild = ''
    export PYARROW_PARALLEL=$NIX_BUILD_CORES
  '';

  postInstall = ''
    # copy the pyarrow C++ header files to the appropriate location
    pyarrow_include="$out/${python.sitePackages}/pyarrow/include"
    mkdir -p "$pyarrow_include/arrow/python"
    find "$PWD/pyarrow/src/arrow" -type f -name '*.h' -exec cp {} "$pyarrow_include/arrow/python" \;
  '';

  pytestFlagsArray =
    [
      # A couple of tests are missing fixture imports, luckily pytest offers a
      # clean solution.
      # NOTE(review): pytest documents `--fixtures` as "show available
      # fixtures", not as a way to load fixtures from a file. Please confirm
      # that the test suite is actually executed with this flag present.
      "--fixtures pyarrow/tests/conftest.py"
      # Deselect a single test because pyarrow prints a 2-line error message where
      # only a single line is expected. The additional line of output comes from
      # the glog library which is an optional dependency of arrow-cpp that is
      # enabled in nixpkgs.
      # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11393
      "--deselect=pyarrow/tests/test_memory.py::test_env_var"
      # these tests require access to s3 via the internet
      "--deselect=pyarrow/tests/test_fs.py::test_resolve_s3_region"
      "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws"
      "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection"
      "--deselect=pyarrow/tests/test_fs.py::test_s3_options"
      # Flaky test
      "--deselect=pyarrow/tests/test_flight.py::test_roundtrip_errors"
      "--deselect=pyarrow/tests/test_pandas.py::test_threaded_pandas_import"
      # Flaky test, works locally but not on Hydra
      "--deselect=pyarrow/tests/test_csv.py::TestThreadedCSVTableRead::test_cancellation"
      # expects arrow-cpp headers to be bundled
      "--deselect=pyarrow/tests/test_cpp_internals.py::test_pyarrow_include"
    ]
    ++ lib.optionals stdenv.hostPlatform.isDarwin [
      # Requires loopback networking
      "--deselect=pyarrow/tests/test_ipc.py::test_socket_"
      "--deselect=pyarrow/tests/test_flight.py::test_never_sends_data"
      "--deselect=pyarrow/tests/test_flight.py::test_large_descriptor"
      "--deselect=pyarrow/tests/test_flight.py::test_large_metadata_client"
      "--deselect=pyarrow/tests/test_flight.py::test_none_action_side_effect"
      # fails to compile
      "--deselect=pyarrow/tests/test_cython.py::test_cython_api"
    ]
    ++ lib.optionals (pythonAtLeast "3.11") [
      # Repr output is printing number instead of enum name so these tests fail
      "--deselect=pyarrow/tests/test_fs.py::test_get_file_info"
    ]
    ++ lib.optionals stdenv.hostPlatform.isLinux [
      # this test requires local networking
      "--deselect=pyarrow/tests/test_fs.py::test_filesystem_from_uri_gcs"
    ];

  disabledTests = [ "GcsFileSystem" ];

  dontUseSetuptoolsCheck = true;

  preCheck =
    ''
      # Delete everything under pyarrow/ except the tests and the top-level
      # conftest (presumably so the tests import the installed package rather
      # than the source tree), then move that conftest next to the tests and
      # rewrite the relative import that referenced it.
      shopt -s extglob
      rm -r pyarrow/!(conftest.py|tests)
      mv pyarrow/conftest.py pyarrow/tests/parent_conftest.py
      # --replace-fail aborts the build if the import line ever stops matching,
      # instead of silently leaving a broken import behind.
      substituteInPlace pyarrow/tests/conftest.py --replace-fail ..conftest .parent_conftest
    ''
    + lib.optionalString stdenv.hostPlatform.isDarwin ''
      # OSError: [Errno 24] Too many open files
      ulimit -n 1024
    '';

  pythonImportsCheck =
    [ "pyarrow" ]
    ++ map (module: "pyarrow.${module}") (
      [
        "compute"
        "csv"
        "dataset"
        "feather"
        "fs"
        "json"
        "parquet"
      ]
      # The flight module is only built when arrow-cpp has Flight enabled
      # (see PYARROW_WITH_FLIGHT), so only check it in that case.
      ++ lib.optional arrow-cpp.enableFlight "flight"
    );

  meta = {
    description = "Cross-language development platform for in-memory data";
    homepage = "https://arrow.apache.org/";
    license = lib.licenses.asl20;
    platforms = lib.platforms.unix;
    maintainers = with lib.maintainers; [
      veprbl
      cpcloud
    ];
  };
}