1{ lib
2, stdenv
3, buildPythonPackage
4, python
5, pythonAtLeast
6, pythonOlder
7, arrow-cpp
8, cffi
9, cloudpickle
10, cmake
11, cython
12, fsspec
13, hypothesis
14, numpy
15, pandas
16, pytestCheckHook
17, pytest-lazy-fixture
18, pkg-config
19, scipy
20, fetchpatch
21, setuptools-scm
22}:
23
let
  # Encode a boolean as an integer: 1 for true, 0 for false.
  # Used below to fill in the PYARROW_WITH_* / PYARROW_BUNDLE_* build flags.
  zero_or_one = enabled: if !enabled then 0 else 1;
in
27
buildPythonPackage rec {
  pname = "pyarrow";
  # Reuse the version and source of arrow-cpp rather than pinning them again.
  inherit (arrow-cpp) version src;

  disabled = pythonOlder "3.7";

  # Build from the python/ subdirectory of the unpacked Arrow source tree.
  sourceRoot = "apache-arrow-${version}/python";

  nativeBuildInputs = [
    cmake
    cython
    pkg-config
    setuptools-scm
  ];

  buildInputs = [ arrow-cpp ];

  propagatedBuildInputs = [
    cffi
    cloudpickle
    fsspec
    numpy
    scipy
  ];

  nativeCheckInputs = [
    hypothesis
    pandas
    pytestCheckHook
    pytest-lazy-fixture
  ];

  PYARROW_BUILD_TYPE = "release";

  # Optional components, exported to the build as 1/0 flags. Flight, S3 and
  # GCS follow whatever arrow-cpp itself was built with.
  PYARROW_WITH_DATASET = zero_or_one true;
  PYARROW_WITH_FLIGHT = zero_or_one arrow-cpp.enableFlight;
  PYARROW_WITH_HDFS = zero_or_one true;
  PYARROW_WITH_PARQUET = zero_or_one true;
  PYARROW_WITH_PARQUET_ENCRYPTION = zero_or_one true;
  # Plasma is deprecated since arrow 10.0.0
  PYARROW_WITH_PLASMA = zero_or_one false;
  PYARROW_WITH_S3 = zero_or_one arrow-cpp.enableS3;
  PYARROW_WITH_GCS = zero_or_one arrow-cpp.enableGcs;
  PYARROW_BUNDLE_ARROW_CPP_HEADERS = zero_or_one false;

  # Point the install RPATH at the lib directory of arrow-cpp.
  PYARROW_CMAKE_OPTIONS = [
    "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib"
  ];

  ARROW_HOME = arrow-cpp;
  PARQUET_HOME = arrow-cpp;

  # Only set the test data location when the check phase is enabled.
  ARROW_TEST_DATA = lib.optionalString doCheck arrow-cpp.ARROW_TEST_DATA;

  doCheck = true;

  # cmake is listed in nativeBuildInputs, but its configure hook is turned off.
  # NOTE(review): presumably because the Python build drives CMake itself
  # (see PYARROW_CMAKE_OPTIONS) - confirm.
  dontUseCmakeConfigure = true;

  __darwinAllowLocalNetworking = true;

  # Let the build use as many parallel jobs as Nix grants this derivation.
  preBuild = ''
    export PYARROW_PARALLEL=$NIX_BUILD_CORES
  '';

  postInstall = ''
    # copy the pyarrow C++ header files to the appropriate location
    pyarrow_include="$out/${python.sitePackages}/pyarrow/include"
    mkdir -p "$pyarrow_include/arrow/python"
    find "$PWD/pyarrow/src/arrow" -type f -name '*.h' -exec cp {} "$pyarrow_include/arrow/python" \;
  '';

  pytestFlagsArray = [
    # A couple of tests are missing fixture imports, luckily pytest offers a
    # clean solution.
    # NOTE(review): pytest's own help describes `--fixtures` as listing the
    # available fixtures; confirm the test suite still executes with this flag.
    "--fixtures pyarrow/tests/conftest.py"
    # Deselect a single test because pyarrow prints a 2-line error message where
    # only a single line is expected. The additional line of output comes from
    # the glog library which is an optional dependency of arrow-cpp that is
    # enabled in nixpkgs.
    # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11393
    "--deselect=pyarrow/tests/test_memory.py::test_env_var"
    # these tests require access to s3 via the internet
    "--deselect=pyarrow/tests/test_fs.py::test_resolve_s3_region"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_options"
    # Flaky test
    "--deselect=pyarrow/tests/test_flight.py::test_roundtrip_errors"
    "--deselect=pyarrow/tests/test_pandas.py::test_threaded_pandas_import"
    # Flaky test, works locally but not on Hydra
    "--deselect=pyarrow/tests/test_csv.py::TestThreadedCSVTableRead::test_cancellation"
    # expects arrow-cpp headers to be bundled
    "--deselect=pyarrow/tests/test_cpp_internals.py::test_pyarrow_include"
  ] ++ lib.optionals stdenv.isDarwin [
    # Requires loopback networking
    "--deselect=pyarrow/tests/test_ipc.py::test_socket_"
    "--deselect=pyarrow/tests/test_flight.py::test_never_sends_data"
    "--deselect=pyarrow/tests/test_flight.py::test_large_descriptor"
    "--deselect=pyarrow/tests/test_flight.py::test_large_metadata_client"
    "--deselect=pyarrow/tests/test_flight.py::test_none_action_side_effect"
    # fails to compile
    "--deselect=pyarrow/tests/test_cython.py::test_cython_api"
  ] ++ lib.optionals (pythonAtLeast "3.11") [
    # Repr output is printing number instead of enum name so these tests fail
    "--deselect=pyarrow/tests/test_fs.py::test_get_file_info"
  ] ++ lib.optionals stdenv.isLinux [
    # this test requires local networking
    "--deselect=pyarrow/tests/test_fs.py::test_filesystem_from_uri_gcs"
  ];

  disabledTests = [ "GcsFileSystem" ];

  dontUseSetuptoolsCheck = true;

  # Delete everything under pyarrow/ except conftest.py and tests/, then move
  # the top-level conftest.py next to the tests and patch the relative import
  # that referred to it.
  # NOTE(review): presumably so the tests import the installed package rather
  # than the source tree - confirm.
  preCheck = ''
    shopt -s extglob
    rm -r pyarrow/!(conftest.py|tests)
    mv pyarrow/conftest.py pyarrow/tests/parent_conftest.py
    substituteInPlace pyarrow/tests/conftest.py --replace ..conftest .parent_conftest
  '' + lib.optionalString stdenv.isDarwin ''
    # OSError: [Errno 24] Too many open files
    ulimit -n 1024
  '';

  # Each submodule name is prefixed with "pyarrow." before being checked.
  pythonImportsCheck = [
    "pyarrow"
  ] ++ map (module: "pyarrow.${module}") ([
    "compute"
    "csv"
    "dataset"
    "feather"
    "fs"
    "hdfs"
    "json"
    "parquet"
  ] ++ lib.optionals arrow-cpp.enableFlight [
    # Flight is only built when arrow-cpp provides it (PYARROW_WITH_FLIGHT),
    # so only check the import in that case.
    "flight"
  ]);

  meta = with lib; {
    description = "A cross-language development platform for in-memory data";
    homepage = "https://arrow.apache.org/";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ veprbl cpcloud ];
  };
}