# Python bindings for Apache Arrow (pyarrow).
#
# The package is built from the `python/` subdirectory of the same source tree
# as `arrow-cpp` and links against that already-built C++ library; version and
# source are inherited from `arrow-cpp` so the two always match.
{ lib
, stdenv
, buildPythonPackage
, python
, pythonAtLeast
, pythonOlder
, arrow-cpp
, cffi
, cloudpickle
, cmake
, cython
, fsspec
, hypothesis
, numpy
, pandas
, pytestCheckHook
, pytest-lazy-fixture
, pkg-config
, scipy
, fetchpatch
, setuptools-scm
}:

let
  # The PYARROW_* feature switches below are set to 1/0 rather than to Nix
  # booleans; this helper does the conversion.
  zero_or_one = cond: if cond then 1 else 0;
in

buildPythonPackage rec {
  pname = "pyarrow";
  inherit (arrow-cpp) version src;

  disabled = pythonOlder "3.7";

  # Only the Python bindings inside the Arrow source tree are built here.
  sourceRoot = "apache-arrow-${version}/python";

  nativeBuildInputs = [
    cmake
    cython
    pkg-config
    setuptools-scm
  ];

  buildInputs = [ arrow-cpp ];

  propagatedBuildInputs = [
    cffi
    cloudpickle
    fsspec
    numpy
    scipy
  ];

  nativeCheckInputs = [
    hypothesis
    pandas
    pytestCheckHook
    pytest-lazy-fixture
  ];

  PYARROW_BUILD_TYPE = "release";

  # Feature switches. Flight, S3 and GCS follow whatever arrow-cpp was built
  # with; the remaining ones are fixed.
  PYARROW_WITH_DATASET = zero_or_one true;
  PYARROW_WITH_FLIGHT = zero_or_one arrow-cpp.enableFlight;
  PYARROW_WITH_HDFS = zero_or_one true;
  PYARROW_WITH_PARQUET = zero_or_one true;
  PYARROW_WITH_PARQUET_ENCRYPTION = zero_or_one true;
  # Plasma is deprecated since arrow 10.0.0
  PYARROW_WITH_PLASMA = zero_or_one false;
  PYARROW_WITH_S3 = zero_or_one arrow-cpp.enableS3;
  PYARROW_WITH_GCS = zero_or_one arrow-cpp.enableGcs;
  PYARROW_BUNDLE_ARROW_CPP_HEADERS = zero_or_one false;

  # Point the RPATH of the built extension modules at arrow-cpp's lib directory.
  PYARROW_CMAKE_OPTIONS = [
    "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib"
  ];

  ARROW_HOME = arrow-cpp;
  PARQUET_HOME = arrow-cpp;

  # Empty string when the check phase is disabled.
  ARROW_TEST_DATA = lib.optionalString doCheck arrow-cpp.ARROW_TEST_DATA;

  doCheck = true;

  # cmake is listed in nativeBuildInputs, but its configure hook is disabled
  # for this package.
  dontUseCmakeConfigure = true;

  __darwinAllowLocalNetworking = true;

  preBuild = ''
    export PYARROW_PARALLEL=$NIX_BUILD_CORES
  '';

  postInstall = ''
    # copy the pyarrow C++ header files to the appropriate location
    pyarrow_include="$out/${python.sitePackages}/pyarrow/include"
    mkdir -p "$pyarrow_include/arrow/python"
    find "$PWD/pyarrow/src/arrow" -type f -name '*.h' -exec cp {} "$pyarrow_include/arrow/python" \;
  '';

  pytestFlagsArray = [
    # A couple of tests are missing fixture imports, luckily pytest offers a
    # clean solution.
    # NOTE(review): pytest documents `--fixtures` as "show available fixtures";
    # confirm that the test suite is still collected and run with this flag.
    "--fixtures pyarrow/tests/conftest.py"
    # Deselect a single test because pyarrow prints a 2-line error message where
    # only a single line is expected. The additional line of output comes from
    # the glog library which is an optional dependency of arrow-cpp that is
    # enabled in nixpkgs.
    # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11393
    "--deselect=pyarrow/tests/test_memory.py::test_env_var"
    # these tests require access to s3 via the internet
    "--deselect=pyarrow/tests/test_fs.py::test_resolve_s3_region"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_options"
    # Flaky test
    "--deselect=pyarrow/tests/test_flight.py::test_roundtrip_errors"
    "--deselect=pyarrow/tests/test_pandas.py::test_threaded_pandas_import"
    # Flaky test, works locally but not on Hydra
    "--deselect=pyarrow/tests/test_csv.py::TestThreadedCSVTableRead::test_cancellation"
    # expects arrow-cpp headers to be bundled
    "--deselect=pyarrow/tests/test_cpp_internals.py::test_pyarrow_include"
  ] ++ lib.optionals stdenv.isDarwin [
    # Requires loopback networking
    "--deselect=pyarrow/tests/test_ipc.py::test_socket_"
    "--deselect=pyarrow/tests/test_flight.py::test_never_sends_data"
    "--deselect=pyarrow/tests/test_flight.py::test_large_descriptor"
    "--deselect=pyarrow/tests/test_flight.py::test_large_metadata_client"
    "--deselect=pyarrow/tests/test_flight.py::test_none_action_side_effect"
    # fails to compile
    "--deselect=pyarrow/tests/test_cython.py::test_cython_api"
  ] ++ lib.optionals (pythonAtLeast "3.11") [
    # Repr output is printing number instead of enum name so these tests fail
    "--deselect=pyarrow/tests/test_fs.py::test_get_file_info"
  ] ++ lib.optionals stdenv.isLinux [
    # this test requires local networking
    "--deselect=pyarrow/tests/test_fs.py::test_filesystem_from_uri_gcs"
  ];

  disabledTests = [ "GcsFileSystem" ];

  dontUseSetuptoolsCheck = true;

  # Remove everything under pyarrow/ except the tests and the top-level
  # conftest.py, then move that conftest into the tests directory and rewrite
  # the relative import that refers to it.
  preCheck = ''
    shopt -s extglob
    rm -r pyarrow/!(conftest.py|tests)
    mv pyarrow/conftest.py pyarrow/tests/parent_conftest.py
    substituteInPlace pyarrow/tests/conftest.py --replace ..conftest .parent_conftest
  '' + lib.optionalString stdenv.isDarwin ''
    # OSError: [Errno 24] Too many open files
    ulimit -n 1024
  '';

  # NOTE(review): "flight" is import-checked unconditionally although
  # PYARROW_WITH_FLIGHT depends on arrow-cpp.enableFlight; confirm this is
  # intended for builds where Flight is disabled.
  pythonImportsCheck = [
    "pyarrow"
  ] ++ map (module: "pyarrow.${module}") [
    "compute"
    "csv"
    "dataset"
    "feather"
    "flight"
    "fs"
    "hdfs"
    "json"
    "parquet"
  ];

  meta = with lib; {
    description = "A cross-language development platform for in-memory data";
    homepage = "https://arrow.apache.org/";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ veprbl cpcloud ];
  };
}