1{ stdenv
2, lib
3, fetchurl
4, fetchFromGitHub
5, fixDarwinDylibNames
6, autoconf
7, aws-sdk-cpp
8, boost
9, brotli
10, c-ares
11, cmake
12, crc32c
13, curl
14, flatbuffers
15, gflags
16, glog
17, google-cloud-cpp
18, grpc
19, gtest
20, libbacktrace
21, lz4
22, minio
23, ninja
24, nlohmann_json
25, openssl
26, perl
27, protobuf
28, python3
29, rapidjson
30, re2
31, snappy
32, sqlite
33, thrift
34, tzdata
35, utf8proc
36, which
37, zlib
38, zstd
39, enableShared ? !stdenv.hostPlatform.isStatic
40, enableFlight ? true
41, enableJemalloc ? !stdenv.isDarwin
42, enableS3 ? true
43, enableGcs ? !stdenv.isDarwin
44}:
45
46assert lib.asserts.assertMsg
47 ((enableS3 && stdenv.isDarwin) -> (lib.versionOlder boost.version "1.69" || lib.versionAtLeast boost.version "1.70"))
48 "S3 on Darwin requires Boost != 1.69";
49
50let
51 arrow-testing = fetchFromGitHub {
52 name = "arrow-testing";
53 owner = "apache";
54 repo = "arrow-testing";
55 rev = "47f7b56b25683202c1fd957668e13f2abafc0f12";
56 hash = "sha256-ZDznR+yi0hm5O1s9as8zq5nh1QxJ8kXCRwbNQlzXpnI=";
57 };
58
59 parquet-testing = fetchFromGitHub {
60 name = "parquet-testing";
61 owner = "apache";
62 repo = "parquet-testing";
63 rev = "b2e7cc755159196e3a068c8594f7acbaecfdaaac";
64 hash = "sha256-IFvGTOkaRSNgZOj8DziRj88yH5JRF+wgSDZ5N0GNvjk=";
65 };
66
67 aws-sdk-cpp-arrow = aws-sdk-cpp.override {
68 apis = [
69 "cognito-identity"
70 "config"
71 "identity-management"
72 "s3"
73 "sts"
74 "transfer"
75 ];
76 };
77
78in
79stdenv.mkDerivation rec {
80 pname = "arrow-cpp";
81 version = "14.0.1";
82
83 src = fetchurl {
84 url = "mirror://apache/arrow/arrow-${version}/apache-arrow-${version}.tar.gz";
85 hash = "sha256-XHDq+xAR+dEkuvsyiv5U9izFuSgLcIDh49Zo94wOQH4=";
86 };
87
88 sourceRoot = "apache-arrow-${version}/cpp";
89
90 # versions are all taken from
91 # https://github.com/apache/arrow/blob/apache-arrow-${version}/cpp/thirdparty/versions.txt
92
93 # jemalloc: arrow uses a custom prefix to prevent default allocator symbol
94 # collisions as well as custom build flags
95 ${if enableJemalloc then "ARROW_JEMALLOC_URL" else null} = fetchurl {
96 url = "https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2";
97 hash = "sha256-LbgtHnEZ3z5xt2QCGbbf6EeJvAU3mDw7esT3GJrs/qo=";
98 };
99
100 # mimalloc: arrow uses custom build flags for mimalloc
101 ARROW_MIMALLOC_URL = fetchFromGitHub {
102 owner = "microsoft";
103 repo = "mimalloc";
104 rev = "v2.0.6";
105 hash = "sha256-u2ITXABBN/dwU+mCIbL3tN1f4c17aBuSdNTV+Adtohc=";
106 };
107
108 ARROW_XSIMD_URL = fetchFromGitHub {
109 owner = "xtensor-stack";
110 repo = "xsimd";
111 rev = "9.0.1";
112 hash = "sha256-onALN6agtrHWigtFlCeefD9CiRZI4Y690XTzy2UDnrk=";
113 };
114
115 ARROW_SUBSTRAIT_URL = fetchFromGitHub {
116 owner = "substrait-io";
117 repo = "substrait";
118 rev = "v0.27.0";
119 hash = "sha256-wptEAXembah04pzqAz6UHeUxp+jMf6Lh/IdyuIhy/a8=";
120 };
121
122 nativeBuildInputs = [
123 cmake
124 ninja
125 autoconf # for vendored jemalloc
126 flatbuffers
127 ] ++ lib.optional stdenv.isDarwin fixDarwinDylibNames;
128 buildInputs = [
129 boost
130 brotli
131 flatbuffers
132 gflags
133 glog
134 gtest
135 libbacktrace
136 lz4
137 nlohmann_json # alternative JSON parser to rapidjson
138 protobuf # substrait requires protobuf
139 rapidjson
140 re2
141 snappy
142 thrift
143 utf8proc
144 zlib
145 zstd
146 ] ++ lib.optionals enableFlight [
147 grpc
148 openssl
149 protobuf
150 sqlite
151 ] ++ lib.optionals enableS3 [ aws-sdk-cpp-arrow openssl ]
152 ++ lib.optionals enableGcs [
153 crc32c
154 curl
155 google-cloud-cpp
156 grpc
157 nlohmann_json
158 ];
159
160 preConfigure = ''
161 patchShebangs build-support/
162 substituteInPlace "src/arrow/vendored/datetime/tz.cpp" \
163 --replace 'discover_tz_dir();' '"${tzdata}/share/zoneinfo";'
164 '';
165
166 cmakeFlags = [
167 "-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON"
168 "-DARROW_BUILD_SHARED=${if enableShared then "ON" else "OFF"}"
169 "-DARROW_BUILD_STATIC=${if enableShared then "OFF" else "ON"}"
170 "-DARROW_BUILD_TESTS=ON"
171 "-DARROW_BUILD_INTEGRATION=ON"
172 "-DARROW_BUILD_UTILITIES=ON"
173 "-DARROW_EXTRA_ERROR_CONTEXT=ON"
174 "-DARROW_VERBOSE_THIRDPARTY_BUILD=ON"
175 "-DARROW_DEPENDENCY_SOURCE=SYSTEM"
176 "-Dxsimd_SOURCE=AUTO"
177 "-DARROW_DEPENDENCY_USE_SHARED=${if enableShared then "ON" else "OFF"}"
178 "-DARROW_COMPUTE=ON"
179 "-DARROW_CSV=ON"
180 "-DARROW_DATASET=ON"
181 "-DARROW_FILESYSTEM=ON"
182 "-DARROW_FLIGHT_SQL=${if enableFlight then "ON" else "OFF"}"
183 "-DARROW_HDFS=ON"
184 "-DARROW_IPC=ON"
185 "-DARROW_JEMALLOC=${if enableJemalloc then "ON" else "OFF"}"
186 "-DARROW_JSON=ON"
187 "-DARROW_USE_GLOG=ON"
188 "-DARROW_WITH_BACKTRACE=ON"
189 "-DARROW_WITH_BROTLI=ON"
190 "-DARROW_WITH_LZ4=ON"
191 "-DARROW_WITH_NLOHMANN_JSON=ON"
192 "-DARROW_WITH_SNAPPY=ON"
193 "-DARROW_WITH_UTF8PROC=ON"
194 "-DARROW_WITH_ZLIB=ON"
195 "-DARROW_WITH_ZSTD=ON"
196 "-DARROW_MIMALLOC=ON"
197 "-DARROW_SUBSTRAIT=ON"
198 "-DARROW_FLIGHT=${if enableFlight then "ON" else "OFF"}"
199 "-DARROW_FLIGHT_TESTING=${if enableFlight then "ON" else "OFF"}"
200 "-DARROW_S3=${if enableS3 then "ON" else "OFF"}"
201 "-DARROW_GCS=${if enableGcs then "ON" else "OFF"}"
202 # Parquet options:
203 "-DARROW_PARQUET=ON"
204 "-DPARQUET_BUILD_EXECUTABLES=ON"
205 "-DPARQUET_REQUIRE_ENCRYPTION=ON"
206 ] ++ lib.optionals (!enableShared) [
207 "-DARROW_TEST_LINKAGE=static"
208 ] ++ lib.optionals stdenv.isDarwin [
209 "-DCMAKE_INSTALL_RPATH=@loader_path/../lib" # needed for tools executables
210 ] ++ lib.optionals (!stdenv.isx86_64) [ "-DARROW_USE_SIMD=OFF" ]
211 ++ lib.optionals enableS3 [ "-DAWSSDK_CORE_HEADER_FILE=${aws-sdk-cpp-arrow}/include/aws/core/Aws.h" ];
212
213 doInstallCheck = true;
214 ARROW_TEST_DATA = lib.optionalString doInstallCheck "${arrow-testing}/data";
215 PARQUET_TEST_DATA = lib.optionalString doInstallCheck "${parquet-testing}/data";
216 GTEST_FILTER =
217 let
218 # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11398
219 filteredTests = lib.optionals stdenv.hostPlatform.isAarch64 [
220 "TestFilterKernelWithNumeric/3.CompareArrayAndFilterRandomNumeric"
221 "TestFilterKernelWithNumeric/7.CompareArrayAndFilterRandomNumeric"
222 "TestCompareKernel.PrimitiveRandomTests"
223 ] ++ lib.optionals enableS3 [
224 "S3OptionsTest.FromUri"
225 "S3RegionResolutionTest.NonExistentBucket"
226 "S3RegionResolutionTest.PublicBucket"
227 "S3RegionResolutionTest.RestrictedBucket"
228 "TestMinioServer.Connect"
229 "TestS3FS.*"
230 "TestS3FSGeneric.*"
231 ] ++ lib.optionals stdenv.isDarwin [
232 # TODO: revisit at 12.0.0 or when
233 # https://github.com/apache/arrow/commit/295c6644ca6b67c95a662410b2c7faea0920c989
234 # is available, see
235 # https://github.com/apache/arrow/pull/15288#discussion_r1071244661
236 "ExecPlanExecution.StressSourceSinkStopped"
237 ];
238 in
239 lib.optionalString doInstallCheck "-${lib.concatStringsSep ":" filteredTests}";
240
241 __darwinAllowLocalNetworking = true;
242
243 nativeInstallCheckInputs = [ perl which sqlite ]
244 ++ lib.optionals enableS3 [ minio ]
245 ++ lib.optionals enableFlight [ python3 ];
246
247 disabledTests = [
248 # requires networking
249 "arrow-gcsfs-test"
250 "arrow-flight-integration-test"
251 ];
252
253 installCheckPhase = ''
254 runHook preInstallCheck
255
256 ctest -L unittest --exclude-regex '^(${lib.concatStringsSep "|" disabledTests})$'
257
258 runHook postInstallCheck
259 '';
260
261 meta = with lib; {
262 description = "A cross-language development platform for in-memory data";
263 homepage = "https://arrow.apache.org/docs/cpp/";
264 license = licenses.asl20;
265 platforms = platforms.unix;
266 maintainers = with maintainers; [ tobim veprbl cpcloud ];
267 };
268 passthru = {
269 inherit enableFlight enableJemalloc enableS3 enableGcs;
270 };
271}