{
  # Nixpkgs plumbing
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,

  # Build-time tools
  cmake,
  rocm-cmake,
  perl,
  python3,
  hipify,
  chrpath,
  autoPatchelfHook,

  # ROCm stack and libraries
  clr,
  rocm-core,
  rocm-smi,
  rocprofiler,
  rocprofiler-register,
  mscclpp,
  gtest,

  # Tunables
  # buildTests: adds the "test" output, passes -DBUILD_TESTS=ON to CMake and
  # switches on the sanitizer flags.
  buildTests ? false,
  # gpuTargets: GPU architectures handed to CMake as AMDGPU_TARGETS/GPU_TARGETS;
  # when the list is empty neither flag is passed.
  gpuTargets ? (clr.localGpuTargets or [ ]),
}:
23
let
  # Both sanitizers are currently switched on together with the test build,
  # but each toggle is honoured independently below.
  useAsan = buildTests;
  useUbsan = buildTests;
  # Sanitizer flags shared by the compiler and linker invocations; empty when
  # no sanitizer is enabled.
  # Every fragment ends with a space because `san` is spliced directly in
  # front of further flags in env.CFLAGS / env.CXXFLAGS.
  san = lib.optionalString (useAsan || useUbsan) (
    "-fno-gpu-sanitize "
    + lib.optionalString useUbsan "-fsanitize=undefined "
    + lib.optionalString useAsan "-fsanitize=address -shared-libsan "
  );
in
# Note: we can't properly test or make use of multi-node collective ops yet.
# https://github.com/NixOS/nixpkgs/issues/366242 tracks the missing kernel support:
#  - kfd_peerdirect support is only in the out-of-tree amdkfd in ROCm/ROCK-Kernel-Driver
#  - InfiniBand ib_peer_mem support isn't in the mainline kernel but is carried by some distros
stdenv.mkDerivation (finalAttrs: {
  pname = "rccl${clr.gpuArchSuffix}";
  version = "6.3.3";

  # With buildTests, postInstall moves everything from $out/bin into a
  # separate "test" output.
  outputs = [
    "out"
  ]
  ++ lib.optionals buildTests [
    "test"
  ];

  patches = [
    ./fix-mainline-support-and-ub.diff
    ./enable-mscclpp-on-all-gfx9.diff
    ./rccl-test-missing-iomanip.diff
  ];

  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "rccl";
    rev = "rocm-${finalAttrs.version}";
    hash = "sha256-998tDiC0Qp9hhcXtFpiCWqwdKPVT2vNp0GU/rng03Bw=";
  };

  nativeBuildInputs = [
    cmake
    rocm-cmake
    clr
    perl
    hipify
    python3
    autoPatchelfHook # ASAN doesn't add rpath without this
  ]
  # chrpath is a command-line tool rather than a library to link against, so
  # it belongs with the build-time tools instead of buildInputs.
  ++ lib.optionals buildTests [
    chrpath
  ];

  buildInputs = [
    rocm-smi
    gtest
    rocprofiler
    rocprofiler-register
    mscclpp
  ];

  cmakeFlags = [
    "-DHIP_CLANG_NUM_PARALLEL_JOBS=4"
    "-DCMAKE_BUILD_TYPE=Release"
    "-DROCM_PATH=${clr}"
    "-DHIP_COMPILER=${clr}/bin/amdclang++"
    "-DCMAKE_CXX_COMPILER=${clr}/bin/amdclang++"
    "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DROCM_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
    "-DENABLE_MSCCL_KERNEL=ON"
    "-DENABLE_MSCCLPP=ON"
    "-DMSCCLPP_ROOT=${mscclpp}"
    # Manually define CMAKE_INSTALL_<DIR>
    # See: https://github.com/NixOS/nixpkgs/pull/197838
    "-DCMAKE_INSTALL_BINDIR=bin"
    "-DCMAKE_INSTALL_LIBDIR=lib"
    "-DCMAKE_INSTALL_INCLUDEDIR=include"
  ]
  ++ lib.optionals (gpuTargets != [ ]) [
    # AMD can't make up their minds and keep changing which one is used in different projects.
    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
    "-DGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
  ]
  ++ lib.optionals buildTests [
    "-DBUILD_TESTS=ON"
  ];

  env =
    let
      # -O2 and -fno-strict-aliasing due to UB issues in RCCL :c
      # Reported upstream
      # `san` is either empty or ends with a space, hence no separator after it.
      # One binding feeds both C and C++ so the two flag sets cannot drift apart.
      compileFlags = "-I${clr}/include -O2 -fno-strict-aliasing ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";
    in
    {
      CFLAGS = compileFlags;
      CXXFLAGS = compileFlags;
      # The linker gets the same sanitizer flags as the compiler.
      LDFLAGS = san;
    };

  postPatch = ''
    patchShebangs src tools
  '';

  postInstall =
    # With ASAN enabled, record the ASAN runtime as a NEEDED entry of librccl.so.
    lib.optionalString useAsan ''
      patchelf --add-needed ${clr}/llvm/lib/linux/libclang_rt.asan-${stdenv.hostPlatform.parsed.cpu.name}.so $out/lib/librccl.so
    ''
    # Relocate everything installed under $out/bin into the "test" output and
    # drop the then-empty directory.
    + lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/* $test/bin
      rmdir $out/bin
    '';

  passthru.updateScript = rocmUpdateScript {
    name = finalAttrs.pname;
    inherit (finalAttrs.src) owner repo;
  };

  meta = {
    description = "ROCm communication collectives library";
    homepage = "https://github.com/ROCm/rccl";
    license = with lib.licenses; [
      bsd2
      bsd3
    ];
    teams = [ lib.teams.rocm ];
    platforms = lib.platforms.linux;
  };
})