# nixpkgs mirror (for testing)
# github.com/NixOS/nixpkgs
# nix
# RCCL (ROCm communication collectives library) package expression.
#
# Arguments of note:
#   buildTests  - when true, passes -DBUILD_TESTS=ON to CMake, enables the
#                 ASAN and UBSAN sanitizer flags below, and moves everything
#                 installed under $out/bin into a separate `test` output.
#   gpuTargets  - list of GPU targets handed to CMake; defaults to
#                 `clr.localGpuTargets` when that attribute exists, otherwise
#                 to the empty list (in which case no target flags are passed).
{
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,
  cmake,
  rocm-cmake,
  rocm-smi,
  rocm-core,
  clr,
  mscclpp,
  perl,
  hipify,
  gtest,
  chrpath,
  rocprofiler,
  rocprofiler-register,
  autoPatchelfHook,
  buildTests ? false,
  gpuTargets ? (clr.localGpuTargets or [ ]),
}:

let
  # Both sanitizers are currently tied to the test build.
  useAsan = buildTests;
  useUbsan = buildTests;

  # Sanitizer flags shared by the compile and link steps. Each sanitizer only
  # contributes its own flag, so the two toggles above can be changed
  # independently. The value is either empty or ends in a space, which is why
  # it is spliced into `commonFlags` without a separator.
  # NOTE(review): -fno-gpu-sanitize presumably keeps the sanitizers off the
  # GPU device code - confirm against the Clang documentation.
  san = lib.optionalString (useAsan || useUbsan) (
    "-fno-gpu-sanitize "
    + lib.optionalString useUbsan "-fsanitize=undefined "
    + lib.optionalString useAsan "-fsanitize=address -shared-libsan "
  );

  # -O2 and -fno-strict-aliasing due to UB issues in RCCL :c
  # Reported upstream
  commonFlags = "-I${clr}/include -O2 -fno-strict-aliasing ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";

  # CMake list syntax: entries are separated by semicolons.
  gpuTargetList = lib.concatStringsSep ";" gpuTargets;
in
# Note: multi-node collective ops can't be properly tested or used here.
# https://github.com/NixOS/nixpkgs/issues/366242 tracks kernel support:
# - kfd_peerdirect support lives in the out-of-tree amdkfd in ROCm/ROCK-Kernel-Driver
# - infiniband ib_peer_mem support isn't in the mainline kernel but is carried by some distros
stdenv.mkDerivation (finalAttrs: {
  pname = "rccl${clr.gpuArchSuffix}";
  version = "6.3.3";

  # Test binaries get their own output so `out` only carries the library.
  outputs = [
    "out"
  ]
  ++ lib.optionals buildTests [
    "test"
  ];

  patches = [
    ./fix-mainline-support-and-ub.diff
    ./enable-mscclpp-on-all-gfx9.diff
    ./rccl-test-missing-iomanip.diff
  ];

  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "rccl";
    rev = "rocm-${finalAttrs.version}";
    hash = "sha256-998tDiC0Qp9hhcXtFpiCWqwdKPVT2vNp0GU/rng03Bw=";
  };

  nativeBuildInputs = [
    cmake
    rocm-cmake
    clr
    perl
    hipify
    autoPatchelfHook # ASAN doesn't add rpath without this
  ];

  buildInputs = [
    rocm-smi
    gtest
    rocprofiler
    rocprofiler-register
    mscclpp
  ]
  ++ lib.optionals buildTests [
    # NOTE(review): chrpath is not invoked anywhere in this expression;
    # confirm the test build still needs it.
    chrpath
  ];

  cmakeFlags = [
    "-DHIP_CLANG_NUM_PARALLEL_JOBS=4"
    "-DCMAKE_BUILD_TYPE=Release"
    "-DROCM_PATH=${clr}"
    "-DHIP_COMPILER=${clr}/bin/amdclang++"
    "-DCMAKE_CXX_COMPILER=${clr}/bin/amdclang++"
    "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DROCM_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
    "-DENABLE_MSCCL_KERNEL=ON"
    "-DENABLE_MSCCLPP=ON"
    "-DMSCCLPP_ROOT=${mscclpp}"
    # Manually define CMAKE_INSTALL_<DIR>
    # See: https://github.com/NixOS/nixpkgs/pull/197838
    "-DCMAKE_INSTALL_BINDIR=bin"
    "-DCMAKE_INSTALL_LIBDIR=lib"
    "-DCMAKE_INSTALL_INCLUDEDIR=include"
  ]
  ++ lib.optionals (gpuTargets != [ ]) [
    # AMD can't make up their minds and keep changing which one is used in different projects.
    "-DAMDGPU_TARGETS=${gpuTargetList}"
    "-DGPU_TARGETS=${gpuTargetList}"
  ]
  ++ lib.optionals buildTests [
    "-DBUILD_TESTS=ON"
  ];

  # C and C++ are compiled with the same flag set; the linker only needs the
  # sanitizer flags.
  env.CFLAGS = commonFlags;
  env.CXXFLAGS = commonFlags;
  env.LDFLAGS = san;

  postPatch = ''
    patchShebangs src tools
  '';

  postInstall =
    # Record the ASAN runtime as a DT_NEEDED entry of the installed library.
    lib.optionalString useAsan ''
      patchelf --add-needed ${clr}/llvm/lib/linux/libclang_rt.asan-${stdenv.hostPlatform.parsed.cpu.name}.so $out/lib/librccl.so
    ''
    # Relocate everything installed under bin/ into the `test` output.
    + lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/* $test/bin
      rmdir $out/bin
    '';

  passthru.updateScript = rocmUpdateScript {
    name = finalAttrs.pname;
    inherit (finalAttrs.src) owner repo;
  };

  meta = {
    description = "ROCm communication collectives library";
    homepage = "https://github.com/ROCm/rccl";
    license = with lib.licenses; [
      bsd2
      bsd3
    ];
    teams = [ lib.teams.rocm ];
    platforms = lib.platforms.linux;
  };
})