at master 143 lines 3.7 kB view raw
# RCCL ("ROCm communication collectives library") package expression.
#
# Arguments of note (everything else is a plain dependency):
#   buildTests - when true: adds a separate "test" output, passes
#                -DBUILD_TESTS=ON, pulls in chrpath, enables the sanitizer
#                flags below, and moves the installed binaries from
#                $out/bin into $test/bin.
#   gpuTargets - list of GPU targets forwarded to CMake; defaults to
#                clr.localGpuTargets, or [ ] when clr has no such attribute.
#                An empty list leaves the CMake target options unset.
{
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,
  cmake,
  rocm-cmake,
  rocm-smi,
  rocm-core,
  clr,
  mscclpp,
  perl,
  hipify,
  python3,
  gtest,
  chrpath,
  rocprofiler,
  rocprofiler-register,
  autoPatchelfHook,
  buildTests ? false,
  gpuTargets ? (clr.localGpuTargets or [ ]),
}:

let
  # Both sanitizers are currently tied to test builds.
  useAsan = buildTests;
  useUbsan = buildTests;

  # Sanitizer flags shared by the compile and link steps. Each sanitizer's
  # flag is gated on its own toggle so the two can be switched independently.
  # Every fragment keeps its trailing space because `san` is spliced directly
  # in front of the next flag in `compileFlags`.
  san = lib.optionalString (useAsan || useUbsan) (
    "-fno-gpu-sanitize "
    + lib.optionalString useUbsan "-fsanitize=undefined "
    + lib.optionalString useAsan "-fsanitize=address -shared-libsan "
  );

  # -O2 and -fno-strict-aliasing due to UB issues in RCCL :c
  # Reported upstream
  # Used verbatim for both CFLAGS and CXXFLAGS.
  compileFlags = "-I${clr}/include -O2 -fno-strict-aliasing ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";

  # CMake list syntax: targets separated by semicolons.
  gpuTargetList = lib.concatStringsSep ";" gpuTargets;
in
# Note: we can't properly test or make use of multi-node collective ops
# https://github.com/NixOS/nixpkgs/issues/366242 tracks kernel support
# kfd_peerdirect support which is on out-of-tree amdkfd in ROCm/ROCK-Kernel-Driver
# infiniband ib_peer_mem support isn't in the mainline kernel but is carried by some distros
stdenv.mkDerivation (finalAttrs: {
  pname = "rccl${clr.gpuArchSuffix}";
  version = "6.3.3";

  # Test binaries get their own output so they do not end up in $out.
  outputs = [
    "out"
  ]
  ++ lib.optionals buildTests [
    "test"
  ];

  patches = [
    ./fix-mainline-support-and-ub.diff
    ./enable-mscclpp-on-all-gfx9.diff
    ./rccl-test-missing-iomanip.diff
  ];

  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "rccl";
    rev = "rocm-${finalAttrs.version}";
    hash = "sha256-998tDiC0Qp9hhcXtFpiCWqwdKPVT2vNp0GU/rng03Bw=";
  };

  nativeBuildInputs = [
    cmake
    rocm-cmake
    clr
    perl
    hipify
    python3
    autoPatchelfHook # ASAN doesn't add rpath without this
  ];

  buildInputs = [
    rocm-smi
    gtest
    rocprofiler
    rocprofiler-register
    mscclpp
  ]
  ++ lib.optionals buildTests [
    chrpath
  ];

  cmakeFlags = [
    "-DHIP_CLANG_NUM_PARALLEL_JOBS=4"
    "-DCMAKE_BUILD_TYPE=Release"
    "-DROCM_PATH=${clr}"
    "-DHIP_COMPILER=${clr}/bin/amdclang++"
    "-DCMAKE_CXX_COMPILER=${clr}/bin/amdclang++"
    "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DROCM_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
    "-DENABLE_MSCCL_KERNEL=ON"
    "-DENABLE_MSCCLPP=ON"
    "-DMSCCLPP_ROOT=${mscclpp}"
    # Manually define CMAKE_INSTALL_<DIR>
    # See: https://github.com/NixOS/nixpkgs/pull/197838
    "-DCMAKE_INSTALL_BINDIR=bin"
    "-DCMAKE_INSTALL_LIBDIR=lib"
    "-DCMAKE_INSTALL_INCLUDEDIR=include"
  ]
  ++ lib.optionals (gpuTargets != [ ]) [
    # AMD can't make up their minds and keep changing which one is used in different projects.
    "-DAMDGPU_TARGETS=${gpuTargetList}"
    "-DGPU_TARGETS=${gpuTargetList}"
  ]
  ++ lib.optionals buildTests [
    "-DBUILD_TESTS=ON"
  ];

  env.CFLAGS = compileFlags;
  env.CXXFLAGS = compileFlags;
  # The sanitizer flags are passed to the linker as well; empty for non-test builds.
  env.LDFLAGS = san;

  postPatch = ''
    patchShebangs src tools
  '';

  postInstall =
    # Record the ASAN runtime as an explicit dependency of librccl.so.
    lib.optionalString useAsan ''
      patchelf --add-needed ${clr}/llvm/lib/linux/libclang_rt.asan-${stdenv.hostPlatform.parsed.cpu.name}.so $out/lib/librccl.so
    ''
    # Relocate everything installed under $out/bin into the "test" output.
    + lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/* $test/bin
      rmdir $out/bin
    '';

  passthru.updateScript = rocmUpdateScript {
    name = finalAttrs.pname;
    inherit (finalAttrs.src) owner repo;
  };

  meta = {
    description = "ROCm communication collectives library";
    homepage = "https://github.com/ROCm/rccl";
    license = [
      lib.licenses.bsd2
      lib.licenses.bsd3
    ];
    teams = [ lib.teams.rocm ];
    platforms = lib.platforms.linux;
  };
})