nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at devShellTools-shell 178 lines 8.0 kB view raw
1diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h 2index 8c5f081..9922b79 100644 3--- a/src/include/bootstrap.h 4+++ b/src/include/bootstrap.h 5@@ -10,11 +10,13 @@ 6 #include "nccl.h" 7 #include "comm.h" 8 9+// this is accessed through unaligned ptrs because ncclUniqueId is a typedef of char[128] 10 struct ncclBootstrapHandle { 11 uint64_t magic; 12 union ncclSocketAddress addr; 13 }; 14 static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); 15+static_assert(alignof(struct ncclBootstrapHandle) == alignof(ncclUniqueId), "Bootstrap handle must have same alignment as NCCL unique ID to avoid UB"); 16 17 ncclResult_t bootstrapNetInit(); 18 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); 19diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc 20index b3063d5..464b80d 100644 21--- a/src/misc/rocmwrap.cc 22+++ b/src/misc/rocmwrap.cc 23@@ -131,9 +131,12 @@ static void initOnceFunc() { 24 //format and store the kernel conf file location 25 snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release); 26 fp = fopen(kernel_conf_file, "r"); 27- if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file"); 28+ if (fp == NULL) { 29+ INFO(NCCL_INIT,"Could not open kernel conf file, will assume CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA are enabled"); 30+ } 31 //look for kernel_opt1 and kernel_opt2 in the conf file and check 32- while (fgets(buf, sizeof(buf), fp) != NULL) { 33+ // FIXME: This check is broken, CONFIG_DMABUF_MOVE_NOTIFY could be across a buf boundary. 34+ while (fp && fgets(buf, sizeof(buf), fp) != NULL) { 35 if (strstr(buf, kernel_opt1) != NULL) { 36 found_opt1 = 1; 37 INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release); 38@@ -143,11 +146,12 @@ static void initOnceFunc() { 39 INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release); 40 } 41 } 42- if (!found_opt1 || !found_opt2) { 43+ if (fp && (!found_opt1 || !found_opt2)) { 44 dmaBufSupport = 0; 45 INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release); 46 INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support"); 47 } 48+ if (fp) fclose(fp); 49 50 if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled"); 51 else goto error; 52diff --git a/src/nccl.h.in b/src/nccl.h.in 53index 1d127b0..6296073 100644 54--- a/src/nccl.h.in 55+++ b/src/nccl.h.in 56@@ -39,7 +39,7 @@ typedef struct ncclComm* ncclComm_t; 57 #define NCCL_UNIQUE_ID_BYTES 128 58 /*! @brief Opaque unique id used to initialize communicators 59 @details The ncclUniqueId must be passed to all participating ranks */ 60-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId; 61+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId; 62 63 /*! @defgroup rccl_result_code Result Codes 64 @details The various result codes that RCCL API calls may return 65diff --git a/src/proxy.cc b/src/proxy.cc 66index 50e5437..51bb401 100644 67--- a/src/proxy.cc 68+++ b/src/proxy.cc 69@@ -965,7 +965,11 @@ struct ncclProxyConnectionPool { 70 71 static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { 72 if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { 73- NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); 74+ if (pool->pools) { 75+ NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); 76+ } else { 77+ NCCLCHECK(ncclCalloc(&pool->pools, pool->banks+1)); 78+ } 79 NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE)); 80 pool->banks++; 81 pool->offset = 0; 82diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc 83index 6d77784..49762d3 100644 84--- a/src/transport/net_ib.cc 85+++ b/src/transport/net_ib.cc 86@@ -573,7 +573,7 @@ ncclResult_t ncclIbGdrSupport() { 87 // Requires support from NIC driver modules 88 // Use ONLY for debugging! 89 moduleLoaded = 1; 90- INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1"); 91+ INFO(NCCL_INIT, "ncclIbGdrSupport: RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1"); 92 } 93 94 if (moduleLoaded == -1) { 95@@ -586,13 +586,14 @@ ncclResult_t ncclIbGdrSupport() { 96 // or created under a different path like `/sys/kernel/` or `/sys/` (depending on your ib_peer_mem module) 97 const char* memory_peers_paths[] = {"/sys/kernel/mm/memory_peers/amdkfd/version", 98 "/sys/kernel/memory_peers/amdkfd/version", 99- "/sys/memory_peers/amdkfd/version"}; 100+ "/sys/memory_peers/amdkfd/version", 101+ NULL}; 102 int i = 0; 103 104 while (memory_peers_paths[i]) { 105 if (access(memory_peers_paths[i], F_OK) == 0) { 106 moduleLoaded = 1; 107- INFO(NCCL_INIT,"Found %s", memory_peers_paths[i]); 108+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found %s", memory_peers_paths[i]); 109 break; 110 } else { 111 moduleLoaded = 0; 112@@ -612,22 +613,23 @@ ncclResult_t ncclIbGdrSupport() { 113 if (moduleLoaded == 0) { 114 // Check for `ib_register_peer_memory_client` symbol in `/proc/kallsyms` 115 // if your system uses native OS ib_peer module 116- char buf[256]; 117- FILE *fp = NULL; 118- fp = fopen("/proc/kallsyms", "r"); 119+ FILE *fp = fopen("/proc/kallsyms", "r"); 120+ char *line = NULL; 121+ size_t len = 0; 122 123 if (fp == NULL) { 124- INFO(NCCL_INIT,"Could not open /proc/kallsyms"); 125+ INFO(NCCL_INIT,"ncclIbGdrSupport: Could not open /proc/kallsyms to check for ib_register_peer_memory_client"); 126 } else { 127- while (fgets(buf, sizeof(buf), fp) != NULL) { 128- if (strstr(buf, "t ib_register_peer_memory_client") != NULL || 129- strstr(buf, "T ib_register_peer_memory_client") != NULL) { 130+ while (getline(&line, &len, fp) > 0) { 131+ if (line && strstr(line, "ib_register_peer_memory_client") != NULL) { 132 moduleLoaded = 1; 133- INFO(NCCL_INIT,"Found ib_register_peer_memory_client in /proc/kallsyms"); 134+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found ib_register_peer_memory_client in /proc/kallsyms"); 135 break; 136 } 137 } 138 } 139+ if (line) free(line); 140+ if (fp) fclose(fp); 141 } 142 #else 143 // Check for the nv_peer_mem module being loaded 144@@ -637,7 +639,7 @@ ncclResult_t ncclIbGdrSupport() { 145 #endif 146 } 147 if (moduleLoaded == 0) { 148- INFO(NCCL_INIT,"GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol"); 149+ INFO(NCCL_INIT,"ncclIbGdrSupport: GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol"); 150 return ncclSystemError; 151 } 152 return ncclSuccess; 153diff --git a/tools/ib-test/include/nccl.h b/tools/ib-test/include/nccl.h 154index 2c86c33..5801c61 100755 155--- a/tools/ib-test/include/nccl.h 156+++ b/tools/ib-test/include/nccl.h 157@@ -31,7 +31,7 @@ extern "C" { 158 typedef struct ncclComm* ncclComm_t; 159 160 #define NCCL_UNIQUE_ID_BYTES 128 161-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; 162+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; 163 164 /* Error type */ 165 typedef enum { ncclSuccess = 0, 166diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h 167index 729561b..4e4bdd9 100644 168--- a/tools/topo_expl/include/nccl.h 169+++ b/tools/topo_expl/include/nccl.h 170@@ -35,7 +35,7 @@ typedef struct ncclComm* ncclComm_t; 171 #define NCCL_COMM_NULL NULL 172 173 #define NCCL_UNIQUE_ID_BYTES 128 174-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; 175+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; 176 177 /*! @brief Error type */ 178 typedef enum { ncclSuccess = 0,