nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
1diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
2index 8c5f081..9922b79 100644
3--- a/src/include/bootstrap.h
4+++ b/src/include/bootstrap.h
5@@ -10,11 +10,13 @@
6 #include "nccl.h"
7 #include "comm.h"
8
9+// This struct is accessed through pointers into ncclUniqueId storage (a struct wrapping char[128]), so its alignment must match ncclUniqueId's.
10 struct ncclBootstrapHandle {
11 uint64_t magic;
12 union ncclSocketAddress addr;
13 };
14 static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
15+static_assert(alignof(struct ncclBootstrapHandle) == alignof(ncclUniqueId), "Bootstrap handle must have same alignment as NCCL unique ID to avoid UB");
16
17 ncclResult_t bootstrapNetInit();
18 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
19diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc
20index b3063d5..464b80d 100644
21--- a/src/misc/rocmwrap.cc
22+++ b/src/misc/rocmwrap.cc
23@@ -131,9 +131,12 @@ static void initOnceFunc() {
24 //format and store the kernel conf file location
25 snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
26 fp = fopen(kernel_conf_file, "r");
27- if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
28+ if (fp == NULL) {
29+ INFO(NCCL_INIT,"Could not open kernel conf file, will assume CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA are enabled");
30+ }
31 //look for kernel_opt1 and kernel_opt2 in the conf file and check
32- while (fgets(buf, sizeof(buf), fp) != NULL) {
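33+ // FIXME: This check is unreliable: fgets reads at most sizeof(buf)-1 bytes per call, so an option name such as CONFIG_DMABUF_MOVE_NOTIFY can be split across two reads and missed by strstr.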
34+ while (fp && fgets(buf, sizeof(buf), fp) != NULL) {
35 if (strstr(buf, kernel_opt1) != NULL) {
36 found_opt1 = 1;
37 INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
38@@ -143,11 +146,12 @@ static void initOnceFunc() {
39 INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
40 }
41 }
42- if (!found_opt1 || !found_opt2) {
43+ if (fp && (!found_opt1 || !found_opt2)) {
44 dmaBufSupport = 0;
45 INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
46 INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
47 }
48+ if (fp) fclose(fp);
49
50 if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
51 else goto error;
52diff --git a/src/nccl.h.in b/src/nccl.h.in
53index 1d127b0..6296073 100644
54--- a/src/nccl.h.in
55+++ b/src/nccl.h.in
56@@ -39,7 +39,7 @@ typedef struct ncclComm* ncclComm_t;
57 #define NCCL_UNIQUE_ID_BYTES 128
58 /*! @brief Opaque unique id used to initialize communicators
59 @details The ncclUniqueId must be passed to all participating ranks */
60-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId;
61+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId;
62
63 /*! @defgroup rccl_result_code Result Codes
64 @details The various result codes that RCCL API calls may return
65diff --git a/src/proxy.cc b/src/proxy.cc
66index 50e5437..51bb401 100644
67--- a/src/proxy.cc
68+++ b/src/proxy.cc
69@@ -965,7 +965,11 @@ struct ncclProxyConnectionPool {
70
71 static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
72 if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) {
73- NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
74+ if (pool->pools) {
75+ NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
76+ } else {
77+ NCCLCHECK(ncclCalloc(&pool->pools, pool->banks+1));
78+ }
79 NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE));
80 pool->banks++;
81 pool->offset = 0;
82diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
83index 6d77784..49762d3 100644
84--- a/src/transport/net_ib.cc
85+++ b/src/transport/net_ib.cc
86@@ -573,7 +573,7 @@ ncclResult_t ncclIbGdrSupport() {
87 // Requires support from NIC driver modules
88 // Use ONLY for debugging!
89 moduleLoaded = 1;
90- INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
91+ INFO(NCCL_INIT, "ncclIbGdrSupport: RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
92 }
93
94 if (moduleLoaded == -1) {
95@@ -586,13 +586,14 @@ ncclResult_t ncclIbGdrSupport() {
96 // or created under a different path like `/sys/kernel/` or `/sys/` (depending on your ib_peer_mem module)
97 const char* memory_peers_paths[] = {"/sys/kernel/mm/memory_peers/amdkfd/version",
98 "/sys/kernel/memory_peers/amdkfd/version",
99- "/sys/memory_peers/amdkfd/version"};
100+ "/sys/memory_peers/amdkfd/version",
101+ NULL};
102 int i = 0;
103
104 while (memory_peers_paths[i]) {
105 if (access(memory_peers_paths[i], F_OK) == 0) {
106 moduleLoaded = 1;
107- INFO(NCCL_INIT,"Found %s", memory_peers_paths[i]);
108+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found %s", memory_peers_paths[i]);
109 break;
110 } else {
111 moduleLoaded = 0;
112@@ -612,22 +613,23 @@ ncclResult_t ncclIbGdrSupport() {
113 if (moduleLoaded == 0) {
114 // Check for `ib_register_peer_memory_client` symbol in `/proc/kallsyms`
115 // if your system uses native OS ib_peer module
116- char buf[256];
117- FILE *fp = NULL;
118- fp = fopen("/proc/kallsyms", "r");
119+ FILE *fp = fopen("/proc/kallsyms", "r");
120+ char *line = NULL;
121+ size_t len = 0;
122
123 if (fp == NULL) {
124- INFO(NCCL_INIT,"Could not open /proc/kallsyms");
125+ INFO(NCCL_INIT,"ncclIbGdrSupport: Could not open /proc/kallsyms to check for ib_register_peer_memory_client");
126 } else {
127- while (fgets(buf, sizeof(buf), fp) != NULL) {
128- if (strstr(buf, "t ib_register_peer_memory_client") != NULL ||
129- strstr(buf, "T ib_register_peer_memory_client") != NULL) {
130+ while (getline(&line, &len, fp) > 0) {
131+ if (line && strstr(line, "ib_register_peer_memory_client") != NULL) {
132 moduleLoaded = 1;
133- INFO(NCCL_INIT,"Found ib_register_peer_memory_client in /proc/kallsyms");
134+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found ib_register_peer_memory_client in /proc/kallsyms");
135 break;
136 }
137 }
138 }
139+ if (line) free(line);
140+ if (fp) fclose(fp);
141 }
142 #else
143 // Check for the nv_peer_mem module being loaded
144@@ -637,7 +639,7 @@ ncclResult_t ncclIbGdrSupport() {
145 #endif
146 }
147 if (moduleLoaded == 0) {
148- INFO(NCCL_INIT,"GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol");
149+ INFO(NCCL_INIT,"ncclIbGdrSupport: GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol");
150 return ncclSystemError;
151 }
152 return ncclSuccess;
153diff --git a/tools/ib-test/include/nccl.h b/tools/ib-test/include/nccl.h
154index 2c86c33..5801c61 100755
155--- a/tools/ib-test/include/nccl.h
156+++ b/tools/ib-test/include/nccl.h
157@@ -31,7 +31,7 @@ extern "C" {
158 typedef struct ncclComm* ncclComm_t;
159
160 #define NCCL_UNIQUE_ID_BYTES 128
161-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
162+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
163
164 /* Error type */
165 typedef enum { ncclSuccess = 0,
166diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h
167index 729561b..4e4bdd9 100644
168--- a/tools/topo_expl/include/nccl.h
169+++ b/tools/topo_expl/include/nccl.h
170@@ -35,7 +35,7 @@ typedef struct ncclComm* ncclComm_t;
171 #define NCCL_COMM_NULL NULL
172
173 #define NCCL_UNIQUE_ID_BYTES 128
174-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
175+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
176
177 /*! @brief Error type */
178 typedef enum { ncclSuccess = 0,