Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nvmet-rdma: Correctly handle RDMA device hot removal

When configuring a device-attached listener, we may
see device removal events. In this case we return a
non-zero return code from the cm event handler, which
implicitly destroys the cm_id. It is possible that the
user will later remove this listener and thereby
trigger a second call to rdma_destroy_id on an
already destroyed cm_id -> BUG.

In addition, when a queue-bound (active session) cm_id
generates a DEVICE_REMOVAL event we must guarantee all
resources are cleaned up by the time we return from the
event handler.

Introduce nvmet_rdma_device_removal, which addresses
(or at least attempts to address) both scenarios.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>

+70 -17
+70 -17
drivers/nvme/target/rdma.c
··· 77 77 NVMET_RDMA_Q_CONNECTING, 78 78 NVMET_RDMA_Q_LIVE, 79 79 NVMET_RDMA_Q_DISCONNECTING, 80 + NVMET_RDMA_IN_DEVICE_REMOVAL, 80 81 }; 81 82 82 83 struct nvmet_rdma_queue { ··· 985 984 struct nvmet_rdma_device *dev = queue->dev; 986 985 987 986 nvmet_rdma_free_queue(queue); 988 - rdma_destroy_id(cm_id); 987 + 988 + if (queue->state != NVMET_RDMA_IN_DEVICE_REMOVAL) 989 + rdma_destroy_id(cm_id); 990 + 989 991 kref_put(&dev->ref, nvmet_rdma_free_dev); 990 992 } 991 993 ··· 1237 1233 switch (queue->state) { 1238 1234 case NVMET_RDMA_Q_CONNECTING: 1239 1235 case NVMET_RDMA_Q_LIVE: 1240 - disconnect = true; 1241 1236 queue->state = NVMET_RDMA_Q_DISCONNECTING; 1237 + case NVMET_RDMA_IN_DEVICE_REMOVAL: 1238 + disconnect = true; 1242 1239 break; 1243 1240 case NVMET_RDMA_Q_DISCONNECTING: 1244 1241 break; ··· 1277 1272 schedule_work(&queue->release_work); 1278 1273 } 1279 1274 1275 + /** 1276 + * nvme_rdma_device_removal() - Handle RDMA device removal 1277 + * @queue: nvmet rdma queue (cm id qp_context) 1278 + * @addr: nvmet address (cm_id context) 1279 + * 1280 + * DEVICE_REMOVAL event notifies us that the RDMA device is about 1281 + * to unplug so we should take care of destroying our RDMA resources. 1282 + * This event will be generated for each allocated cm_id. 1283 + * 1284 + * Note that this event can be generated on a normal queue cm_id 1285 + * and/or a device bound listener cm_id (where in this case 1286 + * queue will be null). 1287 + * 1288 + * we claim ownership on destroying the cm_id. For queues we move 1289 + * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port 1290 + * we nullify the priv to prevent double cm_id destruction and destroying 1291 + * the cm_id implicitely by returning a non-zero rc to the callout. 
1292 + */ 1293 + static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, 1294 + struct nvmet_rdma_queue *queue) 1295 + { 1296 + unsigned long flags; 1297 + 1298 + if (!queue) { 1299 + struct nvmet_port *port = cm_id->context; 1300 + 1301 + /* 1302 + * This is a listener cm_id. Make sure that 1303 + * future remove_port won't invoke a double 1304 + * cm_id destroy. use atomic xchg to make sure 1305 + * we don't compete with remove_port. 1306 + */ 1307 + if (xchg(&port->priv, NULL) != cm_id) 1308 + return 0; 1309 + } else { 1310 + /* 1311 + * This is a queue cm_id. Make sure that 1312 + * release queue will not destroy the cm_id 1313 + * and schedule all ctrl queues removal (only 1314 + * if the queue is not disconnecting already). 1315 + */ 1316 + spin_lock_irqsave(&queue->state_lock, flags); 1317 + if (queue->state != NVMET_RDMA_Q_DISCONNECTING) 1318 + queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; 1319 + spin_unlock_irqrestore(&queue->state_lock, flags); 1320 + nvmet_rdma_queue_disconnect(queue); 1321 + flush_scheduled_work(); 1322 + } 1323 + 1324 + /* 1325 + * We need to return 1 so that the core will destroy 1326 + * it's own ID. What a great API design.. 1327 + */ 1328 + return 1; 1329 + } 1330 + 1280 1331 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, 1281 1332 struct rdma_cm_event *event) 1282 1333 { ··· 1355 1294 break; 1356 1295 case RDMA_CM_EVENT_ADDR_CHANGE: 1357 1296 case RDMA_CM_EVENT_DISCONNECTED: 1358 - case RDMA_CM_EVENT_DEVICE_REMOVAL: 1359 1297 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1360 - /* 1361 - * We can get the device removal callback even for a 1362 - * CM ID that we aren't actually using. In that case 1363 - * the context pointer is NULL, so we shouldn't try 1364 - * to disconnect a non-existing queue. But we also 1365 - * need to return 1 so that the core will destroy 1366 - * it's own ID. What a great API design.. 
1367 - */ 1368 - if (queue) 1369 - nvmet_rdma_queue_disconnect(queue); 1370 - else 1371 - ret = 1; 1298 + nvmet_rdma_queue_disconnect(queue); 1299 + break; 1300 + case RDMA_CM_EVENT_DEVICE_REMOVAL: 1301 + ret = nvmet_rdma_device_removal(cm_id, queue); 1372 1302 break; 1373 1303 case RDMA_CM_EVENT_REJECTED: 1374 1304 case RDMA_CM_EVENT_UNREACHABLE: ··· 1448 1396 1449 1397 static void nvmet_rdma_remove_port(struct nvmet_port *port) 1450 1398 { 1451 - struct rdma_cm_id *cm_id = port->priv; 1399 + struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); 1452 1400 1453 - rdma_destroy_id(cm_id); 1401 + if (cm_id) 1402 + rdma_destroy_id(cm_id); 1454 1403 } 1455 1404 1456 1405 static struct nvmet_fabrics_ops nvmet_rdma_ops = {