Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/core: Introduce shared CQ pool API

Allow a ULP to ask the core to provide a completion queue based on a
least-used search on a per-device CQ pools. The device CQ pools grow in a
lazy fashion when more CQs are requested.

This feature reduces the amount of interrupts when using many QPs. Using
shared CQs allows for more efficient completion handling. It also reduces
the amount of overhead needed for CQ contexts.

Test setup:
Intel(R) Xeon(R) Platinum 8176M CPU @ 2.10GHz servers.
Running NVMeoF 4KB read IOs over ConnectX-5EX across Spectrum switch.
TX-depth = 32. The patch was applied in the nvme driver on both the target
and initiator. Four controllers are accessed from each core. In the
current test case we have exposed sixteen NVMe namespaces using four
different subsystems (four namespaces per subsystem) from one NVM port.
Each controller allocated X queues (RDMA QPs) and attached to Y CQs.
Before this series we had X == Y, i.e., for four controllers we've created
total of 4X QPs and 4X CQs. In the shared case, we've created 4X QPs and
only X CQs which means that we have four controllers that share a
completion queue per core. Until fourteen cores there is no significant
change in performance and the number of interrupts per second is less than
a million in the current case.
==================================================
|Cores|Current KIOPs |Shared KIOPs |improvement|
|-----|---------------|--------------|-----------|
|14 |2332 |2723 |16.7% |
|-----|---------------|--------------|-----------|
|20 |2086 |2712 |30% |
|-----|---------------|--------------|-----------|
|28 |1971 |2669 |35.4% |
|=================================================
|Cores|Current avg lat|Shared avg lat|improvement|
|-----|---------------|--------------|-----------|
|14 |767us |657us |14.3% |
|-----|---------------|--------------|-----------|
|20 |1225us |943us |23% |
|-----|---------------|--------------|-----------|
|28 |1816us |1341us |26.1% |
========================================================
|Cores|Current interrupts|Shared interrupts|improvement|
|-----|------------------|-----------------|-----------|
|14 |1.6M/sec |0.4M/sec |72% |
|-----|------------------|-----------------|-----------|
|20 |2.8M/sec |0.6M/sec |72.4% |
|-----|------------------|-----------------|-----------|
|28 |2.9M/sec |0.8M/sec |63.4% |
====================================================================
|Cores|Current 99.99th PCTL lat|Shared 99.99th PCTL lat|improvement|
|-----|------------------------|-----------------------|-----------|
|14 |67ms |6ms |90.9% |
|-----|------------------------|-----------------------|-----------|
|20 |5ms |6ms |-10% |
|-----|------------------------|-----------------------|-----------|
|28 |8.7ms |6ms |25.9% |
|===================================================================

Performance improvement with sixteen disks (sixteen CQs per core) is
comparable.

Link: https://lore.kernel.org/r/1590568495-101621-3-git-send-email-yaminf@mellanox.com
Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

authored by

Yamin Friedman and committed by
Jason Gunthorpe
c7ff819a 3446cbd2

+194 -1
+3
drivers/infiniband/core/core_priv.h
··· 414 414 struct vm_area_struct *vma, 415 415 struct rdma_user_mmap_entry *entry); 416 416 417 + void ib_cq_pool_init(struct ib_device *dev); 418 + void ib_cq_pool_destroy(struct ib_device *dev); 419 + 417 420 #endif /* _CORE_PRIV_H */
+173
drivers/infiniband/core/cq.c
··· 7 7 #include <linux/slab.h> 8 8 #include <rdma/ib_verbs.h> 9 9 10 + #include "core_priv.h" 11 + 10 12 #include <trace/events/rdma_core.h> 13 + /* Max size for shared CQ, may require tuning */ 14 + #define IB_MAX_SHARED_CQ_SZ 4096U 11 15 12 16 /* # of WCs to poll for with a single call to ib_poll_cq */ 13 17 #define IB_POLL_BATCH 16 ··· 222 218 cq->cq_context = private; 223 219 cq->poll_ctx = poll_ctx; 224 220 atomic_set(&cq->usecnt, 0); 221 + cq->comp_vector = comp_vector; 225 222 226 223 cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); 227 224 if (!cq->wc) ··· 314 309 { 315 310 if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) 316 311 return; 312 + if (WARN_ON_ONCE(cq->cqe_used)) 313 + return; 317 314 318 315 switch (cq->poll_ctx) { 319 316 case IB_POLL_DIRECT: ··· 341 334 kfree(cq); 342 335 } 343 336 EXPORT_SYMBOL(ib_free_cq_user); 337 + 338 + void ib_cq_pool_init(struct ib_device *dev) 339 + { 340 + unsigned int i; 341 + 342 + spin_lock_init(&dev->cq_pools_lock); 343 + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) 344 + INIT_LIST_HEAD(&dev->cq_pools[i]); 345 + } 346 + 347 + void ib_cq_pool_destroy(struct ib_device *dev) 348 + { 349 + struct ib_cq *cq, *n; 350 + unsigned int i; 351 + 352 + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { 353 + list_for_each_entry_safe(cq, n, &dev->cq_pools[i], 354 + pool_entry) { 355 + WARN_ON(cq->cqe_used); 356 + cq->shared = false; 357 + ib_free_cq(cq); 358 + } 359 + } 360 + } 361 + 362 + static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, 363 + enum ib_poll_context poll_ctx) 364 + { 365 + LIST_HEAD(tmp_list); 366 + unsigned int nr_cqs, i; 367 + struct ib_cq *cq; 368 + int ret; 369 + 370 + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { 371 + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); 372 + return -EINVAL; 373 + } 374 + 375 + /* 376 + * Allocate at least as many CQEs as requested, and otherwise 377 + * a reasonable batch size so that we can share CQs between 378 + * multiple users instead of 
allocating a larger number of CQs. 379 + */ 380 + nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, 381 + max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); 382 + nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); 383 + for (i = 0; i < nr_cqs; i++) { 384 + cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); 385 + if (IS_ERR(cq)) { 386 + ret = PTR_ERR(cq); 387 + goto out_free_cqs; 388 + } 389 + cq->shared = true; 390 + list_add_tail(&cq->pool_entry, &tmp_list); 391 + } 392 + 393 + spin_lock_irq(&dev->cq_pools_lock); 394 + list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); 395 + spin_unlock_irq(&dev->cq_pools_lock); 396 + 397 + return 0; 398 + 399 + out_free_cqs: 400 + list_for_each_entry(cq, &tmp_list, pool_entry) { 401 + cq->shared = false; 402 + ib_free_cq(cq); 403 + } 404 + return ret; 405 + } 406 + 407 + /** 408 + * ib_cq_pool_get() - Find the least used completion queue that matches 409 + * a given cpu hint (or least used for wild card affinity) and fits 410 + * nr_cqe. 411 + * @dev: rdma device 412 + * @nr_cqe: number of needed cqe entries 413 + * @comp_vector_hint: completion vector hint (-1) for the driver to assign 414 + * a comp vector based on internal counter 415 + * @poll_ctx: cq polling context 416 + * 417 + * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and 418 + * claim entries in it for us. In case there is no available cq, allocate 419 + * a new cq with the requirements and add it to the device pool. 420 + * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value 421 + * for @poll_ctx. 
422 + */ 423 + struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, 424 + int comp_vector_hint, 425 + enum ib_poll_context poll_ctx) 426 + { 427 + static unsigned int default_comp_vector; 428 + unsigned int vector, num_comp_vectors; 429 + struct ib_cq *cq, *found = NULL; 430 + int ret; 431 + 432 + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { 433 + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); 434 + return ERR_PTR(-EINVAL); 435 + } 436 + 437 + num_comp_vectors = 438 + min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); 439 + /* Project the affinty to the device completion vector range */ 440 + if (comp_vector_hint < 0) { 441 + comp_vector_hint = 442 + (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; 443 + WRITE_ONCE(default_comp_vector, comp_vector_hint); 444 + } 445 + vector = comp_vector_hint % num_comp_vectors; 446 + 447 + /* 448 + * Find the least used CQ with correct affinity and 449 + * enough free CQ entries 450 + */ 451 + while (!found) { 452 + spin_lock_irq(&dev->cq_pools_lock); 453 + list_for_each_entry(cq, &dev->cq_pools[poll_ctx], 454 + pool_entry) { 455 + /* 456 + * Check to see if we have found a CQ with the 457 + * correct completion vector 458 + */ 459 + if (vector != cq->comp_vector) 460 + continue; 461 + if (cq->cqe_used + nr_cqe > cq->cqe) 462 + continue; 463 + found = cq; 464 + break; 465 + } 466 + 467 + if (found) { 468 + found->cqe_used += nr_cqe; 469 + spin_unlock_irq(&dev->cq_pools_lock); 470 + 471 + return found; 472 + } 473 + spin_unlock_irq(&dev->cq_pools_lock); 474 + 475 + /* 476 + * Didn't find a match or ran out of CQs in the device 477 + * pool, allocate a new array of CQs. 478 + */ 479 + ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); 480 + if (ret) 481 + return ERR_PTR(ret); 482 + } 483 + 484 + return found; 485 + } 486 + EXPORT_SYMBOL(ib_cq_pool_get); 487 + 488 + /** 489 + * ib_cq_pool_put - Return a CQ taken from a shared pool. 490 + * @cq: The CQ to return. 
491 + * @nr_cqe: The max number of cqes that the user had requested. 492 + */ 493 + void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) 494 + { 495 + if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) 496 + return; 497 + 498 + spin_lock_irq(&cq->device->cq_pools_lock); 499 + cq->cqe_used -= nr_cqe; 500 + spin_unlock_irq(&cq->device->cq_pools_lock); 501 + } 502 + EXPORT_SYMBOL(ib_cq_pool_put);
+2
drivers/infiniband/core/device.c
··· 1393 1393 goto dev_cleanup; 1394 1394 } 1395 1395 1396 + ib_cq_pool_init(device); 1396 1397 ret = enable_device_and_get(device); 1397 1398 dev_set_uevent_suppress(&device->dev, false); 1398 1399 /* Mark for userspace that device is ready */ ··· 1448 1447 goto out; 1449 1448 1450 1449 disable_device(ib_dev); 1450 + ib_cq_pool_destroy(ib_dev); 1451 1451 1452 1452 /* Expedite removing unregistered pointers from the hash table */ 1453 1453 free_netdevs(ib_dev);
+16 -1
include/rdma/ib_verbs.h
··· 1588 1588 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); 1589 1589 1590 1590 enum ib_poll_context { 1591 - IB_POLL_DIRECT, /* caller context, no hw completions */ 1592 1591 IB_POLL_SOFTIRQ, /* poll from softirq context */ 1593 1592 IB_POLL_WORKQUEUE, /* poll from workqueue */ 1594 1593 IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ 1594 + IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, 1595 + 1596 + IB_POLL_DIRECT, /* caller context, no hw completions */ 1595 1597 }; 1596 1598 1597 1599 struct ib_cq { ··· 1603 1601 void (*event_handler)(struct ib_event *, void *); 1604 1602 void *cq_context; 1605 1603 int cqe; 1604 + unsigned int cqe_used; 1606 1605 atomic_t usecnt; /* count number of work queues */ 1607 1606 enum ib_poll_context poll_ctx; 1608 1607 struct ib_wc *wc; 1608 + struct list_head pool_entry; 1609 1609 union { 1610 1610 struct irq_poll iop; 1611 1611 struct work_struct work; ··· 1619 1615 ktime_t timestamp; 1620 1616 u8 interrupt:1; 1621 1617 u8 shared:1; 1618 + unsigned int comp_vector; 1622 1619 1623 1620 /* 1624 1621 * Implementation details of the RDMA core, don't use in drivers: ··· 2739 2734 #endif 2740 2735 2741 2736 u32 index; 2737 + 2738 + spinlock_t cq_pools_lock; 2739 + struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; 2740 + 2742 2741 struct rdma_restrack_root *res; 2743 2742 2744 2743 const struct uapi_definition *driver_def; ··· 4045 4036 { 4046 4037 return cq->device->ops.req_notify_cq(cq, flags); 4047 4038 } 4039 + 4040 + struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, 4041 + int comp_vector_hint, 4042 + enum ib_poll_context poll_ctx); 4043 + 4044 + void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); 4048 4045 4049 4046 /** 4050 4047 * ib_req_ncomp_notif - Request completion notification when there are