Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support

Currently the driver doesn't support completion vectors. These
are used to indicate which sets of CQs should be grouped together
into the same vector. A vector is a CQ processing thread that
runs on a specific CPU.

If an application has several CQs bound to different completion
vectors, and each completion vector runs on different CPUs, then
the completion queue workload is balanced. This helps scale as more
nodes are used.

Implement CQ completion vector support using a global workqueue
where a CQ entry is queued to the CPU corresponding to the CQ's
completion vector. Since the workqueue is global, it's guaranteed
to always be there when queueing CQ entries; therefore, the RCU
locking for cq->rdi->worker in the hot path is superfluous.

Each completion vector is assigned to a different CPU. The number of
completion vectors available is computed by taking the number of
online, physical CPUs from the local NUMA node and subtracting the
CPUs used for kernel receive queues and the general interrupt.
Special use cases:

* If there are no CPUs left for completion vectors, the same CPU
for the general interrupt is used; therefore, there would only
be one completion vector available.

* For multi-HFI systems, the number of completion vectors available
for each device is the total number of completion vectors in
the local NUMA node divided by the number of devices in the same
NUMA node. If there's a division remainder, the first device to
get initialized gets an extra completion vector.

Upon a CQ creation, an invalid completion vector could be specified.
Handle it as follows:

* If the completion vector is less than 0, set it to 0.

* Set the completion vector to the result of the passed completion
vector taken modulo the number of device completion vectors
available.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

authored by

Sebastian Sanchez and committed by
Doug Ledford
5d18ee67 cf38ea10

+534 -101
+407 -7
drivers/infiniband/hw/hfi1/affinity.c
··· 1 1 /* 2 - * Copyright(c) 2015 - 2017 Intel Corporation. 2 + * Copyright(c) 2015 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 208 208 return 0; 209 209 } 210 210 211 - void node_affinity_destroy(void) 211 + static void node_affinity_destroy(struct hfi1_affinity_node *entry) 212 + { 213 + free_percpu(entry->comp_vect_affinity); 214 + kfree(entry); 215 + } 216 + 217 + void node_affinity_destroy_all(void) 212 218 { 213 219 struct list_head *pos, *q; 214 220 struct hfi1_affinity_node *entry; ··· 224 218 entry = list_entry(pos, struct hfi1_affinity_node, 225 219 list); 226 220 list_del(pos); 227 - kfree(entry); 221 + node_affinity_destroy(entry); 228 222 } 229 223 mutex_unlock(&node_affinity.lock); 230 224 kfree(hfi1_per_node_cntr); ··· 238 232 if (!entry) 239 233 return NULL; 240 234 entry->node = node; 235 + entry->comp_vect_affinity = alloc_percpu(u16); 241 236 INIT_LIST_HEAD(&entry->list); 242 237 243 238 return entry; ··· 268 261 return NULL; 269 262 } 270 263 264 + static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, 265 + u16 __percpu *comp_vect_affinity) 266 + { 267 + int curr_cpu; 268 + u16 cntr; 269 + u16 prev_cntr; 270 + int ret_cpu; 271 + 272 + if (!possible_cpumask) { 273 + ret_cpu = -EINVAL; 274 + goto fail; 275 + } 276 + 277 + if (!comp_vect_affinity) { 278 + ret_cpu = -EINVAL; 279 + goto fail; 280 + } 281 + 282 + ret_cpu = cpumask_first(possible_cpumask); 283 + if (ret_cpu >= nr_cpu_ids) { 284 + ret_cpu = -EINVAL; 285 + goto fail; 286 + } 287 + 288 + prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); 289 + for_each_cpu(curr_cpu, possible_cpumask) { 290 + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); 291 + 292 + if (cntr < prev_cntr) { 293 + ret_cpu = curr_cpu; 294 + prev_cntr = cntr; 295 + } 296 + } 297 + 298 + *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; 299 + 300 + fail: 301 + return ret_cpu; 
302 + } 303 + 304 + static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, 305 + u16 __percpu *comp_vect_affinity) 306 + { 307 + int curr_cpu; 308 + int max_cpu; 309 + u16 cntr; 310 + u16 prev_cntr; 311 + 312 + if (!possible_cpumask) 313 + return -EINVAL; 314 + 315 + if (!comp_vect_affinity) 316 + return -EINVAL; 317 + 318 + max_cpu = cpumask_first(possible_cpumask); 319 + if (max_cpu >= nr_cpu_ids) 320 + return -EINVAL; 321 + 322 + prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu); 323 + for_each_cpu(curr_cpu, possible_cpumask) { 324 + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); 325 + 326 + if (cntr > prev_cntr) { 327 + max_cpu = curr_cpu; 328 + prev_cntr = cntr; 329 + } 330 + } 331 + 332 + *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; 333 + 334 + return max_cpu; 335 + } 336 + 337 + /* 338 + * Non-interrupt CPUs are used first, then interrupt CPUs. 339 + * Two already allocated cpu masks must be passed. 340 + */ 341 + static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, 342 + struct hfi1_affinity_node *entry, 343 + cpumask_var_t non_intr_cpus, 344 + cpumask_var_t available_cpus) 345 + __must_hold(&node_affinity.lock) 346 + { 347 + int cpu; 348 + struct cpu_mask_set *set = dd->comp_vect; 349 + 350 + lockdep_assert_held(&node_affinity.lock); 351 + if (!non_intr_cpus) { 352 + cpu = -1; 353 + goto fail; 354 + } 355 + 356 + if (!available_cpus) { 357 + cpu = -1; 358 + goto fail; 359 + } 360 + 361 + /* Available CPUs for pinning completion vectors */ 362 + _cpu_mask_set_gen_inc(set); 363 + cpumask_andnot(available_cpus, &set->mask, &set->used); 364 + 365 + /* Available CPUs without SDMA engine interrupts */ 366 + cpumask_andnot(non_intr_cpus, available_cpus, 367 + &entry->def_intr.used); 368 + 369 + /* If there are non-interrupt CPUs available, use them first */ 370 + if (!cpumask_empty(non_intr_cpus)) 371 + cpu = cpumask_first(non_intr_cpus); 372 + else /* Otherwise, use interrupt CPUs */ 373 + cpu = cpumask_first(available_cpus); 374 + 
375 + if (cpu >= nr_cpu_ids) { /* empty */ 376 + cpu = -1; 377 + goto fail; 378 + } 379 + cpumask_set_cpu(cpu, &set->used); 380 + 381 + fail: 382 + return cpu; 383 + } 384 + 385 + static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) 386 + { 387 + struct cpu_mask_set *set = dd->comp_vect; 388 + 389 + if (cpu < 0) 390 + return; 391 + 392 + cpu_mask_set_put(set, cpu); 393 + } 394 + 395 + /* _dev_comp_vect_mappings_destroy() is reentrant */ 396 + static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) 397 + { 398 + int i, cpu; 399 + 400 + if (!dd->comp_vect_mappings) 401 + return; 402 + 403 + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 404 + cpu = dd->comp_vect_mappings[i]; 405 + _dev_comp_vect_cpu_put(dd, cpu); 406 + dd->comp_vect_mappings[i] = -1; 407 + hfi1_cdbg(AFFINITY, 408 + "[%s] Release CPU %d from completion vector %d", 409 + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); 410 + } 411 + 412 + kfree(dd->comp_vect_mappings); 413 + dd->comp_vect_mappings = NULL; 414 + } 415 + 416 + /* 417 + * This function creates the table for looking up CPUs for completion vectors. 418 + * num_comp_vectors needs to have been initilized before calling this function. 
419 + */ 420 + static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, 421 + struct hfi1_affinity_node *entry) 422 + __must_hold(&node_affinity.lock) 423 + { 424 + int i, cpu, ret; 425 + cpumask_var_t non_intr_cpus; 426 + cpumask_var_t available_cpus; 427 + 428 + lockdep_assert_held(&node_affinity.lock); 429 + 430 + if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) 431 + return -ENOMEM; 432 + 433 + if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { 434 + free_cpumask_var(non_intr_cpus); 435 + return -ENOMEM; 436 + } 437 + 438 + dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus, 439 + sizeof(*dd->comp_vect_mappings), 440 + GFP_KERNEL); 441 + if (!dd->comp_vect_mappings) { 442 + ret = -ENOMEM; 443 + goto fail; 444 + } 445 + for (i = 0; i < dd->comp_vect_possible_cpus; i++) 446 + dd->comp_vect_mappings[i] = -1; 447 + 448 + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 449 + cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, 450 + available_cpus); 451 + if (cpu < 0) { 452 + ret = -EINVAL; 453 + goto fail; 454 + } 455 + 456 + dd->comp_vect_mappings[i] = cpu; 457 + hfi1_cdbg(AFFINITY, 458 + "[%s] Completion Vector %d -> CPU %d", 459 + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); 460 + } 461 + 462 + return 0; 463 + 464 + fail: 465 + free_cpumask_var(available_cpus); 466 + free_cpumask_var(non_intr_cpus); 467 + _dev_comp_vect_mappings_destroy(dd); 468 + 469 + return ret; 470 + } 471 + 472 + int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) 473 + { 474 + int ret; 475 + struct hfi1_affinity_node *entry; 476 + 477 + mutex_lock(&node_affinity.lock); 478 + entry = node_affinity_lookup(dd->node); 479 + if (!entry) { 480 + ret = -EINVAL; 481 + goto unlock; 482 + } 483 + ret = _dev_comp_vect_mappings_create(dd, entry); 484 + unlock: 485 + mutex_unlock(&node_affinity.lock); 486 + 487 + return ret; 488 + } 489 + 490 + void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd) 491 + { 492 + _dev_comp_vect_mappings_destroy(dd); 493 + } 
494 + 495 + int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) 496 + { 497 + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); 498 + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); 499 + 500 + if (!dd->comp_vect_mappings) 501 + return -EINVAL; 502 + if (comp_vect >= dd->comp_vect_possible_cpus) 503 + return -EINVAL; 504 + 505 + return dd->comp_vect_mappings[comp_vect]; 506 + } 507 + 508 + /* 509 + * It assumes dd->comp_vect_possible_cpus is available. 510 + */ 511 + static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, 512 + struct hfi1_affinity_node *entry, 513 + bool first_dev_init) 514 + __must_hold(&node_affinity.lock) 515 + { 516 + int i, j, curr_cpu; 517 + int possible_cpus_comp_vect = 0; 518 + struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; 519 + 520 + lockdep_assert_held(&node_affinity.lock); 521 + /* 522 + * If there's only one CPU available for completion vectors, then 523 + * there will only be one completion vector available. Othewise, 524 + * the number of completion vector available will be the number of 525 + * available CPUs divide it by the number of devices in the 526 + * local NUMA node. 527 + */ 528 + if (cpumask_weight(&entry->comp_vect_mask) == 1) { 529 + possible_cpus_comp_vect = 1; 530 + dd_dev_warn(dd, 531 + "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); 532 + } else { 533 + possible_cpus_comp_vect += 534 + cpumask_weight(&entry->comp_vect_mask) / 535 + hfi1_per_node_cntr[dd->node]; 536 + 537 + /* 538 + * If the completion vector CPUs available doesn't divide 539 + * evenly among devices, then the first device device to be 540 + * initialized gets an extra CPU. 
541 + */ 542 + if (first_dev_init && 543 + cpumask_weight(&entry->comp_vect_mask) % 544 + hfi1_per_node_cntr[dd->node] != 0) 545 + possible_cpus_comp_vect++; 546 + } 547 + 548 + dd->comp_vect_possible_cpus = possible_cpus_comp_vect; 549 + 550 + /* Reserving CPUs for device completion vector */ 551 + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 552 + curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, 553 + entry->comp_vect_affinity); 554 + if (curr_cpu < 0) 555 + goto fail; 556 + 557 + cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); 558 + } 559 + 560 + hfi1_cdbg(AFFINITY, 561 + "[%s] Completion vector affinity CPU set(s) %*pbl", 562 + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), 563 + cpumask_pr_args(dev_comp_vect_mask)); 564 + 565 + return 0; 566 + 567 + fail: 568 + for (j = 0; j < i; j++) 569 + per_cpu_affinity_put_max(&entry->comp_vect_mask, 570 + entry->comp_vect_affinity); 571 + 572 + return curr_cpu; 573 + } 574 + 575 + /* 576 + * It assumes dd->comp_vect_possible_cpus is available. 577 + */ 578 + static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, 579 + struct hfi1_affinity_node *entry) 580 + __must_hold(&node_affinity.lock) 581 + { 582 + int i, cpu; 583 + 584 + lockdep_assert_held(&node_affinity.lock); 585 + if (!dd->comp_vect_possible_cpus) 586 + return; 587 + 588 + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 589 + cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, 590 + entry->comp_vect_affinity); 591 + /* Clearing CPU in device completion vector cpu mask */ 592 + if (cpu >= 0) 593 + cpumask_clear_cpu(cpu, &dd->comp_vect->mask); 594 + } 595 + 596 + dd->comp_vect_possible_cpus = 0; 597 + } 598 + 271 599 /* 272 600 * Interrupt affinity. 
273 601 * ··· 619 277 int node = pcibus_to_node(dd->pcidev->bus); 620 278 struct hfi1_affinity_node *entry; 621 279 const struct cpumask *local_mask; 622 - int curr_cpu, possible, i; 280 + int curr_cpu, possible, i, ret; 281 + bool new_entry = false; 623 282 624 283 if (node < 0) 625 284 node = numa_node_id(); ··· 642 299 if (!entry) { 643 300 dd_dev_err(dd, 644 301 "Unable to allocate global affinity node\n"); 645 - mutex_unlock(&node_affinity.lock); 646 - return -ENOMEM; 302 + ret = -ENOMEM; 303 + goto fail; 647 304 } 305 + new_entry = true; 306 + 648 307 init_cpu_mask_set(&entry->def_intr); 649 308 init_cpu_mask_set(&entry->rcv_intr); 309 + cpumask_clear(&entry->comp_vect_mask); 650 310 cpumask_clear(&entry->general_intr_mask); 651 311 /* Use the "real" cpu mask of this node as the default */ 652 312 cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, ··· 702 356 &entry->general_intr_mask); 703 357 } 704 358 705 - node_affinity_add_tail(entry); 359 + /* Determine completion vector CPUs for the entire node */ 360 + cpumask_and(&entry->comp_vect_mask, 361 + &node_affinity.real_cpu_mask, local_mask); 362 + cpumask_andnot(&entry->comp_vect_mask, 363 + &entry->comp_vect_mask, 364 + &entry->rcv_intr.mask); 365 + cpumask_andnot(&entry->comp_vect_mask, 366 + &entry->comp_vect_mask, 367 + &entry->general_intr_mask); 368 + 369 + /* 370 + * If there ends up being 0 CPU cores leftover for completion 371 + * vectors, use the same CPU core as the general/control 372 + * context. 
373 + */ 374 + if (cpumask_weight(&entry->comp_vect_mask) == 0) 375 + cpumask_copy(&entry->comp_vect_mask, 376 + &entry->general_intr_mask); 706 377 } 378 + 379 + ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); 380 + if (ret < 0) 381 + goto fail; 382 + 383 + if (new_entry) 384 + node_affinity_add_tail(entry); 385 + 707 386 mutex_unlock(&node_affinity.lock); 387 + 708 388 return 0; 389 + 390 + fail: 391 + if (new_entry) 392 + node_affinity_destroy(entry); 393 + mutex_unlock(&node_affinity.lock); 394 + return ret; 395 + } 396 + 397 + void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) 398 + { 399 + struct hfi1_affinity_node *entry; 400 + 401 + if (dd->node < 0) 402 + return; 403 + 404 + mutex_lock(&node_affinity.lock); 405 + entry = node_affinity_lookup(dd->node); 406 + if (!entry) 407 + goto unlock; 408 + 409 + /* 410 + * Free device completion vector CPUs to be used by future 411 + * completion vectors 412 + */ 413 + _dev_comp_vect_cpu_mask_clean_up(dd, entry); 414 + unlock: 415 + mutex_unlock(&node_affinity.lock); 416 + dd->node = -1; 709 417 } 710 418 711 419 /*
+8 -2
drivers/infiniband/hw/hfi1/affinity.h
··· 1 1 /* 2 - * Copyright(c) 2015 - 2017 Intel Corporation. 2 + * Copyright(c) 2015 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 98 98 99 99 struct hfi1_affinity_node { 100 100 int node; 101 + u16 __percpu *comp_vect_affinity; 101 102 struct cpu_mask_set def_intr; 102 103 struct cpu_mask_set rcv_intr; 103 104 struct cpumask general_intr_mask; 105 + struct cpumask comp_vect_mask; 104 106 struct list_head list; 105 107 }; 106 108 ··· 118 116 }; 119 117 120 118 int node_affinity_init(void); 121 - void node_affinity_destroy(void); 119 + void node_affinity_destroy_all(void); 122 120 extern struct hfi1_affinity_node_list node_affinity; 121 + void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd); 122 + int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect); 123 + int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd); 124 + void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd); 123 125 124 126 #endif /* _HFI1_AFFINITY_H */
+5
drivers/infiniband/hw/hfi1/chip.c
··· 15233 15233 if (ret) 15234 15234 goto bail_cleanup; 15235 15235 15236 + ret = hfi1_comp_vectors_set_up(dd); 15237 + if (ret) 15238 + goto bail_clear_intr; 15239 + 15236 15240 /* set up LCB access - must be after set_up_interrupts() */ 15237 15241 init_lcb_access(dd); 15238 15242 ··· 15279 15275 bail_free_cntrs: 15280 15276 free_cntrs(dd); 15281 15277 bail_clear_intr: 15278 + hfi1_comp_vectors_clean_up(dd); 15282 15279 hfi1_clean_up_interrupts(dd); 15283 15280 bail_cleanup: 15284 15281 hfi1_pcie_ddcleanup(dd);
+3
drivers/infiniband/hw/hfi1/hfi.h
··· 1263 1263 1264 1264 /* Save the enabled LCB error bits */ 1265 1265 u64 lcb_err_en; 1266 + struct cpu_mask_set *comp_vect; 1267 + int *comp_vect_mappings; 1268 + u32 comp_vect_possible_cpus; 1266 1269 1267 1270 /* 1268 1271 * Capability to have different send engines simply by changing a
+13 -2
drivers/infiniband/hw/hfi1/init.c
··· 1 1 /* 2 - * Copyright(c) 2015-2017 Intel Corporation. 2 + * Copyright(c) 2015 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 1244 1244 dd->rcv_limit = NULL; 1245 1245 dd->send_schedule = NULL; 1246 1246 dd->tx_opstats = NULL; 1247 + kfree(dd->comp_vect); 1248 + dd->comp_vect = NULL; 1247 1249 sdma_clean(dd, dd->num_sdma); 1248 1250 rvt_dealloc_device(&dd->verbs_dev.rdi); 1249 1251 } ··· 1302 1300 dd->unit = ret; 1303 1301 list_add(&dd->list, &hfi1_dev_list); 1304 1302 } 1303 + dd->node = -1; 1305 1304 1306 1305 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1307 1306 idr_preload_end(); ··· 1351 1348 1352 1349 dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); 1353 1350 if (!dd->tx_opstats) { 1351 + ret = -ENOMEM; 1352 + goto bail; 1353 + } 1354 + 1355 + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); 1356 + if (!dd->comp_vect) { 1354 1357 ret = -ENOMEM; 1355 1358 goto bail; 1356 1359 } ··· 1530 1521 static void __exit hfi1_mod_cleanup(void) 1531 1522 { 1532 1523 pci_unregister_driver(&hfi1_pci_driver); 1533 - node_affinity_destroy(); 1524 + node_affinity_destroy_all(); 1534 1525 hfi1_wss_exit(); 1535 1526 hfi1_dbg_exit(); 1536 1527 ··· 1614 1605 static void postinit_cleanup(struct hfi1_devdata *dd) 1615 1606 { 1616 1607 hfi1_start_cleanup(dd); 1608 + hfi1_comp_vectors_clean_up(dd); 1609 + hfi1_dev_affinity_clean_up(dd); 1617 1610 1618 1611 hfi1_pcie_ddcleanup(dd); 1619 1612 hfi1_pcie_cleanup(dd->pcidev);
+2 -1
drivers/infiniband/hw/hfi1/trace.c
··· 1 1 /* 2 - * Copyright(c) 2015 - 2017 Intel Corporation. 2 + * Copyright(c) 2015 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 374 374 return ret; 375 375 } 376 376 377 + __hfi1_trace_fn(AFFINITY); 377 378 __hfi1_trace_fn(PKT); 378 379 __hfi1_trace_fn(PROC); 379 380 __hfi1_trace_fn(SDMA);
+2 -1
drivers/infiniband/hw/hfi1/trace_dbg.h
··· 1 1 /* 2 - * Copyright(c) 2015, 2016 Intel Corporation. 2 + * Copyright(c) 2015 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 113 113 * hfi1_cdbg(LVL, fmt, ...); as well as take care of all 114 114 * the debugfs stuff. 115 115 */ 116 + __hfi1_trace_def(AFFINITY); 116 117 __hfi1_trace_def(PKT); 117 118 __hfi1_trace_def(PROC); 118 119 __hfi1_trace_def(SDMA);
+4 -3
drivers/infiniband/hw/hfi1/verbs.c
··· 64 64 #include "debugfs.h" 65 65 #include "vnic.h" 66 66 #include "fault.h" 67 + #include "affinity.h" 67 68 68 69 static unsigned int hfi1_lkey_table_size = 16; 69 70 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, ··· 1935 1934 dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; 1936 1935 dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; 1937 1936 dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; 1937 + dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = 1938 + hfi1_comp_vect_mappings_lookup; 1938 1939 1939 1940 /* completeion queue */ 1940 - snprintf(dd->verbs_dev.rdi.dparms.cq_name, 1941 - sizeof(dd->verbs_dev.rdi.dparms.cq_name), 1942 - "hfi1_cq%d", dd->unit); 1941 + dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus; 1943 1942 dd->verbs_dev.rdi.dparms.node = dd->node; 1944 1943 1945 1944 /* misc settings */
+1 -5
drivers/infiniband/hw/qib/qib_verbs.c
··· 1 1 /* 2 - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. 2 + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. 3 3 * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. 4 4 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 5 5 * ··· 1630 1630 dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; 1631 1631 dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; 1632 1632 dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; 1633 - 1634 - snprintf(dd->verbs_dev.rdi.dparms.cq_name, 1635 - sizeof(dd->verbs_dev.rdi.dparms.cq_name), 1636 - "qib_cq%d", dd->unit); 1637 1633 1638 1634 qib_fill_device_attr(dd); 1639 1635
+32 -49
drivers/infiniband/sw/rdmavt/cq.c
··· 1 1 /* 2 - * Copyright(c) 2016 Intel Corporation. 2 + * Copyright(c) 2016 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 47 47 48 48 #include <linux/slab.h> 49 49 #include <linux/vmalloc.h> 50 - #include <linux/kthread.h> 51 50 #include "cq.h" 52 51 #include "vt.h" 53 52 #include "trace.h" 53 + 54 + static struct workqueue_struct *comp_vector_wq; 54 55 55 56 /** 56 57 * rvt_cq_enter - add a new entry to the completion queue ··· 121 120 if (cq->notify == IB_CQ_NEXT_COMP || 122 121 (cq->notify == IB_CQ_SOLICITED && 123 122 (solicited || entry->status != IB_WC_SUCCESS))) { 124 - struct kthread_worker *worker; 125 - 126 123 /* 127 124 * This will cause send_complete() to be called in 128 125 * another thread. 129 126 */ 130 - rcu_read_lock(); 131 - worker = rcu_dereference(cq->rdi->worker); 132 - if (likely(worker)) { 133 - cq->notify = RVT_CQ_NONE; 134 - cq->triggered++; 135 - kthread_queue_work(worker, &cq->comptask); 136 - } 137 - rcu_read_unlock(); 127 + cq->notify = RVT_CQ_NONE; 128 + cq->triggered++; 129 + queue_work_on(cq->comp_vector_cpu, comp_vector_wq, 130 + &cq->comptask); 138 131 } 139 132 140 133 spin_unlock_irqrestore(&cq->lock, flags); 141 134 } 142 135 EXPORT_SYMBOL(rvt_cq_enter); 143 136 144 - static void send_complete(struct kthread_work *work) 137 + static void send_complete(struct work_struct *work) 145 138 { 146 139 struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); 147 140 ··· 187 192 struct ib_cq *ret; 188 193 u32 sz; 189 194 unsigned int entries = attr->cqe; 195 + int comp_vector = attr->comp_vector; 190 196 191 197 if (attr->flags) 192 198 return ERR_PTR(-EINVAL); 193 199 194 200 if (entries < 1 || entries > rdi->dparms.props.max_cqe) 195 201 return ERR_PTR(-EINVAL); 202 + 203 + if (comp_vector < 0) 204 + comp_vector = 0; 205 + 206 + comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; 196 207 
197 208 /* Allocate the completion queue structure. */ 198 209 cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); ··· 268 267 * an error. 269 268 */ 270 269 cq->rdi = rdi; 270 + if (rdi->driver_f.comp_vect_cpu_lookup) 271 + cq->comp_vector_cpu = 272 + rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector); 273 + else 274 + cq->comp_vector_cpu = 275 + cpumask_first(cpumask_of_node(rdi->dparms.node)); 276 + 271 277 cq->ibcq.cqe = entries; 272 278 cq->notify = RVT_CQ_NONE; 273 279 spin_lock_init(&cq->lock); 274 - kthread_init_work(&cq->comptask, send_complete); 280 + INIT_WORK(&cq->comptask, send_complete); 275 281 cq->queue = wc; 276 282 277 283 ret = &cq->ibcq; 278 284 285 + trace_rvt_create_cq(cq, attr); 279 286 goto done; 280 287 281 288 bail_ip: ··· 309 300 struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); 310 301 struct rvt_dev_info *rdi = cq->rdi; 311 302 312 - kthread_flush_work(&cq->comptask); 303 + flush_work(&cq->comptask); 313 304 spin_lock_irq(&rdi->n_cqs_lock); 314 305 rdi->n_cqs_allocated--; 315 306 spin_unlock_irq(&rdi->n_cqs_lock); ··· 519 510 * 520 511 * Return: 0 on success 521 512 */ 522 - int rvt_driver_cq_init(struct rvt_dev_info *rdi) 513 + int rvt_driver_cq_init(void) 523 514 { 524 - int cpu; 525 - struct kthread_worker *worker; 515 + comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, 516 + 0, "rdmavt_cq"); 517 + if (!comp_vector_wq) 518 + return -ENOMEM; 526 519 527 - if (rcu_access_pointer(rdi->worker)) 528 - return 0; 529 - 530 - spin_lock_init(&rdi->n_cqs_lock); 531 - 532 - cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); 533 - worker = kthread_create_worker_on_cpu(cpu, 0, 534 - "%s", rdi->dparms.cq_name); 535 - if (IS_ERR(worker)) 536 - return PTR_ERR(worker); 537 - 538 - set_user_nice(worker->task, MIN_NICE); 539 - RCU_INIT_POINTER(rdi->worker, worker); 540 520 return 0; 541 521 } 542 522 ··· 533 535 * rvt_cq_exit - tear down cq reources 534 536 * @rdi: rvt dev structure 535 537 */ 536 - void rvt_cq_exit(struct 
rvt_dev_info *rdi) 538 + void rvt_cq_exit(void) 537 539 { 538 - struct kthread_worker *worker; 539 - 540 - if (!rcu_access_pointer(rdi->worker)) 541 - return; 542 - 543 - spin_lock(&rdi->n_cqs_lock); 544 - worker = rcu_dereference_protected(rdi->worker, 545 - lockdep_is_held(&rdi->n_cqs_lock)); 546 - if (!worker) { 547 - spin_unlock(&rdi->n_cqs_lock); 548 - return; 549 - } 550 - RCU_INIT_POINTER(rdi->worker, NULL); 551 - spin_unlock(&rdi->n_cqs_lock); 552 - synchronize_rcu(); 553 - 554 - kthread_destroy_worker(worker); 540 + destroy_workqueue(comp_vector_wq); 541 + comp_vector_wq = NULL; 555 542 }
+3 -3
drivers/infiniband/sw/rdmavt/cq.h
··· 2 2 #define DEF_RVTCQ_H 3 3 4 4 /* 5 - * Copyright(c) 2016 Intel Corporation. 5 + * Copyright(c) 2016 - 2018 Intel Corporation. 6 6 * 7 7 * This file is provided under a dual BSD/GPLv2 license. When using or 8 8 * redistributing this file, you may do so under either license. ··· 59 59 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); 60 60 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); 61 61 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); 62 - int rvt_driver_cq_init(struct rvt_dev_info *rdi); 63 - void rvt_cq_exit(struct rvt_dev_info *rdi); 62 + int rvt_driver_cq_init(void); 63 + void rvt_cq_exit(void); 64 64 #endif /* DEF_RVTCQ_H */
+34 -1
drivers/infiniband/sw/rdmavt/trace_cq.h
··· 1 1 /* 2 - * Copyright(c) 2016 Intel Corporation. 2 + * Copyright(c) 2016 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 70 70 wc_opcode_name(MASKED_COMP_SWAP), \ 71 71 wc_opcode_name(RECV), \ 72 72 wc_opcode_name(RECV_RDMA_WITH_IMM)) 73 + 74 + #define CQ_ATTR_PRINT \ 75 + "[%s] user cq %s cqe %u comp_vector %d comp_vector_cpu %d flags %x" 76 + 77 + DECLARE_EVENT_CLASS(rvt_cq_template, 78 + TP_PROTO(struct rvt_cq *cq, 79 + const struct ib_cq_init_attr *attr), 80 + TP_ARGS(cq, attr), 81 + TP_STRUCT__entry(RDI_DEV_ENTRY(cq->rdi) 82 + __field(struct rvt_mmap_info *, ip) 83 + __field(unsigned int, cqe) 84 + __field(int, comp_vector) 85 + __field(int, comp_vector_cpu) 86 + __field(u32, flags) 87 + ), 88 + TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi) 89 + __entry->ip = cq->ip; 90 + __entry->cqe = attr->cqe; 91 + __entry->comp_vector = attr->comp_vector; 92 + __entry->comp_vector_cpu = 93 + cq->comp_vector_cpu; 94 + __entry->flags = attr->flags; 95 + ), 96 + TP_printk(CQ_ATTR_PRINT, __get_str(dev), 97 + __entry->ip ? "true" : "false", __entry->cqe, 98 + __entry->comp_vector, __entry->comp_vector_cpu, 99 + __entry->flags 100 + ) 101 + ); 102 + 103 + DEFINE_EVENT(rvt_cq_template, rvt_create_cq, 104 + TP_PROTO(struct rvt_cq *cq, const struct ib_cq_init_attr *attr), 105 + TP_ARGS(cq, attr)); 73 106 74 107 #define CQ_PRN \ 75 108 "[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x"
+13 -22
drivers/infiniband/sw/rdmavt/vt.c
··· 1 1 /* 2 - * Copyright(c) 2016 Intel Corporation. 2 + * Copyright(c) 2016 - 2018 Intel Corporation. 3 3 * 4 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 5 * redistributing this file, you may do so under either license. ··· 49 49 #include <linux/kernel.h> 50 50 #include <linux/dma-mapping.h> 51 51 #include "vt.h" 52 + #include "cq.h" 52 53 #include "trace.h" 53 54 54 55 #define RVT_UVERBS_ABI_VERSION 2 ··· 59 58 60 59 static int rvt_init(void) 61 60 { 62 - /* 63 - * rdmavt does not need to do anything special when it starts up. All it 64 - * needs to do is sit and wait until a driver attempts registration. 65 - */ 66 - return 0; 61 + int ret = rvt_driver_cq_init(); 62 + 63 + if (ret) 64 + pr_err("Error in driver CQ init.\n"); 65 + 66 + return ret; 67 67 } 68 68 module_init(rvt_init); 69 69 70 70 static void rvt_cleanup(void) 71 71 { 72 - /* 73 - * Nothing to do at exit time either. The module won't be able to be 74 - * removed until all drivers are gone which means all the dev structs 75 - * are gone so there is really nothing to do. 76 - */ 72 + rvt_cq_exit(); 77 73 } 78 74 module_exit(rvt_cleanup); 79 75 ··· 775 777 } 776 778 777 779 /* Completion queues */ 778 - ret = rvt_driver_cq_init(rdi); 779 - if (ret) { 780 - pr_err("Error in driver CQ init.\n"); 781 - goto bail_mr; 782 - } 780 + spin_lock_init(&rdi->n_cqs_lock); 783 781 784 782 /* DMA Operations */ 785 783 rdi->ibdev.dev.dma_ops = rdi->ibdev.dev.dma_ops ? 
: &dma_virt_ops; ··· 823 829 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | 824 830 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); 825 831 rdi->ibdev.node_type = RDMA_NODE_IB_CA; 826 - rdi->ibdev.num_comp_vectors = 1; 832 + if (!rdi->ibdev.num_comp_vectors) 833 + rdi->ibdev.num_comp_vectors = 1; 827 834 828 835 rdi->ibdev.driver_id = driver_id; 829 836 /* We are now good to announce we exist */ 830 837 ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); 831 838 if (ret) { 832 839 rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); 833 - goto bail_cq; 840 + goto bail_mr; 834 841 } 835 842 836 843 rvt_create_mad_agents(rdi); 837 844 838 845 rvt_pr_info(rdi, "Registration with rdmavt done.\n"); 839 846 return ret; 840 - 841 - bail_cq: 842 - rvt_cq_exit(rdi); 843 847 844 848 bail_mr: 845 849 rvt_mr_exit(rdi); ··· 862 870 rvt_free_mad_agents(rdi); 863 871 864 872 ib_unregister_device(&rdi->ibdev); 865 - rvt_cq_exit(rdi); 866 873 rvt_mr_exit(rdi); 867 874 rvt_qp_exit(rdi); 868 875 }
+4 -3
include/rdma/rdma_vt.h
··· 2 2 #define DEF_RDMA_VT_H 3 3 4 4 /* 5 - * Copyright(c) 2016 Intel Corporation. 5 + * Copyright(c) 2016 - 2018 Intel Corporation. 6 6 * 7 7 * This file is provided under a dual BSD/GPLv2 license. When using or 8 8 * redistributing this file, you may do so under either license. ··· 167 167 int qpn_res_end; 168 168 int nports; 169 169 int npkeys; 170 - char cq_name[RVT_CQN_MAX]; 171 170 int node; 172 171 int psn_mask; 173 172 int psn_shift; ··· 346 347 347 348 /* Notify driver to restart rc */ 348 349 void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait); 350 + 351 + /* Get and return CPU to pin CQ processing thread */ 352 + int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); 349 353 }; 350 354 351 355 struct rvt_dev_info { ··· 404 402 spinlock_t pending_lock; /* protect pending mmap list */ 405 403 406 404 /* CQ */ 407 - struct kthread_worker __rcu *worker; /* per device cq worker */ 408 405 u32 n_cqs_allocated; /* number of CQs allocated for device */ 409 406 spinlock_t n_cqs_lock; /* protect count of in use cqs */ 410 407
+3 -2
include/rdma/rdmavt_cq.h
··· 8 8 * 9 9 * GPL LICENSE SUMMARY 10 10 * 11 - * Copyright(c) 2016 Intel Corporation. 11 + * Copyright(c) 2016 - 2018 Intel Corporation. 12 12 * 13 13 * This program is free software; you can redistribute it and/or modify 14 14 * it under the terms of version 2 of the GNU General Public License as ··· 80 80 */ 81 81 struct rvt_cq { 82 82 struct ib_cq ibcq; 83 - struct kthread_work comptask; 83 + struct work_struct comptask; 84 84 spinlock_t lock; /* protect changes in this struct */ 85 85 u8 notify; 86 86 u8 triggered; 87 + int comp_vector_cpu; 87 88 struct rvt_dev_info *rdi; 88 89 struct rvt_cq_wc *queue; 89 90 struct rvt_mmap_info *ip;