Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

staging/hfi1: Add TID entry program function body

The previous patch in the series added the free/invalidate
function bodies. Now, it's time for the programming side.

This large function takes the user's buffer, breaks it up
into manageable chunks, allocates enough RcvArray groups
and programs the chunks into the RcvArray entries in the
hardware.

With this function, the TID caching functionality is implemented.
However, it is still unused. The switch will come in a later
patch in the series, which will remove the old functionality and
switch the driver over to TID caching.

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

Authored by Mitko Haralanov and committed by Doug Ledford
7e7a436e 455d7f1a

+259 -4
+259 -4
drivers/staging/rdma/hfi1/user_exp_rcv.c
··· 97 97 98 98 static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, 99 99 struct rb_root *); 100 - static u32 find_phys_blocks(struct page **, unsigned, 101 - struct tid_pageset *) __maybe_unused; 100 + static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); 102 101 static int set_rcvarray_entry(struct file *, unsigned long, u32, 103 102 struct tid_group *, struct page **, unsigned); 104 103 static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, ··· 118 119 unsigned long, unsigned long); 119 120 static int program_rcvarray(struct file *, unsigned long, struct tid_group *, 120 121 struct tid_pageset *, unsigned, u16, struct page **, 121 - u32 *, unsigned *, unsigned *) __maybe_unused; 122 + u32 *, unsigned *, unsigned *); 122 123 static int unprogram_rcvarray(struct file *, u32, struct tid_group **); 123 124 static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *); 124 125 ··· 338 339 writeq(0, dd->rcvarray_wc + (index * 8)); 339 340 } 340 341 342 + /* 343 + * RcvArray entry allocation for Expected Receives is done by the 344 + * following algorithm: 345 + * 346 + * The context keeps 3 lists of groups of RcvArray entries: 347 + * 1. List of empty groups - tid_group_list 348 + * This list is created during user context creation and 349 + * contains elements which describe sets (of 8) of empty 350 + * RcvArray entries. 351 + * 2. List of partially used groups - tid_used_list 352 + * This list contains sets of RcvArray entries which are 353 + * not completely used up. Another mapping request could 354 + * use some or all of the remaining entries. 355 + * 3. List of full groups - tid_full_list 356 + * This is the list where sets that are completely used 357 + * up go. 358 + * 359 + * An attempt to optimize the usage of RcvArray entries is 360 + * made by finding all sets of physically contiguous pages in a 361 + * user's buffer. 
362 + * These physically contiguous sets are further split into 363 + * sizes supported by the receive engine of the HFI. The 364 + * resulting sets of pages are stored in struct tid_pageset, 365 + * which describes the sets as: 366 + * * .count - number of pages in this set 367 + * * .idx - starting index into struct page ** array 368 + * of this set 369 + * 370 + * From this point on, the algorithm deals with the page sets 371 + * described above. The number of pagesets is divided by the 372 + * RcvArray group size to produce the number of full groups 373 + * needed. 374 + * 375 + * Groups from the 3 lists are manipulated using the following 376 + * rules: 377 + * 1. For each set of 8 pagesets, a complete group from 378 + * tid_group_list is taken, programmed, and moved to 379 + * the tid_full_list list. 380 + * 2. For all remaining pagesets: 381 + * 2.1 If the tid_used_list is empty and the tid_group_list 382 + * is empty, stop processing pagesets and return only 383 + * what has been programmed up to this point. 384 + * 2.2 If the tid_used_list is empty and the tid_group_list 385 + * is not empty, move a group from tid_group_list to 386 + * tid_used_list. 387 + * 2.3 For each group in tid_used_list, program as much as 388 + * can fit into the group. If the group becomes fully 389 + * used, move it to tid_full_list. When at least one entry was programmed, tinfo->tidcnt and tinfo->length are overwritten to describe what was actually programmed and the TID list is copied out to the user address in tinfo->tidlist. The return value is 0 when the internal result is zero or positive, otherwise the negative error held in ret (for example -EINVAL, -EFAULT or -ENOMEM). 
390 + */ 341 391 int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) 342 392 { 343 - return -EINVAL; 393 + int ret = 0, need_group = 0, pinned; 394 + struct hfi1_filedata *fd = fp->private_data; 395 + struct hfi1_ctxtdata *uctxt = fd->uctxt; 396 + struct hfi1_devdata *dd = uctxt->dd; 397 + unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, 398 + tididx = 0, mapped, mapped_pages = 0; 399 + unsigned long vaddr = tinfo->vaddr; 400 + struct page **pages = NULL; 401 + u32 *tidlist = NULL; 402 + struct tid_pageset *pagesets = NULL; 403 + 404 + /* Get the number of pages the user buffer spans */ 405 + npages = num_user_pages(vaddr, tinfo->length); 406 + if (!npages) 407 + return -EINVAL; 408 + 409 + if (npages > uctxt->expected_count) { 410 + dd_dev_err(dd, "Expected buffer too big\n"); 411 + return -EINVAL; 412 + } 413 + 414 + /* Verify that access is OK for the user buffer */ 415 + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 416 + npages * PAGE_SIZE)) { 417 + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", 418 + (void *)vaddr, npages); 419 + return -EFAULT; 420 + } 421 + 422 + pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), 423 + GFP_KERNEL); 424 + if (!pagesets) 425 + return -ENOMEM; 426 + 427 + /* Allocate the array of struct page pointers needed for pinning */ 428 + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 429 + if (!pages) { 430 + ret = -ENOMEM; 431 + goto bail; 432 + } 433 + 434 + /* 435 + * Pin all the pages of the user buffer. If we can't pin all the 436 + * pages, accept the amount pinned so far and program only that. 437 + * User space knows how to deal with partially programmed buffers. 
438 + */ 439 + pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); 440 + if (pinned <= 0) { 441 + ret = pinned; 442 + goto bail; 443 + } 444 + 445 + /* Find sets of physically contiguous pages */ 446 + npagesets = find_phys_blocks(pages, pinned, pagesets); 447 + 448 + /* 449 + * We don't need to access this under a lock since tid_used is per 450 + * process and the same process cannot be in hfi1_user_exp_rcv_clear() 451 + * and hfi1_user_exp_rcv_setup() at the same time. NOTE(review): the code below nevertheless takes fd->tid_lock around this read, so the no-lock statement above looks stale - confirm. 452 + */ 453 + spin_lock(&fd->tid_lock); 454 + if (fd->tid_used + npagesets > fd->tid_limit) 455 + pageset_count = fd->tid_limit - fd->tid_used; 456 + else 457 + pageset_count = npagesets; 458 + spin_unlock(&fd->tid_lock); 459 + 460 + if (!pageset_count) /* NOTE(review): pages pinned above are not released on this path (bail only frees the arrays) - looks like a pin leak, confirm */ 461 + goto bail; 462 + 463 + ngroups = pageset_count / dd->rcv_entries.group_size; 464 + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); 465 + if (!tidlist) { 466 + ret = -ENOMEM; 467 + goto nomem; 468 + } 469 + 470 + tididx = 0; 471 + 472 + /* 473 + * From this point on, we are going to be using shared (between master 474 + * and subcontexts) context resources. We need to take the lock. 475 + */ 476 + mutex_lock(&uctxt->exp_lock); 477 + /* 478 + * The first step is to program the RcvArray entries which are complete 479 + * groups. 480 + */ 481 + while (ngroups && uctxt->tid_group_list.count) { 482 + struct tid_group *grp = 483 + tid_group_pop(&uctxt->tid_group_list); 484 + 485 + ret = program_rcvarray(fp, vaddr, grp, pagesets, 486 + pageidx, dd->rcv_entries.group_size, 487 + pages, tidlist, &tididx, &mapped); 488 + /* 489 + * If there was a failure to program the RcvArray 490 + * entries for the entire group, reset the grp fields 491 + * and add the grp back to the free group list. 
492 + */ 493 + if (ret <= 0) { 494 + tid_group_add_tail(grp, &uctxt->tid_group_list); 495 + hfi1_cdbg(TID, 496 + "Failed to program RcvArray group %d", ret); 497 + goto unlock; 498 + } 499 + 500 + tid_group_add_tail(grp, &uctxt->tid_full_list); 501 + ngroups--; 502 + pageidx += ret; 503 + mapped_pages += mapped; 504 + } 505 + 506 + while (pageidx < pageset_count) { 507 + struct tid_group *grp, *ptr; 508 + /* 509 + * If we don't have any partially used tid groups, check 510 + * if we have empty groups. If so, take one from there and 511 + * put in the partially used list. 512 + */ 513 + if (!uctxt->tid_used_list.count || need_group) { 514 + if (!uctxt->tid_group_list.count) 515 + goto unlock; 516 + 517 + grp = tid_group_pop(&uctxt->tid_group_list); 518 + tid_group_add_tail(grp, &uctxt->tid_used_list); 519 + need_group = 0; 520 + } 521 + /* 522 + * There is an optimization opportunity here - instead of 523 + * fitting as many page sets as we can, check for a group 524 + * later on in the list that could fit all of them. 
525 + */ 526 + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, 527 + list) { 528 + unsigned use = min_t(unsigned, pageset_count - pageidx, 529 + grp->size - grp->used); 530 + 531 + ret = program_rcvarray(fp, vaddr, grp, pagesets, 532 + pageidx, use, pages, tidlist, 533 + &tididx, &mapped); 534 + if (ret < 0) { 535 + hfi1_cdbg(TID, 536 + "Failed to program RcvArray entries %d", 537 + ret); 538 + ret = -EFAULT; 539 + goto unlock; 540 + } else if (ret > 0) { 541 + if (grp->used == grp->size) 542 + tid_group_move(grp, 543 + &uctxt->tid_used_list, 544 + &uctxt->tid_full_list); 545 + pageidx += ret; 546 + mapped_pages += mapped; 547 + need_group = 0; 548 + /* Check if we are done so we break out early */ 549 + if (pageidx >= pageset_count) 550 + break; 551 + } else if (WARN_ON(ret == 0)) { 552 + /* 553 + * If ret is 0, we did not program any entries 554 + * into this group, which can only happen if 555 + * we've screwed up the accounting somewhere. 556 + * Warn and try to continue. 557 + */ 558 + need_group = 1; 559 + } 560 + } 561 + } 562 + unlock: 563 + mutex_unlock(&uctxt->exp_lock); 564 + nomem: 565 + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, 566 + mapped_pages, ret); 567 + if (tididx) { 568 + spin_lock(&fd->tid_lock); 569 + fd->tid_used += tididx; 570 + spin_unlock(&fd->tid_lock); 571 + tinfo->tidcnt = tididx; 572 + tinfo->length = mapped_pages * PAGE_SIZE; 573 + 574 + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, 575 + tidlist, sizeof(tidlist[0]) * tididx)) { 576 + /* 577 + * On failure to copy to the user level, we need to undo 578 + * everything done so far so we don't leak resources. NOTE(review): the assignment below stores the address of the local tidlist pointer, not the array it points to - confirm this is what hfi1_user_exp_rcv_clear() expects. The goto bail that follows also skips the release of pinned-but-unmapped pages further down - confirm. 
579 + */ 580 + tinfo->tidlist = (unsigned long)&tidlist; 581 + hfi1_user_exp_rcv_clear(fp, tinfo); 582 + tinfo->tidlist = 0; 583 + ret = -EFAULT; 584 + goto bail; 585 + } 586 + } 587 + 588 + /* 589 + * If not everything was mapped (due to insufficient RcvArray entries, 590 + * for example), unpin all unmapped pages so we can pin them next time. 591 + */ 592 + if (mapped_pages != pinned) 593 + hfi1_release_user_pages(&pages[mapped_pages], 594 + pinned - mapped_pages, 595 + false); 596 + bail: 597 + kfree(pagesets); 598 + kfree(pages); 599 + kfree(tidlist); 600 + return ret > 0 ? 0 : ret; 344 601 } 345 602 346 603 int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)