Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nfs/localio: add proper O_DIRECT support for READ and WRITE

Because the NFS client will already happily handle misaligned O_DIRECT
IO (by sending it out to NFSD via RPC) this commit's new capabilities
are for the benefit of LOCALIO.

LOCALIO will make a best effort to transform misaligned IO to
DIO-aligned extents when possible.

LOCALIO's READ and WRITE DIO that is misaligned will be split into as
many as 3 component IOs (@start, @middle and @end) as needed -- IFF
the @middle extent is verified to be DIO-aligned, and then the @start
and/or @end are misaligned (due to each being a partial page).
Otherwise, if the @middle isn't DIO-aligned, the code will fall back to
issuing only a single contiguous buffered IO.

The @middle is only DIO-aligned if both the memory and on-disk offsets
for the IO are aligned relative to the underlying local filesystem's
block device limits (@dma_alignment and @logical_block_size
respectively).

The misaligned @start and/or @end extents are issued using buffered IO
and the DIO-aligned @middle is issued using O_DIRECT. The @start and
@end IOs are issued first using buffered IO with IOCB_SYNC and then
the @middle is issued last using direct IO with async completion (AIO).
This out-of-order IO completion means that LOCALIO's IO completion
code (nfs_local_read_done and nfs_local_write_done) is only called for
the IO's last associated iov_iter completion. And in the case of a
DIO-aligned @middle, it completes last using AIO. nfs_local_pgio_done()
is updated to handle piece-wise partial completion of each iov_iter.

This implementation for LOCALIO's misaligned DIO handling uses 3
iov_iter that share the same backing pages in their bio_vecs (so
unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1).

[Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be
explored in the future. One logical progression to improve this code,
and eliminate explicit loops over up to 3 iov_iter, is by extending
'struct iov_iter' to support iov_iter_clone() and iov_iter_chain()
interfaces that are comparable to what 'struct bio' is able to support
in the block layer. But even that wouldn't avoid the need to
allocate/use up to 3 iov_iter]

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>

authored by

Mike Snitzer and committed by
Anna Schumaker
c817248f e43e9a3a

+201 -46
+201 -46
fs/nfs/localio.c
··· 30 30 31 31 #define NFSDBG_FACILITY NFSDBG_VFS 32 32 33 + #define NFSLOCAL_MAX_IOS 3 34 + 33 35 struct nfs_local_kiocb { 34 36 struct kiocb kiocb; 35 37 struct bio_vec *bvec; 36 38 struct nfs_pgio_header *hdr; 37 39 struct work_struct work; 38 40 void (*aio_complete_work)(struct work_struct *); 39 - struct iov_iter iter ____cacheline_aligned; 40 41 struct nfsd_file *localio; 42 + /* Begin mostly DIO-specific members */ 43 + size_t end_len; 44 + short int end_iter_index; 45 + short int n_iters; 46 + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; 47 + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; 48 + struct iov_iter iters[NFSLOCAL_MAX_IOS]; 49 + /* End mostly DIO-specific members */ 41 50 }; 42 51 43 52 struct nfs_local_fsync_ctx { ··· 300 291 { 301 292 struct nfs_local_kiocb *iocb; 302 293 303 - iocb = kmalloc(sizeof(*iocb), flags); 294 + iocb = kzalloc(sizeof(*iocb), flags); 304 295 if (iocb == NULL) 305 296 return NULL; 306 297 ··· 312 303 } 313 304 314 305 init_sync_kiocb(&iocb->kiocb, file); 315 - if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) 316 - iocb->kiocb.ki_flags = IOCB_DIRECT; 317 306 318 - iocb->kiocb.ki_pos = hdr->args.offset; 319 307 iocb->hdr = hdr; 320 308 iocb->kiocb.ki_flags &= ~IOCB_APPEND; 321 309 iocb->aio_complete_work = NULL; 322 310 311 + iocb->end_iter_index = -1; 312 + 323 313 return iocb; 324 314 } 325 315 316 + struct nfs_local_dio { 317 + u32 mem_align; 318 + u32 offset_align; 319 + loff_t middle_offset; 320 + loff_t end_offset; 321 + ssize_t start_len; /* Length for misaligned first extent */ 322 + ssize_t middle_len; /* Length for DIO-aligned middle extent */ 323 + ssize_t end_len; /* Length for misaligned last extent */ 324 + }; 325 + 326 + static bool 327 + nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, 328 + size_t len, struct nfs_local_dio *local_dio) 329 + { 330 + struct nfs_pgio_header *hdr = iocb->hdr; 331 + loff_t offset = hdr->args.offset; 332 + u32 nf_dio_mem_align, nf_dio_offset_align, 
nf_dio_read_offset_align; 333 + loff_t start_end, orig_end, middle_end; 334 + 335 + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, 336 + &nf_dio_offset_align, &nf_dio_read_offset_align); 337 + if (rw == ITER_DEST) 338 + nf_dio_offset_align = nf_dio_read_offset_align; 339 + 340 + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) 341 + return false; 342 + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) 343 + return false; 344 + if (unlikely(len < nf_dio_offset_align)) 345 + return false; 346 + 347 + local_dio->mem_align = nf_dio_mem_align; 348 + local_dio->offset_align = nf_dio_offset_align; 349 + 350 + start_end = round_up(offset, nf_dio_offset_align); 351 + orig_end = offset + len; 352 + middle_end = round_down(orig_end, nf_dio_offset_align); 353 + 354 + local_dio->middle_offset = start_end; 355 + local_dio->end_offset = middle_end; 356 + 357 + local_dio->start_len = start_end - offset; 358 + local_dio->middle_len = middle_end - start_end; 359 + local_dio->end_len = orig_end - middle_end; 360 + 361 + return true; 362 + } 363 + 326 364 static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, 327 - loff_t offset, unsigned int addr_mask, unsigned int len_mask) 365 + unsigned int addr_mask, unsigned int len_mask) 328 366 { 329 367 const struct bio_vec *bvec = i->bvec; 330 368 size_t skip = i->iov_offset; 331 369 size_t size = i->count; 332 370 333 - if ((offset | size) & len_mask) 371 + if (size & len_mask) 334 372 return false; 335 373 do { 336 374 size_t len = bvec->bv_len; ··· 394 338 return true; 395 339 } 396 340 397 - static void 398 - nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw) 341 + /* 342 + * Setup as many as 3 iov_iter based on extents described by @local_dio. 343 + * Returns the number of iov_iter that were setup. 
344 + */ 345 + static int 346 + nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, 347 + unsigned int nvecs, size_t len, 348 + struct nfs_local_dio *local_dio) 349 + { 350 + int n_iters = 0; 351 + struct iov_iter *iters = iocb->iters; 352 + 353 + /* Setup misaligned start? */ 354 + if (local_dio->start_len) { 355 + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); 356 + iters[n_iters].count = local_dio->start_len; 357 + iocb->offset[n_iters] = iocb->hdr->args.offset; 358 + iocb->iter_is_dio_aligned[n_iters] = false; 359 + ++n_iters; 360 + } 361 + 362 + /* Setup misaligned end? 363 + * If so, the end is purposely setup to be issued using buffered IO 364 + * before the middle (which will use DIO, if DIO-aligned, with AIO). 365 + * This creates problems if/when the end results in a partial write. 366 + * So must save index and length of end to handle this corner case. 367 + */ 368 + if (local_dio->end_len) { 369 + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); 370 + iocb->offset[n_iters] = local_dio->end_offset; 371 + iov_iter_advance(&iters[n_iters], 372 + local_dio->start_len + local_dio->middle_len); 373 + iocb->iter_is_dio_aligned[n_iters] = false; 374 + /* Save index and length of end */ 375 + iocb->end_iter_index = n_iters; 376 + iocb->end_len = local_dio->end_len; 377 + ++n_iters; 378 + } 379 + 380 + /* Setup DIO-aligned middle to be issued last, to allow for 381 + * DIO with AIO completion (see nfs_local_call_{read,write}). 
382 + */ 383 + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); 384 + if (local_dio->start_len) 385 + iov_iter_advance(&iters[n_iters], local_dio->start_len); 386 + iters[n_iters].count -= local_dio->end_len; 387 + iocb->offset[n_iters] = local_dio->middle_offset; 388 + 389 + iocb->iter_is_dio_aligned[n_iters] = 390 + nfs_iov_iter_aligned_bvec(&iters[n_iters], 391 + local_dio->mem_align-1, local_dio->offset_align-1); 392 + 393 + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) 394 + return 0; /* no DIO-aligned IO possible */ 395 + ++n_iters; 396 + 397 + iocb->n_iters = n_iters; 398 + return n_iters; 399 + } 400 + 401 + static noinline_for_stack void 402 + nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) 399 403 { 400 404 struct nfs_pgio_header *hdr = iocb->hdr; 401 405 struct page **pagevec = hdr->page_array.pagevec; ··· 476 360 } 477 361 len = hdr->args.count - total; 478 362 479 - iov_iter_bvec(i, rw, iocb->bvec, v, len); 363 + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { 364 + struct nfs_local_dio local_dio; 480 365 481 - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { 482 - u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; 483 - /* Verify the IO is DIO-aligned as required */ 484 - nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, 485 - &nf_dio_offset_align, 486 - &nf_dio_read_offset_align); 487 - if (rw == ITER_DEST) 488 - nf_dio_offset_align = nf_dio_read_offset_align; 489 - 490 - if (nf_dio_mem_align && nf_dio_offset_align && 491 - nfs_iov_iter_aligned_bvec(i, hdr->args.offset, 492 - nf_dio_mem_align - 1, 493 - nf_dio_offset_align - 1)) 366 + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && 367 + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) 494 368 return; /* is DIO-aligned */ 495 - 496 - /* Fallback to using buffered for this misaligned IO */ 497 - iocb->kiocb.ki_flags &= ~IOCB_DIRECT; 498 369 } 370 + 371 + /* Use buffered IO */ 372 + iocb->offset[0] = hdr->args.offset; 373 + 
iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); 374 + iocb->n_iters = 1; 499 375 } 500 376 501 377 static void ··· 510 402 static void 511 403 nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) 512 404 { 405 + /* Must handle partial completions */ 513 406 if (status >= 0) { 514 - hdr->res.count = status; 515 - hdr->res.op_status = NFS4_OK; 516 - hdr->task.tk_status = 0; 407 + hdr->res.count += status; 408 + /* @hdr was initialized to 0 (zeroed during allocation) */ 409 + if (hdr->task.tk_status == 0) 410 + hdr->res.op_status = NFS4_OK; 517 411 } else { 518 412 hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); 519 413 hdr->task.tk_status = status; ··· 561 451 pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); 562 452 } 563 453 564 - nfs_local_pgio_done(hdr, status); 565 - 566 454 /* 567 455 * Must clear replen otherwise NFSv3 data corruption will occur 568 456 * if/when switching from LOCALIO back to using normal RPC. ··· 588 480 struct nfs_local_kiocb *iocb = 589 481 container_of(kiocb, struct nfs_local_kiocb, kiocb); 590 482 483 + nfs_local_pgio_done(iocb->hdr, ret); 591 484 nfs_local_read_done(iocb, ret); 592 485 nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ 593 486 } ··· 603 494 604 495 save_cred = override_creds(filp->f_cred); 605 496 606 - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { 607 - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; 608 - iocb->aio_complete_work = nfs_local_read_aio_complete_work; 609 - } 497 + for (int i = 0; i < iocb->n_iters ; i++) { 498 + if (iocb->iter_is_dio_aligned[i]) { 499 + iocb->kiocb.ki_flags |= IOCB_DIRECT; 500 + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; 501 + iocb->aio_complete_work = nfs_local_read_aio_complete_work; 502 + } 610 503 611 - status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter); 504 + iocb->kiocb.ki_pos = iocb->offset[i]; 505 + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); 506 + if 
(status != -EIOCBQUEUED) { 507 + nfs_local_pgio_done(iocb->hdr, status); 508 + if (iocb->hdr->task.tk_status) 509 + break; 510 + } 511 + } 612 512 613 513 revert_creds(save_cred); 614 514 ··· 753 635 } 754 636 755 637 /* Handle short writes as if they are ENOSPC */ 638 + status = hdr->res.count; 756 639 if (status > 0 && status < hdr->args.count) { 757 640 hdr->mds_offset += status; 758 641 hdr->args.offset += status; ··· 761 642 hdr->args.count -= status; 762 643 nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); 763 644 status = -ENOSPC; 645 + /* record -ENOSPC in terms of nfs_local_pgio_done */ 646 + nfs_local_pgio_done(hdr, status); 764 647 } 765 - if (status < 0) 648 + if (hdr->task.tk_status < 0) 766 649 nfs_reset_boot_verifier(inode); 767 - 768 - nfs_local_pgio_done(hdr, status); 769 650 } 770 651 771 652 static void nfs_local_write_aio_complete_work(struct work_struct *work) ··· 782 663 struct nfs_local_kiocb *iocb = 783 664 container_of(kiocb, struct nfs_local_kiocb, kiocb); 784 665 666 + nfs_local_pgio_done(iocb->hdr, ret); 785 667 nfs_local_write_done(iocb, ret); 786 668 nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ 787 669 } ··· 799 679 current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; 800 680 save_cred = override_creds(filp->f_cred); 801 681 802 - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { 803 - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; 804 - iocb->aio_complete_work = nfs_local_write_aio_complete_work; 805 - } 806 - 807 682 file_start_write(filp); 808 - status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter); 683 + for (int i = 0; i < iocb->n_iters ; i++) { 684 + if (iocb->iter_is_dio_aligned[i]) { 685 + iocb->kiocb.ki_flags |= IOCB_DIRECT; 686 + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; 687 + iocb->aio_complete_work = nfs_local_write_aio_complete_work; 688 + } 689 + retry: 690 + iocb->kiocb.ki_pos = iocb->offset[i]; 691 + status = filp->f_op->write_iter(&iocb->kiocb, 
&iocb->iters[i]); 692 + if (status != -EIOCBQUEUED) { 693 + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { 694 + /* partial write */ 695 + if (i == iocb->end_iter_index) { 696 + /* Must not account partial end, otherwise, due 697 + * to end being issued before middle: the partial 698 + * write accounting in nfs_local_write_done() 699 + * would incorrectly advance hdr->args.offset 700 + */ 701 + status = 0; 702 + } else { 703 + /* Partial write at start or buffered middle, 704 + * exit early. 705 + */ 706 + nfs_local_pgio_done(iocb->hdr, status); 707 + break; 708 + } 709 + } else if (unlikely(status == -ENOTBLK && 710 + (iocb->kiocb.ki_flags & IOCB_DIRECT))) { 711 + /* VFS will return -ENOTBLK if DIO WRITE fails to 712 + * invalidate the page cache. Retry using buffered IO. 713 + */ 714 + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; 715 + iocb->kiocb.ki_complete = NULL; 716 + iocb->aio_complete_work = NULL; 717 + goto retry; 718 + } 719 + nfs_local_pgio_done(iocb->hdr, status); 720 + if (iocb->hdr->task.tk_status) 721 + break; 722 + } 723 + } 809 724 file_end_write(filp); 810 725 811 726 revert_creds(save_cred); ··· 909 754 iocb->hdr = hdr; 910 755 iocb->localio = localio; 911 756 912 - nfs_local_iter_init(&iocb->iter, iocb, rw); 757 + nfs_local_iters_init(iocb, rw); 913 758 914 759 return iocb; 915 760 }