Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.39 1101 lines 30 kB view raw
1/* 2 * pNFS functions to call and manage layout drivers. 3 * 4 * Copyright (c) 2002 [year of first publication] 5 * The Regents of the University of Michigan 6 * All Rights Reserved 7 * 8 * Dean Hildebrand <dhildebz@umich.edu> 9 * 10 * Permission is granted to use, copy, create derivative works, and 11 * redistribute this software and such derivative works for any purpose, 12 * so long as the name of the University of Michigan is not used in 13 * any advertising or publicity pertaining to the use or distribution 14 * of this software without specific, written prior authorization. If 15 * the above copyright notice or any other identification of the 16 * University of Michigan is included in any copy of any portion of 17 * this software, then the disclaimer below must also be included. 18 * 19 * This software is provided as is, without representation or warranty 20 * of any kind either express or implied, including without limitation 21 * the implied warranties of merchantability, fitness for a particular 22 * purpose, or noninfringement. The Regents of the University of 23 * Michigan shall not be liable for any damages, including special, 24 * indirect, incidental, or consequential damages, with respect to any 25 * claim arising out of or in connection with the use of the software, 26 * even if it has been or is hereafter advised of the possibility of 27 * such damages. 28 */ 29 30#include <linux/nfs_fs.h> 31#include "internal.h" 32#include "pnfs.h" 33#include "iostat.h" 34 35#define NFSDBG_FACILITY NFSDBG_PNFS 36 37/* Locking: 38 * 39 * pnfs_spinlock: 40 * protects pnfs_modules_tbl. 41 */ 42static DEFINE_SPINLOCK(pnfs_spinlock); 43 44/* 45 * pnfs_modules_tbl holds all pnfs modules 46 */ 47static LIST_HEAD(pnfs_modules_tbl); 48 49/* Return the registered pnfs layout driver module matching given id */ 50static struct pnfs_layoutdriver_type * 51find_pnfs_driver_locked(u32 id) 52{ 53 struct pnfs_layoutdriver_type *local; 54 55 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) 56 if (local->id == id) 57 goto out; 58 local = NULL; 59out: 60 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); 61 return local; 62} 63 64static struct pnfs_layoutdriver_type * 65find_pnfs_driver(u32 id) 66{ 67 struct pnfs_layoutdriver_type *local; 68 69 spin_lock(&pnfs_spinlock); 70 local = find_pnfs_driver_locked(id); 71 spin_unlock(&pnfs_spinlock); 72 return local; 73} 74 75void 76unset_pnfs_layoutdriver(struct nfs_server *nfss) 77{ 78 if (nfss->pnfs_curr_ld) 79 module_put(nfss->pnfs_curr_ld->owner); 80 nfss->pnfs_curr_ld = NULL; 81} 82 83/* 84 * Try to set the server's pnfs module to the pnfs layout type specified by id. 85 * Currently only one pNFS layout driver per filesystem is supported. 86 * 87 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 88 */ 89void 90set_pnfs_layoutdriver(struct nfs_server *server, u32 id) 91{ 92 struct pnfs_layoutdriver_type *ld_type = NULL; 93 94 if (id == 0) 95 goto out_no_driver; 96 if (!(server->nfs_client->cl_exchange_flags & 97 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 98 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, 99 id, server->nfs_client->cl_exchange_flags); 100 goto out_no_driver; 101 } 102 ld_type = find_pnfs_driver(id); 103 if (!ld_type) { 104 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); 105 ld_type = find_pnfs_driver(id); 106 if (!ld_type) { 107 dprintk("%s: No pNFS module found for %u.\n", 108 __func__, id); 109 goto out_no_driver; 110 } 111 } 112 if (!try_module_get(ld_type->owner)) { 113 dprintk("%s: Could not grab reference on module\n", __func__); 114 goto out_no_driver; 115 } 116 server->pnfs_curr_ld = ld_type; 117 118 dprintk("%s: pNFS module for %u set\n", __func__, id); 119 return; 120 121out_no_driver: 122 dprintk("%s: Using NFSv4 I/O\n", __func__); 123 server->pnfs_curr_ld = NULL; 124} 125 126int 127pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 128{ 129 int status = -EINVAL; 130 struct pnfs_layoutdriver_type *tmp; 131 132 if (ld_type->id == 0) { 133 printk(KERN_ERR "%s id 0 is reserved\n", __func__); 134 return status; 135 } 136 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 137 printk(KERN_ERR "%s Layout driver must provide " 138 "alloc_lseg and free_lseg.\n", __func__); 139 return status; 140 } 141 142 spin_lock(&pnfs_spinlock); 143 tmp = find_pnfs_driver_locked(ld_type->id); 144 if (!tmp) { 145 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); 146 status = 0; 147 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 148 ld_type->name); 149 } else { 150 printk(KERN_ERR "%s Module with id %d already loaded!\n", 151 __func__, ld_type->id); 152 } 153 spin_unlock(&pnfs_spinlock); 154 155 return status; 156} 157EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); 158 159void 160pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 161{ 162 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); 163 spin_lock(&pnfs_spinlock); 164 list_del(&ld_type->pnfs_tblid); 165 spin_unlock(&pnfs_spinlock); 166} 167EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); 168 169/* 170 * pNFS client layout cache 171 */ 172 173/* Need to hold i_lock if caller does not already hold reference */ 174void 175get_layout_hdr(struct pnfs_layout_hdr *lo) 176{ 177 atomic_inc(&lo->plh_refcount); 178} 179 180static void 181destroy_layout_hdr(struct pnfs_layout_hdr *lo) 182{ 183 dprintk("%s: freeing layout cache %p\n", __func__, lo); 184 BUG_ON(!list_empty(&lo->plh_layouts)); 185 NFS_I(lo->plh_inode)->layout = NULL; 186 kfree(lo); 187} 188 189static void 190put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 191{ 192 if (atomic_dec_and_test(&lo->plh_refcount)) 193 destroy_layout_hdr(lo); 194} 195 196void 197put_layout_hdr(struct pnfs_layout_hdr *lo) 198{ 199 struct inode *inode = lo->plh_inode; 200 201 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 202 destroy_layout_hdr(lo); 203 spin_unlock(&inode->i_lock); 204 } 205} 206 207static void 208init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 209{ 210 INIT_LIST_HEAD(&lseg->pls_list); 211 atomic_set(&lseg->pls_refcount, 1); 212 smp_mb(); 213 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 214 lseg->pls_layout = lo; 215} 216 217static void free_lseg(struct pnfs_layout_segment *lseg) 218{ 219 struct inode *ino = lseg->pls_layout->plh_inode; 220 221 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 222 /* Matched by get_layout_hdr in pnfs_insert_layout */ 223 put_layout_hdr(NFS_I(ino)->layout); 224} 225 226static void 227put_lseg_common(struct pnfs_layout_segment *lseg) 228{ 229 struct inode *inode = lseg->pls_layout->plh_inode; 230 231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 232 list_del_init(&lseg->pls_list); 233 if (list_empty(&lseg->pls_layout->plh_segs)) { 234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); 235 /* Matched by initial refcount set in alloc_init_layout_hdr */ 236 put_layout_hdr_locked(lseg->pls_layout); 237 } 238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 239} 240 241void 242put_lseg(struct pnfs_layout_segment *lseg) 243{ 244 struct inode *inode; 245 246 if (!lseg) 247 return; 248 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 250 atomic_read(&lseg->pls_refcount), 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 252 inode = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 254 LIST_HEAD(free_me); 255 256 put_lseg_common(lseg); 257 list_add(&lseg->pls_list, &free_me); 258 spin_unlock(&inode->i_lock); 259 pnfs_free_lseg_list(&free_me); 260 } 261} 262EXPORT_SYMBOL_GPL(put_lseg); 263 264static bool 265should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 266{ 267 return (recall_iomode == IOMODE_ANY || 268 lseg_iomode == recall_iomode); 269} 270 271/* Returns 1 if lseg is removed from list, 0 otherwise */ 272static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, 273 struct list_head *tmp_list) 274{ 275 int rv = 0; 276 277 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { 278 /* Remove the reference keeping the lseg in the 279 * list. It will now be removed when all 280 * outstanding io is finished. 281 */ 282 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 283 atomic_read(&lseg->pls_refcount)); 284 if (atomic_dec_and_test(&lseg->pls_refcount)) { 285 put_lseg_common(lseg); 286 list_add(&lseg->pls_list, tmp_list); 287 rv = 1; 288 } 289 } 290 return rv; 291} 292 293/* Returns count of number of matching invalid lsegs remaining in list 294 * after call. 295 */ 296int 297mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 298 struct list_head *tmp_list, 299 u32 iomode) 300{ 301 struct pnfs_layout_segment *lseg, *next; 302 int invalid = 0, removed = 0; 303 304 dprintk("%s:Begin lo %p\n", __func__, lo); 305 306 if (list_empty(&lo->plh_segs)) { 307 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) 308 put_layout_hdr_locked(lo); 309 return 0; 310 } 311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 313 dprintk("%s: freeing lseg %p iomode %d " 314 "offset %llu length %llu\n", __func__, 315 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 316 lseg->pls_range.length); 317 invalid++; 318 removed += mark_lseg_invalid(lseg, tmp_list); 319 } 320 dprintk("%s:Return %i\n", __func__, invalid - removed); 321 return invalid - removed; 322} 323 324/* note free_me must contain lsegs from a single layout_hdr */ 325void 326pnfs_free_lseg_list(struct list_head *free_me) 327{ 328 struct pnfs_layout_segment *lseg, *tmp; 329 struct pnfs_layout_hdr *lo; 330 331 if (list_empty(free_me)) 332 return; 333 334 lo = list_first_entry(free_me, struct pnfs_layout_segment, 335 pls_list)->pls_layout; 336 337 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { 338 struct nfs_client *clp; 339 340 clp = NFS_SERVER(lo->plh_inode)->nfs_client; 341 spin_lock(&clp->cl_lock); 342 list_del_init(&lo->plh_layouts); 343 spin_unlock(&clp->cl_lock); 344 } 345 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 346 list_del(&lseg->pls_list); 347 free_lseg(lseg); 348 } 349} 350 351void 352pnfs_destroy_layout(struct nfs_inode *nfsi) 353{ 354 struct pnfs_layout_hdr *lo; 355 LIST_HEAD(tmp_list); 356 357 spin_lock(&nfsi->vfs_inode.i_lock); 358 lo = nfsi->layout; 359 if (lo) { 360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 362 } 363 spin_unlock(&nfsi->vfs_inode.i_lock); 364 pnfs_free_lseg_list(&tmp_list); 365} 366 367/* 368 * Called by the state manger to remove all layouts established under an 369 * expired lease. 370 */ 371void 372pnfs_destroy_all_layouts(struct nfs_client *clp) 373{ 374 struct pnfs_layout_hdr *lo; 375 LIST_HEAD(tmp_list); 376 377 spin_lock(&clp->cl_lock); 378 list_splice_init(&clp->cl_layouts, &tmp_list); 379 spin_unlock(&clp->cl_lock); 380 381 while (!list_empty(&tmp_list)) { 382 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, 383 plh_layouts); 384 dprintk("%s freeing layout for inode %lu\n", __func__, 385 lo->plh_inode->i_ino); 386 list_del_init(&lo->plh_layouts); 387 pnfs_destroy_layout(NFS_I(lo->plh_inode)); 388 } 389} 390 391/* update lo->plh_stateid with new if is more recent */ 392void 393pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 394 bool update_barrier) 395{ 396 u32 oldseq, newseq; 397 398 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); 399 newseq = be32_to_cpu(new->stateid.seqid); 400 if ((int)(newseq - oldseq) > 0) { 401 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); 402 if (update_barrier) { 403 u32 new_barrier = be32_to_cpu(new->stateid.seqid); 404 405 if ((int)(new_barrier - lo->plh_barrier)) 406 lo->plh_barrier = new_barrier; 407 } else { 408 /* Because of wraparound, we want to keep the barrier 409 * "close" to the current seqids. It needs to be 410 * within 2**31 to count as "behind", so if it 411 * gets too near that limit, give us a litle leeway 412 * and bring it to within 2**30. 413 * NOTE - and yes, this is all unsigned arithmetic. 414 */ 415 if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) 416 lo->plh_barrier = newseq - (1 << 30); 417 } 418 } 419} 420 421/* lget is set to 1 if called from inside send_layoutget call chain */ 422static bool 423pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, 424 int lget) 425{ 426 if ((stateid) && 427 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 428 return true; 429 return lo->plh_block_lgets || 430 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || 431 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 432 (list_empty(&lo->plh_segs) && 433 (atomic_read(&lo->plh_outstanding) > lget)); 434} 435 436int 437pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 438 struct nfs4_state *open_state) 439{ 440 int status = 0; 441 442 dprintk("--> %s\n", __func__); 443 spin_lock(&lo->plh_inode->i_lock); 444 if (pnfs_layoutgets_blocked(lo, NULL, 1)) { 445 status = -EAGAIN; 446 } else if (list_empty(&lo->plh_segs)) { 447 int seq; 448 449 do { 450 seq = read_seqbegin(&open_state->seqlock); 451 memcpy(dst->data, open_state->stateid.data, 452 sizeof(open_state->stateid.data)); 453 } while (read_seqretry(&open_state->seqlock, seq)); 454 } else 455 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); 456 spin_unlock(&lo->plh_inode->i_lock); 457 dprintk("<-- %s\n", __func__); 458 return status; 459} 460 461/* 462* Get layout from server. 463* for now, assume that whole file layouts are requested. 464* arg->offset: 0 465* arg->length: all ones 466*/ 467static struct pnfs_layout_segment * 468send_layoutget(struct pnfs_layout_hdr *lo, 469 struct nfs_open_context *ctx, 470 u32 iomode, 471 gfp_t gfp_flags) 472{ 473 struct inode *ino = lo->plh_inode; 474 struct nfs_server *server = NFS_SERVER(ino); 475 struct nfs4_layoutget *lgp; 476 struct pnfs_layout_segment *lseg = NULL; 477 struct page **pages = NULL; 478 int i; 479 u32 max_resp_sz, max_pages; 480 481 dprintk("--> %s\n", __func__); 482 483 BUG_ON(ctx == NULL); 484 lgp = kzalloc(sizeof(*lgp), gfp_flags); 485 if (lgp == NULL) 486 return NULL; 487 488 /* allocate pages for xdr post processing */ 489 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 490 max_pages = max_resp_sz >> PAGE_SHIFT; 491 492 pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); 493 if (!pages) 494 goto out_err_free; 495 496 for (i = 0; i < max_pages; i++) { 497 pages[i] = alloc_page(gfp_flags); 498 if (!pages[i]) 499 goto out_err_free; 500 } 501 502 lgp->args.minlength = NFS4_MAX_UINT64; 503 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 504 lgp->args.range.iomode = iomode; 505 lgp->args.range.offset = 0; 506 lgp->args.range.length = NFS4_MAX_UINT64; 507 lgp->args.type = server->pnfs_curr_ld->id; 508 lgp->args.inode = ino; 509 lgp->args.ctx = get_nfs_open_context(ctx); 510 lgp->args.layout.pages = pages; 511 lgp->args.layout.pglen = max_pages * PAGE_SIZE; 512 lgp->lsegpp = &lseg; 513 lgp->gfp_flags = gfp_flags; 514 515 /* Synchronously retrieve layout information from server and 516 * store in lseg. 517 */ 518 nfs4_proc_layoutget(lgp); 519 if (!lseg) { 520 /* remember that LAYOUTGET failed and suspend trying */ 521 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 522 } 523 524 /* free xdr pages */ 525 for (i = 0; i < max_pages; i++) 526 __free_page(pages[i]); 527 kfree(pages); 528 529 return lseg; 530 531out_err_free: 532 /* free any allocated xdr pages, lgp as it's not used */ 533 if (pages) { 534 for (i = 0; i < max_pages; i++) { 535 if (!pages[i]) 536 break; 537 __free_page(pages[i]); 538 } 539 kfree(pages); 540 } 541 kfree(lgp); 542 return NULL; 543} 544 545bool pnfs_roc(struct inode *ino) 546{ 547 struct pnfs_layout_hdr *lo; 548 struct pnfs_layout_segment *lseg, *tmp; 549 LIST_HEAD(tmp_list); 550 bool found = false; 551 552 spin_lock(&ino->i_lock); 553 lo = NFS_I(ino)->layout; 554 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 555 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 556 goto out_nolayout; 557 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 558 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 559 mark_lseg_invalid(lseg, &tmp_list); 560 found = true; 561 } 562 if (!found) 563 goto out_nolayout; 564 lo->plh_block_lgets++; 565 get_layout_hdr(lo); /* matched in pnfs_roc_release */ 566 spin_unlock(&ino->i_lock); 567 pnfs_free_lseg_list(&tmp_list); 568 return true; 569 570out_nolayout: 571 spin_unlock(&ino->i_lock); 572 return false; 573} 574 575void pnfs_roc_release(struct inode *ino) 576{ 577 struct pnfs_layout_hdr *lo; 578 579 spin_lock(&ino->i_lock); 580 lo = NFS_I(ino)->layout; 581 lo->plh_block_lgets--; 582 put_layout_hdr_locked(lo); 583 spin_unlock(&ino->i_lock); 584} 585 586void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) 587{ 588 struct pnfs_layout_hdr *lo; 589 590 spin_lock(&ino->i_lock); 591 lo = NFS_I(ino)->layout; 592 if ((int)(barrier - lo->plh_barrier) > 0) 593 lo->plh_barrier = barrier; 594 spin_unlock(&ino->i_lock); 595} 596 597bool pnfs_roc_drain(struct inode *ino, u32 *barrier) 598{ 599 struct nfs_inode *nfsi = NFS_I(ino); 600 struct pnfs_layout_segment *lseg; 601 bool found = false; 602 603 spin_lock(&ino->i_lock); 604 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 605 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 606 found = true; 607 break; 608 } 609 if (!found) { 610 struct pnfs_layout_hdr *lo = nfsi->layout; 611 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); 612 613 /* Since close does not return a layout stateid for use as 614 * a barrier, we choose the worst-case barrier. 615 */ 616 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 617 } 618 spin_unlock(&ino->i_lock); 619 return found; 620} 621 622/* 623 * Compare two layout segments for sorting into layout cache. 624 * We want to preferentially return RW over RO layouts, so ensure those 625 * are seen first. 626 */ 627static s64 628cmp_layout(u32 iomode1, u32 iomode2) 629{ 630 /* read > read/write */ 631 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); 632} 633 634static void 635pnfs_insert_layout(struct pnfs_layout_hdr *lo, 636 struct pnfs_layout_segment *lseg) 637{ 638 struct pnfs_layout_segment *lp; 639 int found = 0; 640 641 dprintk("%s:Begin\n", __func__); 642 643 assert_spin_locked(&lo->plh_inode->i_lock); 644 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 645 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) 646 continue; 647 list_add_tail(&lseg->pls_list, &lp->pls_list); 648 dprintk("%s: inserted lseg %p " 649 "iomode %d offset %llu length %llu before " 650 "lp %p iomode %d offset %llu length %llu\n", 651 __func__, lseg, lseg->pls_range.iomode, 652 lseg->pls_range.offset, lseg->pls_range.length, 653 lp, lp->pls_range.iomode, lp->pls_range.offset, 654 lp->pls_range.length); 655 found = 1; 656 break; 657 } 658 if (!found) { 659 list_add_tail(&lseg->pls_list, &lo->plh_segs); 660 dprintk("%s: inserted lseg %p " 661 "iomode %d offset %llu length %llu at tail\n", 662 __func__, lseg, lseg->pls_range.iomode, 663 lseg->pls_range.offset, lseg->pls_range.length); 664 } 665 get_layout_hdr(lo); 666 667 dprintk("%s:Return\n", __func__); 668} 669 670static struct pnfs_layout_hdr * 671alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) 672{ 673 struct pnfs_layout_hdr *lo; 674 675 lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); 676 if (!lo) 677 return NULL; 678 atomic_set(&lo->plh_refcount, 1); 679 INIT_LIST_HEAD(&lo->plh_layouts); 680 INIT_LIST_HEAD(&lo->plh_segs); 681 INIT_LIST_HEAD(&lo->plh_bulk_recall); 682 lo->plh_inode = ino; 683 return lo; 684} 685 686static struct pnfs_layout_hdr * 687pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) 688{ 689 struct nfs_inode *nfsi = NFS_I(ino); 690 struct pnfs_layout_hdr *new = NULL; 691 692 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 693 694 assert_spin_locked(&ino->i_lock); 695 if (nfsi->layout) { 696 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) 697 return NULL; 698 else 699 return nfsi->layout; 700 } 701 spin_unlock(&ino->i_lock); 702 new = alloc_init_layout_hdr(ino, gfp_flags); 703 spin_lock(&ino->i_lock); 704 705 if (likely(nfsi->layout == NULL)) /* Won the race? */ 706 nfsi->layout = new; 707 else 708 kfree(new); 709 return nfsi->layout; 710} 711 712/* 713 * iomode matching rules: 714 * iomode lseg match 715 * ----- ----- ----- 716 * ANY READ true 717 * ANY RW true 718 * RW READ false 719 * RW RW true 720 * READ READ true 721 * READ RW true 722 */ 723static int 724is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 725{ 726 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); 727} 728 729/* 730 * lookup range in layout 731 */ 732static struct pnfs_layout_segment * 733pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) 734{ 735 struct pnfs_layout_segment *lseg, *ret = NULL; 736 737 dprintk("%s:Begin\n", __func__); 738 739 assert_spin_locked(&lo->plh_inode->i_lock); 740 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 741 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 742 is_matching_lseg(lseg, iomode)) { 743 ret = get_lseg(lseg); 744 break; 745 } 746 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 747 break; 748 } 749 750 dprintk("%s:Return lseg %p ref %d\n", 751 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); 752 return ret; 753} 754 755/* 756 * Layout segment is retreived from the server if not cached. 757 * The appropriate layout segment is referenced and returned to the caller. 758 */ 759struct pnfs_layout_segment * 760pnfs_update_layout(struct inode *ino, 761 struct nfs_open_context *ctx, 762 enum pnfs_iomode iomode, 763 gfp_t gfp_flags) 764{ 765 struct nfs_inode *nfsi = NFS_I(ino); 766 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 767 struct pnfs_layout_hdr *lo; 768 struct pnfs_layout_segment *lseg = NULL; 769 bool first = false; 770 771 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 772 return NULL; 773 spin_lock(&ino->i_lock); 774 lo = pnfs_find_alloc_layout(ino, gfp_flags); 775 if (lo == NULL) { 776 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 777 goto out_unlock; 778 } 779 780 /* Do we even need to bother with this? */ 781 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || 782 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 783 dprintk("%s matches recall, use MDS\n", __func__); 784 goto out_unlock; 785 } 786 787 /* if LAYOUTGET already failed once we don't try again */ 788 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 789 goto out_unlock; 790 791 /* Check to see if the layout for the given range already exists */ 792 lseg = pnfs_find_lseg(lo, iomode); 793 if (lseg) 794 goto out_unlock; 795 796 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 797 goto out_unlock; 798 atomic_inc(&lo->plh_outstanding); 799 800 get_layout_hdr(lo); 801 if (list_empty(&lo->plh_segs)) 802 first = true; 803 spin_unlock(&ino->i_lock); 804 if (first) { 805 /* The lo must be on the clp list if there is any 806 * chance of a CB_LAYOUTRECALL(FILE) coming in. 807 */ 808 spin_lock(&clp->cl_lock); 809 BUG_ON(!list_empty(&lo->plh_layouts)); 810 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 811 spin_unlock(&clp->cl_lock); 812 } 813 814 lseg = send_layoutget(lo, ctx, iomode, gfp_flags); 815 if (!lseg && first) { 816 spin_lock(&clp->cl_lock); 817 list_del_init(&lo->plh_layouts); 818 spin_unlock(&clp->cl_lock); 819 } 820 atomic_dec(&lo->plh_outstanding); 821 put_layout_hdr(lo); 822out: 823 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 824 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); 825 return lseg; 826out_unlock: 827 spin_unlock(&ino->i_lock); 828 goto out; 829} 830 831int 832pnfs_layout_process(struct nfs4_layoutget *lgp) 833{ 834 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 835 struct nfs4_layoutget_res *res = &lgp->res; 836 struct pnfs_layout_segment *lseg; 837 struct inode *ino = lo->plh_inode; 838 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 839 int status = 0; 840 841 /* Verify we got what we asked for. 842 * Note that because the xdr parsing only accepts a single 843 * element array, this can fail even if the server is behaving 844 * correctly. 845 */ 846 if (lgp->args.range.iomode > res->range.iomode || 847 res->range.offset != 0 || 848 res->range.length != NFS4_MAX_UINT64) { 849 status = -EINVAL; 850 goto out; 851 } 852 /* Inject layout blob into I/O device driver */ 853 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 854 if (!lseg || IS_ERR(lseg)) { 855 if (!lseg) 856 status = -ENOMEM; 857 else 858 status = PTR_ERR(lseg); 859 dprintk("%s: Could not allocate layout: error %d\n", 860 __func__, status); 861 goto out; 862 } 863 864 spin_lock(&ino->i_lock); 865 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || 866 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 867 dprintk("%s forget reply due to recall\n", __func__); 868 goto out_forget_reply; 869 } 870 871 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { 872 dprintk("%s forget reply due to state\n", __func__); 873 goto out_forget_reply; 874 } 875 init_lseg(lo, lseg); 876 lseg->pls_range = res->range; 877 *lgp->lsegpp = get_lseg(lseg); 878 pnfs_insert_layout(lo, lseg); 879 880 if (res->return_on_close) { 881 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 882 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); 883 } 884 885 /* Done processing layoutget. Set the layout stateid */ 886 pnfs_set_layout_stateid(lo, &res->stateid, false); 887 spin_unlock(&ino->i_lock); 888out: 889 return status; 890 891out_forget_reply: 892 spin_unlock(&ino->i_lock); 893 lseg->pls_layout = lo; 894 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 895 goto out; 896} 897 898static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, 899 struct nfs_page *prev, 900 struct nfs_page *req) 901{ 902 if (pgio->pg_count == prev->wb_bytes) { 903 /* This is first coelesce call for a series of nfs_pages */ 904 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 905 prev->wb_context, 906 IOMODE_READ, 907 GFP_KERNEL); 908 } 909 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); 910} 911 912void 913pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) 914{ 915 struct pnfs_layoutdriver_type *ld; 916 917 ld = NFS_SERVER(inode)->pnfs_curr_ld; 918 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; 919} 920 921static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, 922 struct nfs_page *prev, 923 struct nfs_page *req) 924{ 925 if (pgio->pg_count == prev->wb_bytes) { 926 /* This is first coelesce call for a series of nfs_pages */ 927 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 928 prev->wb_context, 929 IOMODE_RW, 930 GFP_NOFS); 931 } 932 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); 933} 934 935void 936pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) 937{ 938 struct pnfs_layoutdriver_type *ld; 939 940 ld = NFS_SERVER(inode)->pnfs_curr_ld; 941 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; 942} 943 944enum pnfs_try_status 945pnfs_try_to_write_data(struct nfs_write_data *wdata, 946 const struct rpc_call_ops *call_ops, int how) 947{ 948 struct inode *inode = wdata->inode; 949 enum pnfs_try_status trypnfs; 950 struct nfs_server *nfss = NFS_SERVER(inode); 951 952 wdata->mds_ops = call_ops; 953 954 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 955 inode->i_ino, wdata->args.count, wdata->args.offset, how); 956 957 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); 958 if (trypnfs == PNFS_NOT_ATTEMPTED) { 959 put_lseg(wdata->lseg); 960 wdata->lseg = NULL; 961 } else 962 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 963 964 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 965 return trypnfs; 966} 967 968/* 969 * Call the appropriate parallel I/O subsystem read function. 970 */ 971enum pnfs_try_status 972pnfs_try_to_read_data(struct nfs_read_data *rdata, 973 const struct rpc_call_ops *call_ops) 974{ 975 struct inode *inode = rdata->inode; 976 struct nfs_server *nfss = NFS_SERVER(inode); 977 enum pnfs_try_status trypnfs; 978 979 rdata->mds_ops = call_ops; 980 981 dprintk("%s: Reading ino:%lu %u@%llu\n", 982 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 983 984 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); 985 if (trypnfs == PNFS_NOT_ATTEMPTED) { 986 put_lseg(rdata->lseg); 987 rdata->lseg = NULL; 988 } else { 989 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 990 } 991 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 992 return trypnfs; 993} 994 995/* 996 * Currently there is only one (whole file) write lseg. 997 */ 998static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) 999{ 1000 struct pnfs_layout_segment *lseg, *rv = NULL; 1001 1002 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) 1003 if (lseg->pls_range.iomode == IOMODE_RW) 1004 rv = lseg; 1005 return rv; 1006} 1007 1008void 1009pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1010{ 1011 struct nfs_inode *nfsi = NFS_I(wdata->inode); 1012 loff_t end_pos = wdata->args.offset + wdata->res.count; 1013 bool mark_as_dirty = false; 1014 1015 spin_lock(&nfsi->vfs_inode.i_lock); 1016 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1017 /* references matched in nfs4_layoutcommit_release */ 1018 get_lseg(wdata->lseg); 1019 wdata->lseg->pls_lc_cred = 1020 get_rpccred(wdata->args.context->state->owner->so_cred); 1021 mark_as_dirty = true; 1022 dprintk("%s: Set layoutcommit for inode %lu ", 1023 __func__, wdata->inode->i_ino); 1024 } 1025 if (end_pos > wdata->lseg->pls_end_pos) 1026 wdata->lseg->pls_end_pos = end_pos; 1027 spin_unlock(&nfsi->vfs_inode.i_lock); 1028 1029 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1030 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1031 if (mark_as_dirty) 1032 mark_inode_dirty_sync(wdata->inode); 1033} 1034EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1035 1036/* 1037 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1038 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1039 * data to disk to allow the server to recover the data if it crashes. 1040 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag 1041 * is off, and a COMMIT is sent to a data server, or 1042 * if WRITEs to a data server return NFS_DATA_SYNC. 1043 */ 1044int 1045pnfs_layoutcommit_inode(struct inode *inode, bool sync) 1046{ 1047 struct nfs4_layoutcommit_data *data; 1048 struct nfs_inode *nfsi = NFS_I(inode); 1049 struct pnfs_layout_segment *lseg; 1050 struct rpc_cred *cred; 1051 loff_t end_pos; 1052 int status = 0; 1053 1054 dprintk("--> %s inode %lu\n", __func__, inode->i_ino); 1055 1056 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 1057 return 0; 1058 1059 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1060 data = kzalloc(sizeof(*data), GFP_NOFS); 1061 if (!data) { 1062 mark_inode_dirty_sync(inode); 1063 status = -ENOMEM; 1064 goto out; 1065 } 1066 1067 spin_lock(&inode->i_lock); 1068 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1069 spin_unlock(&inode->i_lock); 1070 kfree(data); 1071 goto out; 1072 } 1073 /* 1074 * Currently only one (whole file) write lseg which is referenced 1075 * in pnfs_set_layoutcommit and will be found. 1076 */ 1077 lseg = pnfs_list_write_lseg(inode); 1078 1079 end_pos = lseg->pls_end_pos; 1080 cred = lseg->pls_lc_cred; 1081 lseg->pls_end_pos = 0; 1082 lseg->pls_lc_cred = NULL; 1083 1084 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, 1085 sizeof(nfsi->layout->plh_stateid.data)); 1086 spin_unlock(&inode->i_lock); 1087 1088 data->args.inode = inode; 1089 data->lseg = lseg; 1090 data->cred = cred; 1091 nfs_fattr_init(&data->fattr); 1092 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1093 data->res.fattr = &data->fattr; 1094 data->args.lastbytewritten = end_pos - 1; 1095 data->res.server = NFS_SERVER(inode); 1096 1097 status = nfs4_proc_layoutcommit(data, sync); 1098out: 1099 dprintk("<-- %s status %d\n", __func__, status); 1100 return status; 1101}