Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

* 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (28 commits)
pnfsblock: write_pagelist handle zero invalid extents
pnfsblock: note written INVAL areas for layoutcommit
pnfsblock: bl_write_pagelist
pnfsblock: bl_read_pagelist
pnfsblock: cleanup_layoutcommit
pnfsblock: encode_layoutcommit
pnfsblock: merge rw extents
pnfsblock: add extent manipulation functions
pnfsblock: bl_find_get_extent
pnfsblock: xdr decode pnfs_block_layout4
pnfsblock: call and parse getdevicelist
pnfsblock: merge extents
pnfsblock: lseg alloc and free
pnfsblock: remove device operations
pnfsblock: add device operations
pnfsblock: basic extent code
pnfsblock: use pageio_ops api
pnfsblock: add blocklayout Kconfig option, Makefile, and stubs
pnfs: cleanup_layoutcommit
pnfs: ask for layout_blksize and save it in nfs_server
...

+3113 -91
+7 -1
fs/nfs/Kconfig
 	depends on NFS_FS && NFS_V4 && EXPERIMENTAL
 	select SUNRPC_BACKCHANNEL
 	select PNFS_FILE_LAYOUT
+	select PNFS_BLOCK
+	select MD
+	select BLK_DEV_DM
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
-	  (RFC 5661) in the kernel's NFS client.
+	  (RFC 5661 and RFC 5663) in the kernel's NFS client.
 
 	  If unsure, say N.
 
 config PNFS_FILE_LAYOUT
+	tristate
+
+config PNFS_BLOCK
 	tristate
 
 config PNFS_OBJLAYOUT
+1
fs/nfs/Makefile
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
 
 obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+5
fs/nfs/blocklayout/Makefile
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+1019
fs/nfs/blocklayout/blocklayout.c
··· 1 + /* 2 + * linux/fs/nfs/blocklayout/blocklayout.c 3 + * 4 + * Module for the NFSv4.1 pNFS block layout driver. 5 + * 6 + * Copyright (c) 2006 The Regents of the University of Michigan. 7 + * All rights reserved. 8 + * 9 + * Andy Adamson <andros@citi.umich.edu> 10 + * Fred Isaman <iisaman@umich.edu> 11 + * 12 + * permission is granted to use, copy, create derivative works and 13 + * redistribute this software and such derivative works for any purpose, 14 + * so long as the name of the university of michigan is not used in 15 + * any advertising or publicity pertaining to the use or distribution 16 + * of this software without specific, written prior authorization. if 17 + * the above copyright notice or any other identification of the 18 + * university of michigan is included in any copy of any portion of 19 + * this software, then the disclaimer below must also be included. 20 + * 21 + * this software is provided as is, without representation from the 22 + * university of michigan as to its fitness for any purpose, and without 23 + * warranty by the university of michigan of any kind, either express 24 + * or implied, including without limitation the implied warranties of 25 + * merchantability and fitness for a particular purpose. the regents 26 + * of the university of michigan shall not be liable for any damages, 27 + * including special, indirect, incidental, or consequential damages, 28 + * with respect to any claim arising out or in connection with the use 29 + * of the software, even if it has been or is hereafter advised of the 30 + * possibility of such damages. 31 + */ 32 + 33 + #include <linux/module.h> 34 + #include <linux/init.h> 35 + #include <linux/mount.h> 36 + #include <linux/namei.h> 37 + #include <linux/bio.h> /* struct bio */ 38 + #include <linux/buffer_head.h> /* various write calls */ 39 + 40 + #include "blocklayout.h" 41 + 42 + #define NFSDBG_FACILITY NFSDBG_PNFS_LD 43 + 44 + MODULE_LICENSE("GPL"); 45 + MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 46 + MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 47 + 48 + struct dentry *bl_device_pipe; 49 + wait_queue_head_t bl_wq; 50 + 51 + static void print_page(struct page *page) 52 + { 53 + dprintk("PRINTPAGE page %p\n", page); 54 + dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 + dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 + dprintk(" PageError %d\n", PageError(page)); 57 + dprintk(" PageDirty %d\n", PageDirty(page)); 58 + dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 + dprintk(" PageLocked %d\n", PageLocked(page)); 60 + dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 + dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); 62 + dprintk("\n"); 63 + } 64 + 65 + /* Given the be associated with isect, determine if page data needs to be 66 + * initialized. 67 + */ 68 + static int is_hole(struct pnfs_block_extent *be, sector_t isect) 69 + { 70 + if (be->be_state == PNFS_BLOCK_NONE_DATA) 71 + return 1; 72 + else if (be->be_state != PNFS_BLOCK_INVALID_DATA) 73 + return 0; 74 + else 75 + return !bl_is_sector_init(be->be_inval, isect); 76 + } 77 + 78 + /* Given the be associated with isect, determine if page data can be 79 + * written to disk. 80 + */ 81 + static int is_writable(struct pnfs_block_extent *be, sector_t isect) 82 + { 83 + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || 84 + be->be_state == PNFS_BLOCK_INVALID_DATA); 85 + } 86 + 87 + /* The data we are handed might be spread across several bios. We need 88 + * to track when the last one is finished. 
89 + */ 90 + struct parallel_io { 91 + struct kref refcnt; 92 + struct rpc_call_ops call_ops; 93 + void (*pnfs_callback) (void *data); 94 + void *data; 95 + }; 96 + 97 + static inline struct parallel_io *alloc_parallel(void *data) 98 + { 99 + struct parallel_io *rv; 100 + 101 + rv = kmalloc(sizeof(*rv), GFP_NOFS); 102 + if (rv) { 103 + rv->data = data; 104 + kref_init(&rv->refcnt); 105 + } 106 + return rv; 107 + } 108 + 109 + static inline void get_parallel(struct parallel_io *p) 110 + { 111 + kref_get(&p->refcnt); 112 + } 113 + 114 + static void destroy_parallel(struct kref *kref) 115 + { 116 + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 117 + 118 + dprintk("%s enter\n", __func__); 119 + p->pnfs_callback(p->data); 120 + kfree(p); 121 + } 122 + 123 + static inline void put_parallel(struct parallel_io *p) 124 + { 125 + kref_put(&p->refcnt, destroy_parallel); 126 + } 127 + 128 + static struct bio * 129 + bl_submit_bio(int rw, struct bio *bio) 130 + { 131 + if (bio) { 132 + get_parallel(bio->bi_private); 133 + dprintk("%s submitting %s bio %u@%llu\n", __func__, 134 + rw == READ ? "read" : "write", 135 + bio->bi_size, (unsigned long long)bio->bi_sector); 136 + submit_bio(rw, bio); 137 + } 138 + return NULL; 139 + } 140 + 141 + static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 142 + struct pnfs_block_extent *be, 143 + void (*end_io)(struct bio *, int err), 144 + struct parallel_io *par) 145 + { 146 + struct bio *bio; 147 + 148 + bio = bio_alloc(GFP_NOIO, npg); 149 + if (!bio) 150 + return NULL; 151 + 152 + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; 153 + bio->bi_bdev = be->be_mdev; 154 + bio->bi_end_io = end_io; 155 + bio->bi_private = par; 156 + return bio; 157 + } 158 + 159 + static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, 160 + sector_t isect, struct page *page, 161 + struct pnfs_block_extent *be, 162 + void (*end_io)(struct bio *, int err), 163 + struct parallel_io *par) 164 + { 165 + retry: 166 + if (!bio) { 167 + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 168 + if (!bio) 169 + return ERR_PTR(-ENOMEM); 170 + } 171 + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 172 + bio = bl_submit_bio(rw, bio); 173 + goto retry; 174 + } 175 + return bio; 176 + } 177 + 178 + static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) 179 + { 180 + if (lseg->pls_range.iomode == IOMODE_RW) { 181 + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); 182 + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); 183 + } else { 184 + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); 185 + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); 186 + } 187 + } 188 + 189 + /* This is basically copied from mpage_end_io_read */ 190 + static void bl_end_io_read(struct bio *bio, int err) 191 + { 192 + struct parallel_io *par = bio->bi_private; 193 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 194 + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 195 + struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; 196 + 197 + do { 198 + struct page *page = bvec->bv_page; 199 + 200 + if (--bvec >= bio->bi_io_vec) 201 + prefetchw(&bvec->bv_page->flags); 202 + if (uptodate) 203 + SetPageUptodate(page); 204 + } while (bvec >= bio->bi_io_vec); 205 + if (!uptodate) { 206 + if (!rdata->pnfs_error) 207 + rdata->pnfs_error = -EIO; 208 + bl_set_lo_fail(rdata->lseg); 209 + } 210 + bio_put(bio); 211 + put_parallel(par); 212 + } 213 + 214 + static 
void bl_read_cleanup(struct work_struct *work) 215 + { 216 + struct rpc_task *task; 217 + struct nfs_read_data *rdata; 218 + dprintk("%s enter\n", __func__); 219 + task = container_of(work, struct rpc_task, u.tk_work); 220 + rdata = container_of(task, struct nfs_read_data, task); 221 + pnfs_ld_read_done(rdata); 222 + } 223 + 224 + static void 225 + bl_end_par_io_read(void *data) 226 + { 227 + struct nfs_read_data *rdata = data; 228 + 229 + INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 230 + schedule_work(&rdata->task.u.tk_work); 231 + } 232 + 233 + /* We don't want normal .rpc_call_done callback used, so we replace it 234 + * with this stub. 235 + */ 236 + static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) 237 + { 238 + return; 239 + } 240 + 241 + static enum pnfs_try_status 242 + bl_read_pagelist(struct nfs_read_data *rdata) 243 + { 244 + int i, hole; 245 + struct bio *bio = NULL; 246 + struct pnfs_block_extent *be = NULL, *cow_read = NULL; 247 + sector_t isect, extent_length = 0; 248 + struct parallel_io *par; 249 + loff_t f_offset = rdata->args.offset; 250 + size_t count = rdata->args.count; 251 + struct page **pages = rdata->args.pages; 252 + int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 253 + 254 + dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, 255 + rdata->npages, f_offset, count); 256 + 257 + par = alloc_parallel(rdata); 258 + if (!par) 259 + goto use_mds; 260 + par->call_ops = *rdata->mds_ops; 261 + par->call_ops.rpc_call_done = bl_rpc_do_nothing; 262 + par->pnfs_callback = bl_end_par_io_read; 263 + /* At this point, we can no longer jump to use_mds */ 264 + 265 + isect = (sector_t) (f_offset >> SECTOR_SHIFT); 266 + /* Code assumes extents are page-aligned */ 267 + for (i = pg_index; i < rdata->npages; i++) { 268 + if (!extent_length) { 269 + /* We've used up the previous extent */ 270 + bl_put_extent(be); 271 + bl_put_extent(cow_read); 272 + bio = bl_submit_bio(READ, bio); 273 + /* Get the next one */ 274 + be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), 275 + isect, &cow_read); 276 + if (!be) { 277 + rdata->pnfs_error = -EIO; 278 + goto out; 279 + } 280 + extent_length = be->be_length - 281 + (isect - be->be_f_offset); 282 + if (cow_read) { 283 + sector_t cow_length = cow_read->be_length - 284 + (isect - cow_read->be_f_offset); 285 + extent_length = min(extent_length, cow_length); 286 + } 287 + } 288 + hole = is_hole(be, isect); 289 + if (hole && !cow_read) { 290 + bio = bl_submit_bio(READ, bio); 291 + /* Fill hole w/ zeroes w/o accessing device */ 292 + dprintk("%s Zeroing page for hole\n", __func__); 293 + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); 294 + print_page(pages[i]); 295 + SetPageUptodate(pages[i]); 296 + } else { 297 + struct pnfs_block_extent *be_read; 298 + 299 + be_read = (hole && cow_read) ? 
cow_read : be; 300 + bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, 301 + isect, pages[i], be_read, 302 + bl_end_io_read, par); 303 + if (IS_ERR(bio)) { 304 + rdata->pnfs_error = PTR_ERR(bio); 305 + goto out; 306 + } 307 + } 308 + isect += PAGE_CACHE_SECTORS; 309 + extent_length -= PAGE_CACHE_SECTORS; 310 + } 311 + if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { 312 + rdata->res.eof = 1; 313 + rdata->res.count = rdata->inode->i_size - f_offset; 314 + } else { 315 + rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; 316 + } 317 + out: 318 + bl_put_extent(be); 319 + bl_put_extent(cow_read); 320 + bl_submit_bio(READ, bio); 321 + put_parallel(par); 322 + return PNFS_ATTEMPTED; 323 + 324 + use_mds: 325 + dprintk("Giving up and using normal NFS\n"); 326 + return PNFS_NOT_ATTEMPTED; 327 + } 328 + 329 + static void mark_extents_written(struct pnfs_block_layout *bl, 330 + __u64 offset, __u32 count) 331 + { 332 + sector_t isect, end; 333 + struct pnfs_block_extent *be; 334 + 335 + dprintk("%s(%llu, %u)\n", __func__, offset, count); 336 + if (count == 0) 337 + return; 338 + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; 339 + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); 340 + end >>= SECTOR_SHIFT; 341 + while (isect < end) { 342 + sector_t len; 343 + be = bl_find_get_extent(bl, isect, NULL); 344 + BUG_ON(!be); /* FIXME */ 345 + len = min(end, be->be_f_offset + be->be_length) - isect; 346 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) 347 + bl_mark_for_commit(be, isect, len); /* What if fails? */ 348 + isect += len; 349 + bl_put_extent(be); 350 + } 351 + } 352 + 353 + static void bl_end_io_write_zero(struct bio *bio, int err) 354 + { 355 + struct parallel_io *par = bio->bi_private; 356 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 357 + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 358 + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; 359 + 360 + do { 361 + struct page *page = bvec->bv_page; 362 + 363 + if (--bvec >= bio->bi_io_vec) 364 + prefetchw(&bvec->bv_page->flags); 365 + /* This is the zeroing page we added */ 366 + end_page_writeback(page); 367 + page_cache_release(page); 368 + } while (bvec >= bio->bi_io_vec); 369 + if (!uptodate) { 370 + if (!wdata->pnfs_error) 371 + wdata->pnfs_error = -EIO; 372 + bl_set_lo_fail(wdata->lseg); 373 + } 374 + bio_put(bio); 375 + put_parallel(par); 376 + } 377 + 378 + /* This is basically copied from mpage_end_io_read */ 379 + static void bl_end_io_write(struct bio *bio, int err) 380 + { 381 + struct parallel_io *par = bio->bi_private; 382 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 383 + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; 384 + 385 + if (!uptodate) { 386 + if (!wdata->pnfs_error) 387 + wdata->pnfs_error = -EIO; 388 + bl_set_lo_fail(wdata->lseg); 389 + } 390 + bio_put(bio); 391 + put_parallel(par); 392 + } 393 + 394 + /* Function scheduled for call during bl_end_par_io_write, 395 + * it marks sectors as written and extends the commitlist. 
396 + */ 397 + static void bl_write_cleanup(struct work_struct *work) 398 + { 399 + struct rpc_task *task; 400 + struct nfs_write_data *wdata; 401 + dprintk("%s enter\n", __func__); 402 + task = container_of(work, struct rpc_task, u.tk_work); 403 + wdata = container_of(task, struct nfs_write_data, task); 404 + if (!wdata->pnfs_error) { 405 + /* Marks for LAYOUTCOMMIT */ 406 + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 407 + wdata->args.offset, wdata->args.count); 408 + } 409 + pnfs_ld_write_done(wdata); 410 + } 411 + 412 + /* Called when last of bios associated with a bl_write_pagelist call finishes */ 413 + static void bl_end_par_io_write(void *data) 414 + { 415 + struct nfs_write_data *wdata = data; 416 + 417 + wdata->task.tk_status = 0; 418 + wdata->verf.committed = NFS_FILE_SYNC; 419 + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 420 + schedule_work(&wdata->task.u.tk_work); 421 + } 422 + 423 + /* FIXME STUB - mark intersection of layout and page as bad, so is not 424 + * used again. 425 + */ 426 + static void mark_bad_read(void) 427 + { 428 + return; 429 + } 430 + 431 + /* 432 + * map_block: map a requested I/0 block (isect) into an offset in the LVM 433 + * block_device 434 + */ 435 + static void 436 + map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) 437 + { 438 + dprintk("%s enter be=%p\n", __func__, be); 439 + 440 + set_buffer_mapped(bh); 441 + bh->b_bdev = be->be_mdev; 442 + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> 443 + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); 444 + 445 + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", 446 + __func__, (unsigned long long)isect, (long)bh->b_blocknr, 447 + bh->b_size); 448 + return; 449 + } 450 + 451 + /* Given an unmapped page, zero it or read in page for COW, page is locked 452 + * by caller. 453 + */ 454 + static int 455 + init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) 456 + { 457 + struct buffer_head *bh = NULL; 458 + int ret = 0; 459 + sector_t isect; 460 + 461 + dprintk("%s enter, %p\n", __func__, page); 462 + BUG_ON(PageUptodate(page)); 463 + if (!cow_read) { 464 + zero_user_segment(page, 0, PAGE_SIZE); 465 + SetPageUptodate(page); 466 + goto cleanup; 467 + } 468 + 469 + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); 470 + if (!bh) { 471 + ret = -ENOMEM; 472 + goto cleanup; 473 + } 474 + 475 + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; 476 + map_block(bh, isect, cow_read); 477 + if (!bh_uptodate_or_lock(bh)) 478 + ret = bh_submit_read(bh); 479 + if (ret) 480 + goto cleanup; 481 + SetPageUptodate(page); 482 + 483 + cleanup: 484 + bl_put_extent(cow_read); 485 + if (bh) 486 + free_buffer_head(bh); 487 + if (ret) { 488 + /* Need to mark layout with bad read...should now 489 + * just use nfs4 for reads and writes. 
490 + */ 491 + mark_bad_read(); 492 + } 493 + return ret; 494 + } 495 + 496 + static enum pnfs_try_status 497 + bl_write_pagelist(struct nfs_write_data *wdata, int sync) 498 + { 499 + int i, ret, npg_zero, pg_index, last = 0; 500 + struct bio *bio = NULL; 501 + struct pnfs_block_extent *be = NULL, *cow_read = NULL; 502 + sector_t isect, last_isect = 0, extent_length = 0; 503 + struct parallel_io *par; 504 + loff_t offset = wdata->args.offset; 505 + size_t count = wdata->args.count; 506 + struct page **pages = wdata->args.pages; 507 + struct page *page; 508 + pgoff_t index; 509 + u64 temp; 510 + int npg_per_block = 511 + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; 512 + 513 + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 514 + /* At this point, wdata->pages is a (sequential) list of nfs_pages. 515 + * We want to write each, and if there is an error set pnfs_error 516 + * to have it redone using nfs. 517 + */ 518 + par = alloc_parallel(wdata); 519 + if (!par) 520 + return PNFS_NOT_ATTEMPTED; 521 + par->call_ops = *wdata->mds_ops; 522 + par->call_ops.rpc_call_done = bl_rpc_do_nothing; 523 + par->pnfs_callback = bl_end_par_io_write; 524 + /* At this point, have to be more careful with error handling */ 525 + 526 + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 527 + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); 528 + if (!be || !is_writable(be, isect)) { 529 + dprintk("%s no matching extents!\n", __func__); 530 + wdata->pnfs_error = -EINVAL; 531 + goto out; 532 + } 533 + 534 + /* First page inside INVALID extent */ 535 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 536 + temp = offset >> PAGE_CACHE_SHIFT; 537 + npg_zero = do_div(temp, npg_per_block); 538 + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & 539 + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 540 + extent_length = be->be_length - (isect - be->be_f_offset); 541 + 542 + fill_invalid_ext: 543 + dprintk("%s need to zero %d pages\n", __func__, npg_zero); 544 + for (;npg_zero > 0; npg_zero--) { 545 + /* page ref released in bl_end_io_write_zero */ 546 + index = isect >> PAGE_CACHE_SECTOR_SHIFT; 547 + dprintk("%s zero %dth page: index %lu isect %llu\n", 548 + __func__, npg_zero, index, 549 + (unsigned long long)isect); 550 + page = 551 + find_or_create_page(wdata->inode->i_mapping, index, 552 + GFP_NOFS); 553 + if (!page) { 554 + dprintk("%s oom\n", __func__); 555 + wdata->pnfs_error = -ENOMEM; 556 + goto out; 557 + } 558 + 559 + /* PageDirty: Other will write this out 560 + * PageWriteback: Other is writing this out 561 + * PageUptodate: It was read before 562 + * sector_initialized: already written out 563 + */ 564 + if (PageDirty(page) || PageWriteback(page) || 565 + bl_is_sector_init(be->be_inval, isect)) { 566 + print_page(page); 567 + unlock_page(page); 568 + page_cache_release(page); 569 + goto next_page; 570 + } 571 + if (!PageUptodate(page)) { 572 + /* New page, readin or zero it */ 573 + init_page_for_write(page, cow_read); 574 + } 575 + set_page_writeback(page); 576 + unlock_page(page); 577 + 578 + ret = bl_mark_sectors_init(be->be_inval, isect, 579 + PAGE_CACHE_SECTORS, 580 + NULL); 581 + if (unlikely(ret)) { 582 + dprintk("%s bl_mark_sectors_init fail %d\n", 583 + __func__, ret); 584 + end_page_writeback(page); 585 + page_cache_release(page); 586 + wdata->pnfs_error = ret; 587 + goto out; 588 + } 589 + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, 590 + isect, page, be, 591 + bl_end_io_write_zero, par); 592 + if (IS_ERR(bio)) { 593 + 
wdata->pnfs_error = PTR_ERR(bio); 594 + goto out; 595 + } 596 + /* FIXME: This should be done in bi_end_io */ 597 + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 598 + page->index << PAGE_CACHE_SHIFT, 599 + PAGE_CACHE_SIZE); 600 + next_page: 601 + isect += PAGE_CACHE_SECTORS; 602 + extent_length -= PAGE_CACHE_SECTORS; 603 + } 604 + if (last) 605 + goto write_done; 606 + } 607 + bio = bl_submit_bio(WRITE, bio); 608 + 609 + /* Middle pages */ 610 + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; 611 + for (i = pg_index; i < wdata->npages; i++) { 612 + if (!extent_length) { 613 + /* We've used up the previous extent */ 614 + bl_put_extent(be); 615 + bio = bl_submit_bio(WRITE, bio); 616 + /* Get the next one */ 617 + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), 618 + isect, NULL); 619 + if (!be || !is_writable(be, isect)) { 620 + wdata->pnfs_error = -EINVAL; 621 + goto out; 622 + } 623 + extent_length = be->be_length - 624 + (isect - be->be_f_offset); 625 + } 626 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 627 + ret = bl_mark_sectors_init(be->be_inval, isect, 628 + PAGE_CACHE_SECTORS, 629 + NULL); 630 + if (unlikely(ret)) { 631 + dprintk("%s bl_mark_sectors_init fail %d\n", 632 + __func__, ret); 633 + wdata->pnfs_error = ret; 634 + goto out; 635 + } 636 + } 637 + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, 638 + isect, pages[i], be, 639 + bl_end_io_write, par); 640 + if (IS_ERR(bio)) { 641 + wdata->pnfs_error = PTR_ERR(bio); 642 + goto out; 643 + } 644 + isect += PAGE_CACHE_SECTORS; 645 + last_isect = isect; 646 + extent_length -= PAGE_CACHE_SECTORS; 647 + } 648 + 649 + /* Last page inside INVALID extent */ 650 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 651 + bio = bl_submit_bio(WRITE, bio); 652 + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 653 + npg_zero = npg_per_block - do_div(temp, npg_per_block); 654 + if (npg_zero < npg_per_block) { 655 + last = 1; 656 + goto fill_invalid_ext; 657 + } 658 + } 659 + 660 + write_done: 661 + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); 662 + if (count < wdata->res.count) { 663 + wdata->res.count = count; 664 + } 665 + out: 666 + bl_put_extent(be); 667 + bl_submit_bio(WRITE, bio); 668 + put_parallel(par); 669 + return PNFS_ATTEMPTED; 670 + } 671 + 672 + /* FIXME - range ignored */ 673 + static void 674 + release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) 675 + { 676 + int i; 677 + struct pnfs_block_extent *be; 678 + 679 + spin_lock(&bl->bl_ext_lock); 680 + for (i = 0; i < EXTENT_LISTS; i++) { 681 + while (!list_empty(&bl->bl_extents[i])) { 682 + be = list_first_entry(&bl->bl_extents[i], 683 + struct pnfs_block_extent, 684 + be_node); 685 + list_del(&be->be_node); 686 + bl_put_extent(be); 687 + } 688 + } 689 + spin_unlock(&bl->bl_ext_lock); 690 + } 691 + 692 + static void 693 + release_inval_marks(struct pnfs_inval_markings *marks) 694 + { 695 + struct pnfs_inval_tracking *pos, *temp; 696 + 697 + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { 698 + list_del(&pos->it_link); 699 + kfree(pos); 700 + } 701 + return; 702 + } 703 + 704 + static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 705 + { 706 + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 707 + 708 + dprintk("%s enter\n", __func__); 709 + release_extents(bl, NULL); 710 + release_inval_marks(&bl->bl_inval); 711 + kfree(bl); 712 + } 713 + 714 + static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, 715 + gfp_t gfp_flags) 716 + { 717 + struct pnfs_block_layout *bl; 718 + 719 + 
dprintk("%s enter\n", __func__); 720 + bl = kzalloc(sizeof(*bl), gfp_flags); 721 + if (!bl) 722 + return NULL; 723 + spin_lock_init(&bl->bl_ext_lock); 724 + INIT_LIST_HEAD(&bl->bl_extents[0]); 725 + INIT_LIST_HEAD(&bl->bl_extents[1]); 726 + INIT_LIST_HEAD(&bl->bl_commit); 727 + INIT_LIST_HEAD(&bl->bl_committing); 728 + bl->bl_count = 0; 729 + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; 730 + BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); 731 + return &bl->bl_layout; 732 + } 733 + 734 + static void bl_free_lseg(struct pnfs_layout_segment *lseg) 735 + { 736 + dprintk("%s enter\n", __func__); 737 + kfree(lseg); 738 + } 739 + 740 + /* We pretty much ignore lseg, and store all data layout wide, so we 741 + * can correctly merge. 742 + */ 743 + static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 744 + struct nfs4_layoutget_res *lgr, 745 + gfp_t gfp_flags) 746 + { 747 + struct pnfs_layout_segment *lseg; 748 + int status; 749 + 750 + dprintk("%s enter\n", __func__); 751 + lseg = kzalloc(sizeof(*lseg), gfp_flags); 752 + if (!lseg) 753 + return ERR_PTR(-ENOMEM); 754 + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 755 + if (status) { 756 + /* We don't want to call the full-blown bl_free_lseg, 757 + * since on error extents were not touched. 758 + */ 759 + kfree(lseg); 760 + return ERR_PTR(status); 761 + } 762 + return lseg; 763 + } 764 + 765 + static void 766 + bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, 767 + const struct nfs4_layoutcommit_args *arg) 768 + { 769 + dprintk("%s enter\n", __func__); 770 + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 771 + } 772 + 773 + static void 774 + bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 775 + { 776 + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 777 + 778 + dprintk("%s enter\n", __func__); 779 + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 780 + } 781 + 782 + static void free_blk_mountid(struct block_mount_id *mid) 783 + { 784 + if (mid) { 785 + struct pnfs_block_dev *dev; 786 + spin_lock(&mid->bm_lock); 787 + while (!list_empty(&mid->bm_devlist)) { 788 + dev = list_first_entry(&mid->bm_devlist, 789 + struct pnfs_block_dev, 790 + bm_node); 791 + list_del(&dev->bm_node); 792 + bl_free_block_dev(dev); 793 + } 794 + spin_unlock(&mid->bm_lock); 795 + kfree(mid); 796 + } 797 + } 798 + 799 + /* This is mostly copied from the filelayout's get_device_info function. 800 + * It seems much of this should be at the generic pnfs level. 
801 + */ 802 + static struct pnfs_block_dev * 803 + nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, 804 + struct nfs4_deviceid *d_id) 805 + { 806 + struct pnfs_device *dev; 807 + struct pnfs_block_dev *rv = NULL; 808 + u32 max_resp_sz; 809 + int max_pages; 810 + struct page **pages = NULL; 811 + int i, rc; 812 + 813 + /* 814 + * Use the session max response size as the basis for setting 815 + * GETDEVICEINFO's maxcount 816 + */ 817 + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 818 + max_pages = max_resp_sz >> PAGE_SHIFT; 819 + dprintk("%s max_resp_sz %u max_pages %d\n", 820 + __func__, max_resp_sz, max_pages); 821 + 822 + dev = kmalloc(sizeof(*dev), GFP_NOFS); 823 + if (!dev) { 824 + dprintk("%s kmalloc failed\n", __func__); 825 + return NULL; 826 + } 827 + 828 + pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); 829 + if (pages == NULL) { 830 + kfree(dev); 831 + return NULL; 832 + } 833 + for (i = 0; i < max_pages; i++) { 834 + pages[i] = alloc_page(GFP_NOFS); 835 + if (!pages[i]) 836 + goto out_free; 837 + } 838 + 839 + memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 840 + dev->layout_type = LAYOUT_BLOCK_VOLUME; 841 + dev->pages = pages; 842 + dev->pgbase = 0; 843 + dev->pglen = PAGE_SIZE * max_pages; 844 + dev->mincount = 0; 845 + 846 + dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 847 + rc = nfs4_proc_getdeviceinfo(server, dev); 848 + dprintk("%s getdevice info returns %d\n", __func__, rc); 849 + if (rc) 850 + goto out_free; 851 + 852 + rv = nfs4_blk_decode_device(server, dev); 853 + out_free: 854 + for (i = 0; i < max_pages; i++) 855 + __free_page(pages[i]); 856 + kfree(pages); 857 + kfree(dev); 858 + return rv; 859 + } 860 + 861 + static int 862 + bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 863 + { 864 + struct block_mount_id *b_mt_id = NULL; 865 + struct pnfs_devicelist *dlist = NULL; 866 + struct pnfs_block_dev *bdev; 867 + LIST_HEAD(block_disklist); 868 + int status = 0, i; 869 + 870 + dprintk("%s enter\n", __func__); 871 + 872 + if (server->pnfs_blksize == 0) { 873 + dprintk("%s Server did not return blksize\n", __func__); 874 + return -EINVAL; 875 + } 876 + b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); 877 + if (!b_mt_id) { 878 + status = -ENOMEM; 879 + goto out_error; 880 + } 881 + /* Initialize nfs4 block layout mount id */ 882 + spin_lock_init(&b_mt_id->bm_lock); 883 + INIT_LIST_HEAD(&b_mt_id->bm_devlist); 884 + 885 + dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); 886 + if (!dlist) { 887 + status = -ENOMEM; 888 + goto out_error; 889 + } 890 + dlist->eof = 0; 891 + while (!dlist->eof) { 892 + status = nfs4_proc_getdevicelist(server, fh, dlist); 893 + if (status) 894 + goto out_error; 895 + dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 896 + __func__, dlist->num_devs, dlist->eof); 897 + for (i = 0; i < dlist->num_devs; i++) { 898 + bdev = nfs4_blk_get_deviceinfo(server, fh, 899 + &dlist->dev_id[i]); 900 + if (!bdev) { 901 + status = -ENODEV; 902 + goto out_error; 903 + } 904 + spin_lock(&b_mt_id->bm_lock); 905 + list_add(&bdev->bm_node, &b_mt_id->bm_devlist); 906 + spin_unlock(&b_mt_id->bm_lock); 907 + } 908 + } 909 + dprintk("%s SUCCESS\n", __func__); 910 + server->pnfs_ld_data = b_mt_id; 911 + 912 + out_return: 913 + kfree(dlist); 914 + return status; 915 + 916 + out_error: 917 + free_blk_mountid(b_mt_id); 918 + goto out_return; 919 + } 920 + 921 + static int 922 + bl_clear_layoutdriver(struct nfs_server *server) 923 + { 924 + struct block_mount_id 
*b_mt_id = server->pnfs_ld_data; 925 + 926 + dprintk("%s enter\n", __func__); 927 + free_blk_mountid(b_mt_id); 928 + dprintk("%s RETURNS\n", __func__); 929 + return 0; 930 + } 931 + 932 + static const struct nfs_pageio_ops bl_pg_read_ops = { 933 + .pg_init = pnfs_generic_pg_init_read, 934 + .pg_test = pnfs_generic_pg_test, 935 + .pg_doio = pnfs_generic_pg_readpages, 936 + }; 937 + 938 + static const struct nfs_pageio_ops bl_pg_write_ops = { 939 + .pg_init = pnfs_generic_pg_init_write, 940 + .pg_test = pnfs_generic_pg_test, 941 + .pg_doio = pnfs_generic_pg_writepages, 942 + }; 943 + 944 + static struct pnfs_layoutdriver_type blocklayout_type = { 945 + .id = LAYOUT_BLOCK_VOLUME, 946 + .name = "LAYOUT_BLOCK_VOLUME", 947 + .read_pagelist = bl_read_pagelist, 948 + .write_pagelist = bl_write_pagelist, 949 + .alloc_layout_hdr = bl_alloc_layout_hdr, 950 + .free_layout_hdr = bl_free_layout_hdr, 951 + .alloc_lseg = bl_alloc_lseg, 952 + .free_lseg = bl_free_lseg, 953 + .encode_layoutcommit = bl_encode_layoutcommit, 954 + .cleanup_layoutcommit = bl_cleanup_layoutcommit, 955 + .set_layoutdriver = bl_set_layoutdriver, 956 + .clear_layoutdriver = bl_clear_layoutdriver, 957 + .pg_read_ops = &bl_pg_read_ops, 958 + .pg_write_ops = &bl_pg_write_ops, 959 + }; 960 + 961 + static const struct rpc_pipe_ops bl_upcall_ops = { 962 + .upcall = bl_pipe_upcall, 963 + .downcall = bl_pipe_downcall, 964 + .destroy_msg = bl_pipe_destroy_msg, 965 + }; 966 + 967 + static int __init nfs4blocklayout_init(void) 968 + { 969 + struct vfsmount *mnt; 970 + struct path path; 971 + int ret; 972 + 973 + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); 974 + 975 + ret = pnfs_register_layoutdriver(&blocklayout_type); 976 + if (ret) 977 + goto out; 978 + 979 + init_waitqueue_head(&bl_wq); 980 + 981 + mnt = rpc_get_mount(); 982 + if (IS_ERR(mnt)) { 983 + ret = PTR_ERR(mnt); 984 + goto out_remove; 985 + } 986 + 987 + ret = vfs_path_lookup(mnt->mnt_root, 988 + mnt, 989 + NFS_PIPE_DIRNAME, 0, &path); 990 + if (ret) 991 + goto out_remove; 992 + 993 + bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, 994 + &bl_upcall_ops, 0); 995 + if (IS_ERR(bl_device_pipe)) { 996 + ret = PTR_ERR(bl_device_pipe); 997 + goto out_remove; 998 + } 999 + out: 1000 + return ret; 1001 + 1002 + out_remove: 1003 + pnfs_unregister_layoutdriver(&blocklayout_type); 1004 + return ret; 1005 + } 1006 + 1007 + static void __exit nfs4blocklayout_exit(void) 1008 + { 1009 + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 1010 + __func__); 1011 + 1012 + pnfs_unregister_layoutdriver(&blocklayout_type); 1013 + rpc_unlink(bl_device_pipe); 1014 + } 1015 + 1016 + MODULE_ALIAS("nfs-layouttype4-3"); 1017 + 1018 + module_init(nfs4blocklayout_init); 1019 + module_exit(nfs4blocklayout_exit);
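
As an aside for readers following the diff: the heart of bl_read_pagelist()/bl_write_pagelist() is the translation from a file byte offset to a 512-byte sector index in the file (isect) and from there, through the covering extent, to a sector on the volume (the bi_sector computed in bl_alloc_init_bio(), and likewise in map_block()). The standalone sketch below is not part of the patch; all names, the 4 KiB page size, and the example numbers are illustrative, only the arithmetic mirrors the code above.

/* Standalone illustration of the extent mapping arithmetic.  The names are
 * local to this sketch; sector fields are in 512-byte sectors, as in
 * struct pnfs_block_extent. */
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT	9				/* 512-byte sectors */
#define EX_PAGE_SIZE	4096ULL				/* assume 4 KiB pages */
#define PAGE_SECTORS	(EX_PAGE_SIZE >> SECTOR_SHIFT)	/* 8 sectors per page */

struct example_extent {
	uint64_t f_offset;	/* starting offset in the file, in sectors */
	uint64_t v_offset;	/* starting offset in the volume, in sectors */
	uint64_t length;	/* extent length, in sectors */
};

/* file byte offset -> sector index within the file (isect) */
static uint64_t offset_to_isect(uint64_t f_offset_bytes)
{
	return f_offset_bytes >> SECTOR_SHIFT;
}

/* isect -> sector on the block device, via the covering extent;
 * this is the bio->bi_sector computation in bl_alloc_init_bio(). */
static uint64_t isect_to_dev_sector(const struct example_extent *be,
				    uint64_t isect)
{
	return isect - be->f_offset + be->v_offset;
}

int main(void)
{
	/* Hypothetical extent: file sectors [2048, 4096) are stored at
	 * volume sectors [81920, 83968).  All numbers are made up. */
	struct example_extent be = {
		.f_offset = 2048, .v_offset = 81920, .length = 2048,
	};
	uint64_t isect = offset_to_isect(1ULL << 20);	/* 1 MiB into the file */

	printf("isect %llu -> device sector %llu\n",
	       (unsigned long long)isect,
	       (unsigned long long)isect_to_dev_sector(&be, isect));
	printf("next page starts at isect %llu\n",
	       (unsigned long long)(isect + PAGE_SECTORS));
	return 0;
}

With the made-up extent above, a 1 MiB file offset maps to isect 2048 and volume sector 81920; the read and write loops then step isect forward by PAGE_CACHE_SECTORS (8 here) for each page.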
+207
fs/nfs/blocklayout/blocklayout.h
··· 1 + /* 2 + * linux/fs/nfs/blocklayout/blocklayout.h 3 + * 4 + * Module for the NFSv4.1 pNFS block layout driver. 5 + * 6 + * Copyright (c) 2006 The Regents of the University of Michigan. 7 + * All rights reserved. 8 + * 9 + * Andy Adamson <andros@citi.umich.edu> 10 + * Fred Isaman <iisaman@umich.edu> 11 + * 12 + * permission is granted to use, copy, create derivative works and 13 + * redistribute this software and such derivative works for any purpose, 14 + * so long as the name of the university of michigan is not used in 15 + * any advertising or publicity pertaining to the use or distribution 16 + * of this software without specific, written prior authorization. if 17 + * the above copyright notice or any other identification of the 18 + * university of michigan is included in any copy of any portion of 19 + * this software, then the disclaimer below must also be included. 20 + * 21 + * this software is provided as is, without representation from the 22 + * university of michigan as to its fitness for any purpose, and without 23 + * warranty by the university of michigan of any kind, either express 24 + * or implied, including without limitation the implied warranties of 25 + * merchantability and fitness for a particular purpose. the regents 26 + * of the university of michigan shall not be liable for any damages, 27 + * including special, indirect, incidental, or consequential damages, 28 + * with respect to any claim arising out or in connection with the use 29 + * of the software, even if it has been or is hereafter advised of the 30 + * possibility of such damages. 31 + */ 32 + #ifndef FS_NFS_NFS4BLOCKLAYOUT_H 33 + #define FS_NFS_NFS4BLOCKLAYOUT_H 34 + 35 + #include <linux/device-mapper.h> 36 + #include <linux/nfs_fs.h> 37 + #include <linux/sunrpc/rpc_pipe_fs.h> 38 + 39 + #include "../pnfs.h" 40 + 41 + #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) 42 + #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 43 + 44 + struct block_mount_id { 45 + spinlock_t bm_lock; /* protects list */ 46 + struct list_head bm_devlist; /* holds pnfs_block_dev */ 47 + }; 48 + 49 + struct pnfs_block_dev { 50 + struct list_head bm_node; 51 + struct nfs4_deviceid bm_mdevid; /* associated devid */ 52 + struct block_device *bm_mdev; /* meta device itself */ 53 + }; 54 + 55 + enum exstate4 { 56 + PNFS_BLOCK_READWRITE_DATA = 0, 57 + PNFS_BLOCK_READ_DATA = 1, 58 + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59 + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 + }; 61 + 62 + #define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 63 + 64 + struct my_tree { 65 + sector_t mtt_step_size; /* Internal sector alignment */ 66 + struct list_head mtt_stub; /* Should be a radix tree */ 67 + }; 68 + 69 + struct pnfs_inval_markings { 70 + spinlock_t im_lock; 71 + struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 72 + sector_t im_block_size; /* Server blocksize in sectors */ 73 + }; 74 + 75 + struct pnfs_inval_tracking { 76 + struct list_head it_link; 77 + int it_sector; 78 + int it_tags; 79 + }; 80 + 81 + /* sector_t fields are all in 512-byte sectors */ 82 + struct pnfs_block_extent { 83 + struct kref be_refcnt; 84 + struct list_head be_node; /* link into lseg list */ 85 + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 86 + struct block_device *be_mdev; 87 + sector_t be_f_offset; /* the starting offset in the file */ 88 + sector_t be_length; /* the size of the extent */ 89 + sector_t be_v_offset; /* the starting 
offset in the volume */ 90 + enum exstate4 be_state; /* the state of this extent */ 91 + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 92 + }; 93 + 94 + /* Shortened extent used by LAYOUTCOMMIT */ 95 + struct pnfs_block_short_extent { 96 + struct list_head bse_node; 97 + struct nfs4_deviceid bse_devid; 98 + struct block_device *bse_mdev; 99 + sector_t bse_f_offset; /* the starting offset in the file */ 100 + sector_t bse_length; /* the size of the extent */ 101 + }; 102 + 103 + static inline void 104 + BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) 105 + { 106 + spin_lock_init(&marks->im_lock); 107 + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); 108 + marks->im_block_size = blocksize; 109 + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, 110 + blocksize); 111 + } 112 + 113 + enum extentclass4 { 114 + RW_EXTENT = 0, /* READWRTE and INVAL */ 115 + RO_EXTENT = 1, /* READ and NONE */ 116 + EXTENT_LISTS = 2, 117 + }; 118 + 119 + static inline int bl_choose_list(enum exstate4 state) 120 + { 121 + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) 122 + return RO_EXTENT; 123 + else 124 + return RW_EXTENT; 125 + } 126 + 127 + struct pnfs_block_layout { 128 + struct pnfs_layout_hdr bl_layout; 129 + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 130 + spinlock_t bl_ext_lock; /* Protects list manipulation */ 131 + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ 132 + struct list_head bl_commit; /* Needs layout commit */ 133 + struct list_head bl_committing; /* Layout committing */ 134 + unsigned int bl_count; /* entries in bl_commit */ 135 + sector_t bl_blocksize; /* Server blocksize in sectors */ 136 + }; 137 + 138 + #define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) 139 + 140 + static inline struct pnfs_block_layout * 141 + BLK_LO2EXT(struct pnfs_layout_hdr *lo) 142 + { 143 + return container_of(lo, struct pnfs_block_layout, bl_layout); 144 + } 145 + 146 + static inline struct pnfs_block_layout * 147 + BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) 148 + { 149 + return BLK_LO2EXT(lseg->pls_layout); 150 + } 151 + 152 + struct bl_dev_msg { 153 + int status; 154 + uint32_t major, minor; 155 + }; 156 + 157 + struct bl_msg_hdr { 158 + u8 type; 159 + u16 totallen; /* length of entire message, including hdr itself */ 160 + }; 161 + 162 + extern struct dentry *bl_device_pipe; 163 + extern wait_queue_head_t bl_wq; 164 + 165 + #define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ 166 + #define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ 167 + #define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ 168 + #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 169 + #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 170 + 171 + /* blocklayoutdev.c */ 172 + ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, 173 + char __user *, size_t); 174 + ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 175 + void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 176 + struct block_device *nfs4_blkdev_get(dev_t dev); 177 + int nfs4_blkdev_put(struct block_device *bdev); 178 + struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 179 + struct pnfs_device *dev); 180 + int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 181 + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 182 + 183 + /* blocklayoutdm.c */ 184 + void bl_free_block_dev(struct pnfs_block_dev *bdev); 185 + 186 + /* 
extents.c */ 187 + struct pnfs_block_extent * 188 + bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 189 + struct pnfs_block_extent **cow_read); 190 + int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 191 + sector_t offset, sector_t length, 192 + sector_t **pages); 193 + void bl_put_extent(struct pnfs_block_extent *be); 194 + struct pnfs_block_extent *bl_alloc_extent(void); 195 + int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 196 + int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 197 + struct xdr_stream *xdr, 198 + const struct nfs4_layoutcommit_args *arg); 199 + void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 200 + const struct nfs4_layoutcommit_args *arg, 201 + int status); 202 + int bl_add_merge_extent(struct pnfs_block_layout *bl, 203 + struct pnfs_block_extent *new); 204 + int bl_mark_for_commit(struct pnfs_block_extent *be, 205 + sector_t offset, sector_t length); 206 + 207 + #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
+410
fs/nfs/blocklayout/blocklayoutdev.c
··· 1 + /* 2 + * linux/fs/nfs/blocklayout/blocklayoutdev.c 3 + * 4 + * Device operations for the pnfs nfs4 file layout driver. 5 + * 6 + * Copyright (c) 2006 The Regents of the University of Michigan. 7 + * All rights reserved. 8 + * 9 + * Andy Adamson <andros@citi.umich.edu> 10 + * Fred Isaman <iisaman@umich.edu> 11 + * 12 + * permission is granted to use, copy, create derivative works and 13 + * redistribute this software and such derivative works for any purpose, 14 + * so long as the name of the university of michigan is not used in 15 + * any advertising or publicity pertaining to the use or distribution 16 + * of this software without specific, written prior authorization. if 17 + * the above copyright notice or any other identification of the 18 + * university of michigan is included in any copy of any portion of 19 + * this software, then the disclaimer below must also be included. 20 + * 21 + * this software is provided as is, without representation from the 22 + * university of michigan as to its fitness for any purpose, and without 23 + * warranty by the university of michigan of any kind, either express 24 + * or implied, including without limitation the implied warranties of 25 + * merchantability and fitness for a particular purpose. the regents 26 + * of the university of michigan shall not be liable for any damages, 27 + * including special, indirect, incidental, or consequential damages, 28 + * with respect to any claim arising out or in connection with the use 29 + * of the software, even if it has been or is hereafter advised of the 30 + * possibility of such damages. 31 + */ 32 + #include <linux/module.h> 33 + #include <linux/buffer_head.h> /* __bread */ 34 + 35 + #include <linux/genhd.h> 36 + #include <linux/blkdev.h> 37 + #include <linux/hash.h> 38 + 39 + #include "blocklayout.h" 40 + 41 + #define NFSDBG_FACILITY NFSDBG_PNFS_LD 42 + 43 + static int decode_sector_number(__be32 **rp, sector_t *sp) 44 + { 45 + uint64_t s; 46 + 47 + *rp = xdr_decode_hyper(*rp, &s); 48 + if (s & 0x1ff) { 49 + printk(KERN_WARNING "%s: sector not aligned\n", __func__); 50 + return -1; 51 + } 52 + *sp = s >> SECTOR_SHIFT; 53 + return 0; 54 + } 55 + 56 + /* Open a block_device by device number. */ 57 + struct block_device *nfs4_blkdev_get(dev_t dev) 58 + { 59 + struct block_device *bd; 60 + 61 + dprintk("%s enter\n", __func__); 62 + bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); 63 + if (IS_ERR(bd)) 64 + goto fail; 65 + return bd; 66 + fail: 67 + dprintk("%s failed to open device : %ld\n", 68 + __func__, PTR_ERR(bd)); 69 + return NULL; 70 + } 71 + 72 + /* 73 + * Release the block device 74 + */ 75 + int nfs4_blkdev_put(struct block_device *bdev) 76 + { 77 + dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), 78 + MINOR(bdev->bd_dev)); 79 + return blkdev_put(bdev, FMODE_READ); 80 + } 81 + 82 + /* 83 + * Shouldn't there be a rpc_generic_upcall() to do this for us? 
84 + */ 85 + ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, 86 + char __user *dst, size_t buflen) 87 + { 88 + char *data = (char *)msg->data + msg->copied; 89 + size_t mlen = min(msg->len - msg->copied, buflen); 90 + unsigned long left; 91 + 92 + left = copy_to_user(dst, data, mlen); 93 + if (left == mlen) { 94 + msg->errno = -EFAULT; 95 + return -EFAULT; 96 + } 97 + 98 + mlen -= left; 99 + msg->copied += mlen; 100 + msg->errno = 0; 101 + return mlen; 102 + } 103 + 104 + static struct bl_dev_msg bl_mount_reply; 105 + 106 + ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, 107 + size_t mlen) 108 + { 109 + if (mlen != sizeof (struct bl_dev_msg)) 110 + return -EINVAL; 111 + 112 + if (copy_from_user(&bl_mount_reply, src, mlen) != 0) 113 + return -EFAULT; 114 + 115 + wake_up(&bl_wq); 116 + 117 + return mlen; 118 + } 119 + 120 + void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) 121 + { 122 + if (msg->errno >= 0) 123 + return; 124 + wake_up(&bl_wq); 125 + } 126 + 127 + /* 128 + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. 129 + */ 130 + struct pnfs_block_dev * 131 + nfs4_blk_decode_device(struct nfs_server *server, 132 + struct pnfs_device *dev) 133 + { 134 + struct pnfs_block_dev *rv = NULL; 135 + struct block_device *bd = NULL; 136 + struct rpc_pipe_msg msg; 137 + struct bl_msg_hdr bl_msg = { 138 + .type = BL_DEVICE_MOUNT, 139 + .totallen = dev->mincount, 140 + }; 141 + uint8_t *dataptr; 142 + DECLARE_WAITQUEUE(wq, current); 143 + struct bl_dev_msg *reply = &bl_mount_reply; 144 + int offset, len, i; 145 + 146 + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); 147 + dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, 148 + dev->mincount); 149 + 150 + memset(&msg, 0, sizeof(msg)); 151 + msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); 152 + if (!msg.data) { 153 + rv = ERR_PTR(-ENOMEM); 154 + goto out; 155 + } 156 + 157 + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 158 + dataptr = (uint8_t *) msg.data; 159 + len = dev->mincount; 160 + offset = sizeof(bl_msg); 161 + for (i = 0; len > 0; i++) { 162 + memcpy(&dataptr[offset], page_address(dev->pages[i]), 163 + len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); 164 + len -= PAGE_CACHE_SIZE; 165 + offset += PAGE_CACHE_SIZE; 166 + } 167 + msg.len = sizeof(bl_msg) + dev->mincount; 168 + 169 + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); 170 + add_wait_queue(&bl_wq, &wq); 171 + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 172 + remove_wait_queue(&bl_wq, &wq); 173 + goto out; 174 + } 175 + 176 + set_current_state(TASK_UNINTERRUPTIBLE); 177 + schedule(); 178 + __set_current_state(TASK_RUNNING); 179 + remove_wait_queue(&bl_wq, &wq); 180 + 181 + if (reply->status != BL_DEVICE_REQUEST_PROC) { 182 + dprintk("%s failed to open device: %d\n", 183 + __func__, reply->status); 184 + rv = ERR_PTR(-EINVAL); 185 + goto out; 186 + } 187 + 188 + bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); 189 + if (IS_ERR(bd)) { 190 + dprintk("%s failed to open device : %ld\n", 191 + __func__, PTR_ERR(bd)); 192 + goto out; 193 + } 194 + 195 + rv = kzalloc(sizeof(*rv), GFP_NOFS); 196 + if (!rv) { 197 + rv = ERR_PTR(-ENOMEM); 198 + goto out; 199 + } 200 + 201 + rv->bm_mdev = bd; 202 + memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); 203 + dprintk("%s Created device %s with bd_block_size %u\n", 204 + __func__, 205 + bd->bd_disk->disk_name, 206 + bd->bd_block_size); 207 + 208 + out: 209 + kfree(msg.data); 210 + return rv; 211 + } 212 + 213 + /* Map deviceid returned by the server to constructed block_device */ 214 + static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, 215 + struct nfs4_deviceid *id) 216 + { 217 + struct block_device *rv = NULL; 218 + struct block_mount_id *mid; 219 + struct pnfs_block_dev *dev; 220 + 221 + dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); 222 + mid = BLK_ID(lo); 223 + spin_lock(&mid->bm_lock); 224 + list_for_each_entry(dev, &mid->bm_devlist, bm_node) { 225 + if (memcmp(id->data, dev->bm_mdevid.data, 226 + NFS4_DEVICEID4_SIZE) == 0) { 227 + rv = dev->bm_mdev; 228 + goto out; 229 + } 230 + } 231 + out: 232 + spin_unlock(&mid->bm_lock); 233 + dprintk("%s returning %p\n", __func__, rv); 234 + return rv; 235 + } 236 + 237 + /* Tracks info needed to ensure extents in layout obey constraints of spec */ 238 + struct layout_verification { 239 + u32 mode; /* R or RW */ 240 + u64 start; /* Expected start of next non-COW extent */ 241 + u64 inval; /* Start of INVAL coverage */ 242 + u64 cowread; /* End of COW read coverage */ 243 + }; 244 + 245 + /* Verify the extent meets the layout requirements of the pnfs-block draft, 246 + * section 2.3.1. 
247 + */ 248 + static int verify_extent(struct pnfs_block_extent *be, 249 + struct layout_verification *lv) 250 + { 251 + if (lv->mode == IOMODE_READ) { 252 + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || 253 + be->be_state == PNFS_BLOCK_INVALID_DATA) 254 + return -EIO; 255 + if (be->be_f_offset != lv->start) 256 + return -EIO; 257 + lv->start += be->be_length; 258 + return 0; 259 + } 260 + /* lv->mode == IOMODE_RW */ 261 + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { 262 + if (be->be_f_offset != lv->start) 263 + return -EIO; 264 + if (lv->cowread > lv->start) 265 + return -EIO; 266 + lv->start += be->be_length; 267 + lv->inval = lv->start; 268 + return 0; 269 + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 270 + if (be->be_f_offset != lv->start) 271 + return -EIO; 272 + lv->start += be->be_length; 273 + return 0; 274 + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { 275 + if (be->be_f_offset > lv->start) 276 + return -EIO; 277 + if (be->be_f_offset < lv->inval) 278 + return -EIO; 279 + if (be->be_f_offset < lv->cowread) 280 + return -EIO; 281 + /* It looks like you might want to min this with lv->start, 282 + * but you really don't. 283 + */ 284 + lv->inval = lv->inval + be->be_length; 285 + lv->cowread = be->be_f_offset + be->be_length; 286 + return 0; 287 + } else 288 + return -EIO; 289 + } 290 + 291 + /* XDR decode pnfs_block_layout4 structure */ 292 + int 293 + nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 294 + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) 295 + { 296 + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 297 + int i, status = -EIO; 298 + uint32_t count; 299 + struct pnfs_block_extent *be = NULL, *save; 300 + struct xdr_stream stream; 301 + struct xdr_buf buf; 302 + struct page *scratch; 303 + __be32 *p; 304 + struct layout_verification lv = { 305 + .mode = lgr->range.iomode, 306 + .start = lgr->range.offset >> SECTOR_SHIFT, 307 + .inval = lgr->range.offset >> SECTOR_SHIFT, 308 + .cowread = lgr->range.offset >> SECTOR_SHIFT, 309 + }; 310 + LIST_HEAD(extents); 311 + 312 + dprintk("---> %s\n", __func__); 313 + 314 + scratch = alloc_page(gfp_flags); 315 + if (!scratch) 316 + return -ENOMEM; 317 + 318 + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); 319 + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 320 + 321 + p = xdr_inline_decode(&stream, 4); 322 + if (unlikely(!p)) 323 + goto out_err; 324 + 325 + count = be32_to_cpup(p++); 326 + 327 + dprintk("%s enter, number of extents %i\n", __func__, count); 328 + p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); 329 + if (unlikely(!p)) 330 + goto out_err; 331 + 332 + /* Decode individual extents, putting them in temporary 333 + * staging area until whole layout is decoded to make error 334 + * recovery easier. 
335 + */ 336 + for (i = 0; i < count; i++) { 337 + be = bl_alloc_extent(); 338 + if (!be) { 339 + status = -ENOMEM; 340 + goto out_err; 341 + } 342 + memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); 343 + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 344 + be->be_mdev = translate_devid(lo, &be->be_devid); 345 + if (!be->be_mdev) 346 + goto out_err; 347 + 348 + /* The next three values are read in as bytes, 349 + * but stored as 512-byte sector lengths 350 + */ 351 + if (decode_sector_number(&p, &be->be_f_offset) < 0) 352 + goto out_err; 353 + if (decode_sector_number(&p, &be->be_length) < 0) 354 + goto out_err; 355 + if (decode_sector_number(&p, &be->be_v_offset) < 0) 356 + goto out_err; 357 + be->be_state = be32_to_cpup(p++); 358 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) 359 + be->be_inval = &bl->bl_inval; 360 + if (verify_extent(be, &lv)) { 361 + dprintk("%s verify failed\n", __func__); 362 + goto out_err; 363 + } 364 + list_add_tail(&be->be_node, &extents); 365 + } 366 + if (lgr->range.offset + lgr->range.length != 367 + lv.start << SECTOR_SHIFT) { 368 + dprintk("%s Final length mismatch\n", __func__); 369 + be = NULL; 370 + goto out_err; 371 + } 372 + if (lv.start < lv.cowread) { 373 + dprintk("%s Final uncovered COW extent\n", __func__); 374 + be = NULL; 375 + goto out_err; 376 + } 377 + /* Extents decoded properly, now try to merge them in to 378 + * existing layout extents. 379 + */ 380 + spin_lock(&bl->bl_ext_lock); 381 + list_for_each_entry_safe(be, save, &extents, be_node) { 382 + list_del(&be->be_node); 383 + status = bl_add_merge_extent(bl, be); 384 + if (status) { 385 + spin_unlock(&bl->bl_ext_lock); 386 + /* This is a fairly catastrophic error, as the 387 + * entire layout extent lists are now corrupted. 388 + * We should have some way to distinguish this. 389 + */ 390 + be = NULL; 391 + goto out_err; 392 + } 393 + } 394 + spin_unlock(&bl->bl_ext_lock); 395 + status = 0; 396 + out: 397 + __free_page(scratch); 398 + dprintk("%s returns %i\n", __func__, status); 399 + return status; 400 + 401 + out_err: 402 + bl_put_extent(be); 403 + while (!list_empty(&extents)) { 404 + be = list_first_entry(&extents, struct pnfs_block_extent, 405 + be_node); 406 + list_del(&be->be_node); 407 + bl_put_extent(be); 408 + } 409 + goto out; 410 + }
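
The MOUNT/UMOUNT upcalls above only work if something in userspace is listening on the "blocklayout" rpc_pipefs pipe and answering with a struct bl_dev_msg. A minimal sketch of such a helper follows; it is illustrative only: the pipe path assumes rpc_pipefs is mounted in its usual place, the struct layouts simply mirror bl_msg_hdr/bl_dev_msg with the same padding as the kernel build, the device numbers are made up, and a real helper must actually decode the pnfs_block_deviceaddr4 XDR and assemble the volume (for example with device-mapper) before replying.

/* Illustrative-only userspace helper for the "blocklayout" rpc_pipefs pipe.
 * The path and the stubbed-out decode/assemble step are assumptions. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define BL_DEVICE_UMOUNT	0x0
#define BL_DEVICE_MOUNT		0x1
#define BL_DEVICE_REQUEST_PROC	0x1
#define BL_DEVICE_REQUEST_ERR	0x2

struct bl_msg_hdr {
	uint8_t  type;
	uint16_t totallen;	/* length of the payload that follows */
};

struct bl_dev_msg {
	int32_t  status;
	uint32_t major, minor;
};

int main(void)
{
	/* Assumed rpc_pipefs mount point; adjust for the local system. */
	const char *pipe_path = "/var/lib/nfs/rpc_pipefs/nfs/blocklayout";
	int fd = open(pipe_path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (;;) {
		struct bl_msg_hdr hdr;
		struct bl_dev_msg reply = { .status = BL_DEVICE_REQUEST_ERR };
		char payload[4096];

		if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
			break;
		if (hdr.totallen > sizeof(payload) ||
		    read(fd, payload, hdr.totallen) != hdr.totallen)
			break;

		if (hdr.type == BL_DEVICE_MOUNT) {
			/* A real helper decodes the pnfs_block_deviceaddr4 in
			 * payload[], assembles the volume, and reports the
			 * resulting device number. */
			reply.status = BL_DEVICE_REQUEST_PROC;
			reply.major = 253;	/* made-up dm device */
			reply.minor = 0;
		} else if (hdr.type == BL_DEVICE_UMOUNT) {
			/* payload names the device to tear down; only the
			 * status matters in the reply. */
			reply.status = BL_DEVICE_REQUEST_PROC;
		}

		/* bl_pipe_downcall() accepts exactly sizeof(struct bl_dev_msg). */
		if (write(fd, &reply, sizeof(reply)) != sizeof(reply))
			break;
	}
	close(fd);
	return 0;
}

Note that bl_pipe_downcall() above rejects any reply that is not exactly sizeof(struct bl_dev_msg), and nfs4_blk_decode_device() only proceeds when the returned status is BL_DEVICE_REQUEST_PROC.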
+111
fs/nfs/blocklayout/blocklayoutdm.c
··· 1 + /* 2 + * linux/fs/nfs/blocklayout/blocklayoutdm.c 3 + * 4 + * Module for the NFSv4.1 pNFS block layout driver. 5 + * 6 + * Copyright (c) 2007 The Regents of the University of Michigan. 7 + * All rights reserved. 8 + * 9 + * Fred Isaman <iisaman@umich.edu> 10 + * Andy Adamson <andros@citi.umich.edu> 11 + * 12 + * permission is granted to use, copy, create derivative works and 13 + * redistribute this software and such derivative works for any purpose, 14 + * so long as the name of the university of michigan is not used in 15 + * any advertising or publicity pertaining to the use or distribution 16 + * of this software without specific, written prior authorization. if 17 + * the above copyright notice or any other identification of the 18 + * university of michigan is included in any copy of any portion of 19 + * this software, then the disclaimer below must also be included. 20 + * 21 + * this software is provided as is, without representation from the 22 + * university of michigan as to its fitness for any purpose, and without 23 + * warranty by the university of michigan of any kind, either express 24 + * or implied, including without limitation the implied warranties of 25 + * merchantability and fitness for a particular purpose. the regents 26 + * of the university of michigan shall not be liable for any damages, 27 + * including special, indirect, incidental, or consequential damages, 28 + * with respect to any claim arising out or in connection with the use 29 + * of the software, even if it has been or is hereafter advised of the 30 + * possibility of such damages. 31 + */ 32 + 33 + #include <linux/genhd.h> /* gendisk - used in a dprintk*/ 34 + #include <linux/sched.h> 35 + #include <linux/hash.h> 36 + 37 + #include "blocklayout.h" 38 + 39 + #define NFSDBG_FACILITY NFSDBG_PNFS_LD 40 + 41 + static void dev_remove(dev_t dev) 42 + { 43 + struct rpc_pipe_msg msg; 44 + struct bl_dev_msg bl_umount_request; 45 + struct bl_msg_hdr bl_msg = { 46 + .type = BL_DEVICE_UMOUNT, 47 + .totallen = sizeof(bl_umount_request), 48 + }; 49 + uint8_t *dataptr; 50 + DECLARE_WAITQUEUE(wq, current); 51 + 52 + dprintk("Entering %s\n", __func__); 53 + 54 + memset(&msg, 0, sizeof(msg)); 55 + msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); 56 + if (!msg.data) 57 + goto out; 58 + 59 + memset(&bl_umount_request, 0, sizeof(bl_umount_request)); 60 + bl_umount_request.major = MAJOR(dev); 61 + bl_umount_request.minor = MINOR(dev); 62 + 63 + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 64 + dataptr = (uint8_t *) msg.data; 65 + memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); 66 + msg.len = sizeof(bl_msg) + bl_msg.totallen; 67 + 68 + add_wait_queue(&bl_wq, &wq); 69 + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 70 + remove_wait_queue(&bl_wq, &wq); 71 + goto out; 72 + } 73 + 74 + set_current_state(TASK_UNINTERRUPTIBLE); 75 + schedule(); 76 + __set_current_state(TASK_RUNNING); 77 + remove_wait_queue(&bl_wq, &wq); 78 + 79 + out: 80 + kfree(msg.data); 81 + } 82 + 83 + /* 84 + * Release meta device 85 + */ 86 + static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) 87 + { 88 + int rv; 89 + 90 + dprintk("%s Releasing\n", __func__); 91 + rv = nfs4_blkdev_put(bdev->bm_mdev); 92 + if (rv) 93 + printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", 94 + __func__, rv); 95 + 96 + dev_remove(bdev->bm_mdev->bd_dev); 97 + } 98 + 99 + void bl_free_block_dev(struct pnfs_block_dev *bdev) 100 + { 101 + if (bdev) { 102 + if (bdev->bm_mdev) { 103 + dprintk("%s Removing 
DM device: %d:%d\n", 104 + __func__, 105 + MAJOR(bdev->bm_mdev->bd_dev), 106 + MINOR(bdev->bm_mdev->bd_dev)); 107 + nfs4_blk_metadev_release(bdev); 108 + } 109 + kfree(bdev); 110 + } 111 + }
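dev_remove() above builds a single rpc_pipefs upcall: a bl_msg_hdr announcing BL_DEVICE_UMOUNT and the payload length, immediately followed by a bl_dev_msg carrying the meta device's major:minor, with msg.len = sizeof(bl_msg) + bl_msg.totallen. A minimal sketch of that packing follows; the two struct definitions are stand-ins for the real ones in blocklayout.h (not shown in this hunk), so their field widths and the BL_DEVICE_UMOUNT value are assumptions.

#include <stdint.h>
#include <string.h>

struct bl_msg_hdr { uint8_t type; uint16_t totallen; };   /* assumed layout */
struct bl_dev_msg { uint32_t major, minor; };              /* assumed layout */
#define BL_DEVICE_UMOUNT 0                                  /* assumed value */

/* Pack header + payload the same way dev_remove() lays out msg.data;
 * the return value corresponds to msg.len. */
static size_t pack_umount_msg(uint8_t *buf, uint32_t major, uint32_t minor)
{
	struct bl_msg_hdr hdr = {
		.type = BL_DEVICE_UMOUNT,
		.totallen = sizeof(struct bl_dev_msg),
	};
	struct bl_dev_msg body = { .major = major, .minor = minor };

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), &body, sizeof(body));
	return sizeof(hdr) + hdr.totallen;
}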
+935
fs/nfs/blocklayout/extents.c
··· 1 + /* 2 + * linux/fs/nfs/blocklayout/blocklayout.h 3 + * 4 + * Module for the NFSv4.1 pNFS block layout driver. 5 + * 6 + * Copyright (c) 2006 The Regents of the University of Michigan. 7 + * All rights reserved. 8 + * 9 + * Andy Adamson <andros@citi.umich.edu> 10 + * Fred Isaman <iisaman@umich.edu> 11 + * 12 + * permission is granted to use, copy, create derivative works and 13 + * redistribute this software and such derivative works for any purpose, 14 + * so long as the name of the university of michigan is not used in 15 + * any advertising or publicity pertaining to the use or distribution 16 + * of this software without specific, written prior authorization. if 17 + * the above copyright notice or any other identification of the 18 + * university of michigan is included in any copy of any portion of 19 + * this software, then the disclaimer below must also be included. 20 + * 21 + * this software is provided as is, without representation from the 22 + * university of michigan as to its fitness for any purpose, and without 23 + * warranty by the university of michigan of any kind, either express 24 + * or implied, including without limitation the implied warranties of 25 + * merchantability and fitness for a particular purpose. the regents 26 + * of the university of michigan shall not be liable for any damages, 27 + * including special, indirect, incidental, or consequential damages, 28 + * with respect to any claim arising out or in connection with the use 29 + * of the software, even if it has been or is hereafter advised of the 30 + * possibility of such damages. 31 + */ 32 + 33 + #include "blocklayout.h" 34 + #define NFSDBG_FACILITY NFSDBG_PNFS_LD 35 + 36 + /* Bit numbers */ 37 + #define EXTENT_INITIALIZED 0 38 + #define EXTENT_WRITTEN 1 39 + #define EXTENT_IN_COMMIT 2 40 + #define INTERNAL_EXISTS MY_MAX_TAGS 41 + #define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) 42 + 43 + /* Returns largest t<=s s.t. t%base==0 */ 44 + static inline sector_t normalize(sector_t s, int base) 45 + { 46 + sector_t tmp = s; /* Since do_div modifies its argument */ 47 + return s - do_div(tmp, base); 48 + } 49 + 50 + static inline sector_t normalize_up(sector_t s, int base) 51 + { 52 + return normalize(s + base - 1, base); 53 + } 54 + 55 + /* Complete stub using list while determine API wanted */ 56 + 57 + /* Returns tags, or negative */ 58 + static int32_t _find_entry(struct my_tree *tree, u64 s) 59 + { 60 + struct pnfs_inval_tracking *pos; 61 + 62 + dprintk("%s(%llu) enter\n", __func__, s); 63 + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { 64 + if (pos->it_sector > s) 65 + continue; 66 + else if (pos->it_sector == s) 67 + return pos->it_tags & INTERNAL_MASK; 68 + else 69 + break; 70 + } 71 + return -ENOENT; 72 + } 73 + 74 + static inline 75 + int _has_tag(struct my_tree *tree, u64 s, int32_t tag) 76 + { 77 + int32_t tags; 78 + 79 + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); 80 + s = normalize(s, tree->mtt_step_size); 81 + tags = _find_entry(tree, s); 82 + if ((tags < 0) || !(tags & (1 << tag))) 83 + return 0; 84 + else 85 + return 1; 86 + } 87 + 88 + /* Creates entry with tag, or if entry already exists, unions tag to it. 89 + * If storage is not NULL, newly created entry will use it. 90 + * Returns number of entries added, or negative on error. 
91 + */ 92 + static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, 93 + struct pnfs_inval_tracking *storage) 94 + { 95 + int found = 0; 96 + struct pnfs_inval_tracking *pos; 97 + 98 + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); 99 + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { 100 + if (pos->it_sector > s) 101 + continue; 102 + else if (pos->it_sector == s) { 103 + found = 1; 104 + break; 105 + } else 106 + break; 107 + } 108 + if (found) { 109 + pos->it_tags |= (1 << tag); 110 + return 0; 111 + } else { 112 + struct pnfs_inval_tracking *new; 113 + if (storage) 114 + new = storage; 115 + else { 116 + new = kmalloc(sizeof(*new), GFP_NOFS); 117 + if (!new) 118 + return -ENOMEM; 119 + } 120 + new->it_sector = s; 121 + new->it_tags = (1 << tag); 122 + list_add(&new->it_link, &pos->it_link); 123 + return 1; 124 + } 125 + } 126 + 127 + /* XXXX Really want option to not create */ 128 + /* Over range, unions tag with existing entries, else creates entry with tag */ 129 + static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) 130 + { 131 + u64 i; 132 + 133 + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); 134 + for (i = normalize(s, tree->mtt_step_size); i < s + length; 135 + i += tree->mtt_step_size) 136 + if (_add_entry(tree, i, tag, NULL)) 137 + return -ENOMEM; 138 + return 0; 139 + } 140 + 141 + /* Ensure that future operations on given range of tree will not malloc */ 142 + static int _preload_range(struct my_tree *tree, u64 offset, u64 length) 143 + { 144 + u64 start, end, s; 145 + int count, i, used = 0, status = -ENOMEM; 146 + struct pnfs_inval_tracking **storage; 147 + 148 + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); 149 + start = normalize(offset, tree->mtt_step_size); 150 + end = normalize_up(offset + length, tree->mtt_step_size); 151 + count = (int)(end - start) / (int)tree->mtt_step_size; 152 + 153 + /* Pre-malloc what memory we might need */ 154 + storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); 155 + if (!storage) 156 + return -ENOMEM; 157 + for (i = 0; i < count; i++) { 158 + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), 159 + GFP_NOFS); 160 + if (!storage[i]) 161 + goto out_cleanup; 162 + } 163 + 164 + /* Now need lock - HOW??? */ 165 + 166 + for (s = start; s < end; s += tree->mtt_step_size) 167 + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); 168 + 169 + /* Unlock - HOW??? 
*/ 170 + status = 0; 171 + 172 + out_cleanup: 173 + for (i = used; i < count; i++) { 174 + if (!storage[i]) 175 + break; 176 + kfree(storage[i]); 177 + } 178 + kfree(storage); 179 + return status; 180 + } 181 + 182 + static void set_needs_init(sector_t *array, sector_t offset) 183 + { 184 + sector_t *p = array; 185 + 186 + dprintk("%s enter\n", __func__); 187 + if (!p) 188 + return; 189 + while (*p < offset) 190 + p++; 191 + if (*p == offset) 192 + return; 193 + else if (*p == ~0) { 194 + *p++ = offset; 195 + *p = ~0; 196 + return; 197 + } else { 198 + sector_t *save = p; 199 + dprintk("%s Adding %llu\n", __func__, (u64)offset); 200 + while (*p != ~0) 201 + p++; 202 + p++; 203 + memmove(save + 1, save, (char *)p - (char *)save); 204 + *save = offset; 205 + return; 206 + } 207 + } 208 + 209 + /* We are relying on page lock to serialize this */ 210 + int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) 211 + { 212 + int rv; 213 + 214 + spin_lock(&marks->im_lock); 215 + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); 216 + spin_unlock(&marks->im_lock); 217 + return rv; 218 + } 219 + 220 + /* Assume start, end already sector aligned */ 221 + static int 222 + _range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) 223 + { 224 + struct pnfs_inval_tracking *pos; 225 + u64 expect = 0; 226 + 227 + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); 228 + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { 229 + if (pos->it_sector >= end) 230 + continue; 231 + if (!expect) { 232 + if ((pos->it_sector == end - tree->mtt_step_size) && 233 + (pos->it_tags & (1 << tag))) { 234 + expect = pos->it_sector - tree->mtt_step_size; 235 + if (pos->it_sector < tree->mtt_step_size || expect < start) 236 + return 1; 237 + continue; 238 + } else { 239 + return 0; 240 + } 241 + } 242 + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) 243 + return 0; 244 + expect -= tree->mtt_step_size; 245 + if (expect < start) 246 + return 1; 247 + } 248 + return 0; 249 + } 250 + 251 + static int is_range_written(struct pnfs_inval_markings *marks, 252 + sector_t start, sector_t end) 253 + { 254 + int rv; 255 + 256 + spin_lock(&marks->im_lock); 257 + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); 258 + spin_unlock(&marks->im_lock); 259 + return rv; 260 + } 261 + 262 + /* Marks sectors in [offest, offset_length) as having been initialized. 263 + * All lengths are step-aligned, where step is min(pagesize, blocksize). 264 + * Notes where partial block is initialized, and helps prepare it for 265 + * complete initialization later. 
266 + */ 267 + /* Currently assumes offset is page-aligned */ 268 + int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 269 + sector_t offset, sector_t length, 270 + sector_t **pages) 271 + { 272 + sector_t s, start, end; 273 + sector_t *array = NULL; /* Pages to mark */ 274 + 275 + dprintk("%s(offset=%llu,len=%llu) enter\n", 276 + __func__, (u64)offset, (u64)length); 277 + s = max((sector_t) 3, 278 + 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); 279 + dprintk("%s set max=%llu\n", __func__, (u64)s); 280 + if (pages) { 281 + array = kmalloc(s * sizeof(sector_t), GFP_NOFS); 282 + if (!array) 283 + goto outerr; 284 + array[0] = ~0; 285 + } 286 + 287 + start = normalize(offset, marks->im_block_size); 288 + end = normalize_up(offset + length, marks->im_block_size); 289 + if (_preload_range(&marks->im_tree, start, end - start)) 290 + goto outerr; 291 + 292 + spin_lock(&marks->im_lock); 293 + 294 + for (s = normalize_up(start, PAGE_CACHE_SECTORS); 295 + s < offset; s += PAGE_CACHE_SECTORS) { 296 + dprintk("%s pre-area pages\n", __func__); 297 + /* Portion of used block is not initialized */ 298 + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) 299 + set_needs_init(array, s); 300 + } 301 + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) 302 + goto out_unlock; 303 + for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); 304 + s < end; s += PAGE_CACHE_SECTORS) { 305 + dprintk("%s post-area pages\n", __func__); 306 + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) 307 + set_needs_init(array, s); 308 + } 309 + 310 + spin_unlock(&marks->im_lock); 311 + 312 + if (pages) { 313 + if (array[0] == ~0) { 314 + kfree(array); 315 + *pages = NULL; 316 + } else 317 + *pages = array; 318 + } 319 + return 0; 320 + 321 + out_unlock: 322 + spin_unlock(&marks->im_lock); 323 + outerr: 324 + if (pages) { 325 + kfree(array); 326 + *pages = NULL; 327 + } 328 + return -ENOMEM; 329 + } 330 + 331 + /* Marks sectors in [offest, offset+length) as having been written to disk. 332 + * All lengths should be block aligned. 333 + */ 334 + static int mark_written_sectors(struct pnfs_inval_markings *marks, 335 + sector_t offset, sector_t length) 336 + { 337 + int status; 338 + 339 + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, 340 + (u64)offset, (u64)length); 341 + spin_lock(&marks->im_lock); 342 + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); 343 + spin_unlock(&marks->im_lock); 344 + return status; 345 + } 346 + 347 + static void print_short_extent(struct pnfs_block_short_extent *be) 348 + { 349 + dprintk("PRINT SHORT EXTENT extent %p\n", be); 350 + if (be) { 351 + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); 352 + dprintk(" be_length %llu\n", (u64)be->bse_length); 353 + } 354 + } 355 + 356 + static void print_clist(struct list_head *list, unsigned int count) 357 + { 358 + struct pnfs_block_short_extent *be; 359 + unsigned int i = 0; 360 + 361 + ifdebug(FACILITY) { 362 + printk(KERN_DEBUG "****************\n"); 363 + printk(KERN_DEBUG "Extent list looks like:\n"); 364 + list_for_each_entry(be, list, bse_node) { 365 + i++; 366 + print_short_extent(be); 367 + } 368 + if (i != count) 369 + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); 370 + printk(KERN_DEBUG "****************\n"); 371 + } 372 + } 373 + 374 + /* Note: In theory, we should do more checking that devid's match between 375 + * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
376 + */ 377 + /* Note this is very similar to bl_add_merge_extent */ 378 + static void add_to_commitlist(struct pnfs_block_layout *bl, 379 + struct pnfs_block_short_extent *new) 380 + { 381 + struct list_head *clist = &bl->bl_commit; 382 + struct pnfs_block_short_extent *old, *save; 383 + sector_t end = new->bse_f_offset + new->bse_length; 384 + 385 + dprintk("%s enter\n", __func__); 386 + print_short_extent(new); 387 + print_clist(clist, bl->bl_count); 388 + bl->bl_count++; 389 + /* Scan for proper place to insert, extending new to the left 390 + * as much as possible. 391 + */ 392 + list_for_each_entry_safe(old, save, clist, bse_node) { 393 + if (new->bse_f_offset < old->bse_f_offset) 394 + break; 395 + if (end <= old->bse_f_offset + old->bse_length) { 396 + /* Range is already in list */ 397 + bl->bl_count--; 398 + kfree(new); 399 + return; 400 + } else if (new->bse_f_offset <= 401 + old->bse_f_offset + old->bse_length) { 402 + /* new overlaps or abuts existing be */ 403 + if (new->bse_mdev == old->bse_mdev) { 404 + /* extend new to fully replace old */ 405 + new->bse_length += new->bse_f_offset - 406 + old->bse_f_offset; 407 + new->bse_f_offset = old->bse_f_offset; 408 + list_del(&old->bse_node); 409 + bl->bl_count--; 410 + kfree(old); 411 + } 412 + } 413 + } 414 + /* Note that if we never hit the above break, old will not point to a 415 + * valid extent. However, in that case &old->bse_node==list. 416 + */ 417 + list_add_tail(&new->bse_node, &old->bse_node); 418 + /* Scan forward for overlaps. If we find any, extend new and 419 + * remove the overlapped extent. 420 + */ 421 + old = list_prepare_entry(new, clist, bse_node); 422 + list_for_each_entry_safe_continue(old, save, clist, bse_node) { 423 + if (end < old->bse_f_offset) 424 + break; 425 + /* new overlaps or abuts old */ 426 + if (new->bse_mdev == old->bse_mdev) { 427 + if (end < old->bse_f_offset + old->bse_length) { 428 + /* extend new to fully cover old */ 429 + end = old->bse_f_offset + old->bse_length; 430 + new->bse_length = end - new->bse_f_offset; 431 + } 432 + list_del(&old->bse_node); 433 + bl->bl_count--; 434 + kfree(old); 435 + } 436 + } 437 + dprintk("%s: after merging\n", __func__); 438 + print_clist(clist, bl->bl_count); 439 + } 440 + 441 + /* Note the range described by offset, length is guaranteed to be contained 442 + * within be. 443 + */ 444 + int bl_mark_for_commit(struct pnfs_block_extent *be, 445 + sector_t offset, sector_t length) 446 + { 447 + sector_t new_end, end = offset + length; 448 + struct pnfs_block_short_extent *new; 449 + struct pnfs_block_layout *bl = container_of(be->be_inval, 450 + struct pnfs_block_layout, 451 + bl_inval); 452 + 453 + new = kmalloc(sizeof(*new), GFP_NOFS); 454 + if (!new) 455 + return -ENOMEM; 456 + 457 + mark_written_sectors(be->be_inval, offset, length); 458 + /* We want to add the range to commit list, but it must be 459 + * block-normalized, and verified that the normalized range has 460 + * been entirely written to disk. 
461 + */ 462 + new->bse_f_offset = offset; 463 + offset = normalize(offset, bl->bl_blocksize); 464 + if (offset < new->bse_f_offset) { 465 + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) 466 + new->bse_f_offset = offset; 467 + else 468 + new->bse_f_offset = offset + bl->bl_blocksize; 469 + } 470 + new_end = normalize_up(end, bl->bl_blocksize); 471 + if (end < new_end) { 472 + if (is_range_written(be->be_inval, end, new_end)) 473 + end = new_end; 474 + else 475 + end = new_end - bl->bl_blocksize; 476 + } 477 + if (end <= new->bse_f_offset) { 478 + kfree(new); 479 + return 0; 480 + } 481 + new->bse_length = end - new->bse_f_offset; 482 + new->bse_devid = be->be_devid; 483 + new->bse_mdev = be->be_mdev; 484 + 485 + spin_lock(&bl->bl_ext_lock); 486 + /* new will be freed, either by add_to_commitlist if it decides not 487 + * to use it, or after LAYOUTCOMMIT uses it in the commitlist. 488 + */ 489 + add_to_commitlist(bl, new); 490 + spin_unlock(&bl->bl_ext_lock); 491 + return 0; 492 + } 493 + 494 + static void print_bl_extent(struct pnfs_block_extent *be) 495 + { 496 + dprintk("PRINT EXTENT extent %p\n", be); 497 + if (be) { 498 + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); 499 + dprintk(" be_length %llu\n", (u64)be->be_length); 500 + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); 501 + dprintk(" be_state %d\n", be->be_state); 502 + } 503 + } 504 + 505 + static void 506 + destroy_extent(struct kref *kref) 507 + { 508 + struct pnfs_block_extent *be; 509 + 510 + be = container_of(kref, struct pnfs_block_extent, be_refcnt); 511 + dprintk("%s be=%p\n", __func__, be); 512 + kfree(be); 513 + } 514 + 515 + void 516 + bl_put_extent(struct pnfs_block_extent *be) 517 + { 518 + if (be) { 519 + dprintk("%s enter %p (%i)\n", __func__, be, 520 + atomic_read(&be->be_refcnt.refcount)); 521 + kref_put(&be->be_refcnt, destroy_extent); 522 + } 523 + } 524 + 525 + struct pnfs_block_extent *bl_alloc_extent(void) 526 + { 527 + struct pnfs_block_extent *be; 528 + 529 + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); 530 + if (!be) 531 + return NULL; 532 + INIT_LIST_HEAD(&be->be_node); 533 + kref_init(&be->be_refcnt); 534 + be->be_inval = NULL; 535 + return be; 536 + } 537 + 538 + static void print_elist(struct list_head *list) 539 + { 540 + struct pnfs_block_extent *be; 541 + dprintk("****************\n"); 542 + dprintk("Extent list looks like:\n"); 543 + list_for_each_entry(be, list, be_node) { 544 + print_bl_extent(be); 545 + } 546 + dprintk("****************\n"); 547 + } 548 + 549 + static inline int 550 + extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) 551 + { 552 + /* Note this assumes new->be_f_offset >= old->be_f_offset */ 553 + return (new->be_state == old->be_state) && 554 + ((new->be_state == PNFS_BLOCK_NONE_DATA) || 555 + ((new->be_v_offset - old->be_v_offset == 556 + new->be_f_offset - old->be_f_offset) && 557 + new->be_mdev == old->be_mdev)); 558 + } 559 + 560 + /* Adds new to appropriate list in bl, modifying new and removing existing 561 + * extents as appropriate to deal with overlaps. 562 + * 563 + * See bl_find_get_extent for list constraints. 564 + * 565 + * Refcount on new is already set. If end up not using it, or error out, 566 + * need to put the reference. 567 + * 568 + * bl->bl_ext_lock is held by caller. 
569 + */ 570 + int 571 + bl_add_merge_extent(struct pnfs_block_layout *bl, 572 + struct pnfs_block_extent *new) 573 + { 574 + struct pnfs_block_extent *be, *tmp; 575 + sector_t end = new->be_f_offset + new->be_length; 576 + struct list_head *list; 577 + 578 + dprintk("%s enter with be=%p\n", __func__, new); 579 + print_bl_extent(new); 580 + list = &bl->bl_extents[bl_choose_list(new->be_state)]; 581 + print_elist(list); 582 + 583 + /* Scan for proper place to insert, extending new to the left 584 + * as much as possible. 585 + */ 586 + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { 587 + if (new->be_f_offset >= be->be_f_offset + be->be_length) 588 + break; 589 + if (new->be_f_offset >= be->be_f_offset) { 590 + if (end <= be->be_f_offset + be->be_length) { 591 + /* new is a subset of existing be*/ 592 + if (extents_consistent(be, new)) { 593 + dprintk("%s: new is subset, ignoring\n", 594 + __func__); 595 + bl_put_extent(new); 596 + return 0; 597 + } else { 598 + goto out_err; 599 + } 600 + } else { 601 + /* |<-- be -->| 602 + * |<-- new -->| */ 603 + if (extents_consistent(be, new)) { 604 + /* extend new to fully replace be */ 605 + new->be_length += new->be_f_offset - 606 + be->be_f_offset; 607 + new->be_f_offset = be->be_f_offset; 608 + new->be_v_offset = be->be_v_offset; 609 + dprintk("%s: removing %p\n", __func__, be); 610 + list_del(&be->be_node); 611 + bl_put_extent(be); 612 + } else { 613 + goto out_err; 614 + } 615 + } 616 + } else if (end >= be->be_f_offset + be->be_length) { 617 + /* new extent overlap existing be */ 618 + if (extents_consistent(be, new)) { 619 + /* extend new to fully replace be */ 620 + dprintk("%s: removing %p\n", __func__, be); 621 + list_del(&be->be_node); 622 + bl_put_extent(be); 623 + } else { 624 + goto out_err; 625 + } 626 + } else if (end > be->be_f_offset) { 627 + /* |<-- be -->| 628 + *|<-- new -->| */ 629 + if (extents_consistent(new, be)) { 630 + /* extend new to fully replace be */ 631 + new->be_length += be->be_f_offset + be->be_length - 632 + new->be_f_offset - new->be_length; 633 + dprintk("%s: removing %p\n", __func__, be); 634 + list_del(&be->be_node); 635 + bl_put_extent(be); 636 + } else { 637 + goto out_err; 638 + } 639 + } 640 + } 641 + /* Note that if we never hit the above break, be will not point to a 642 + * valid extent. However, in that case &be->be_node==list. 643 + */ 644 + list_add(&new->be_node, &be->be_node); 645 + dprintk("%s: inserting new\n", __func__); 646 + print_elist(list); 647 + /* FIXME - The per-list consistency checks have all been done, 648 + * should now check cross-list consistency. 649 + */ 650 + return 0; 651 + 652 + out_err: 653 + bl_put_extent(new); 654 + return -EIO; 655 + } 656 + 657 + /* Returns extent, or NULL. If a second READ extent exists, it is returned 658 + * in cow_read, if given. 659 + * 660 + * The extents are kept in two seperate ordered lists, one for READ and NONE, 661 + * one for READWRITE and INVALID. Within each list, we assume: 662 + * 1. Extents are ordered by file offset. 663 + * 2. For any given isect, there is at most one extents that matches. 
664 + */ 665 + struct pnfs_block_extent * 666 + bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 667 + struct pnfs_block_extent **cow_read) 668 + { 669 + struct pnfs_block_extent *be, *cow, *ret; 670 + int i; 671 + 672 + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); 673 + cow = ret = NULL; 674 + spin_lock(&bl->bl_ext_lock); 675 + for (i = 0; i < EXTENT_LISTS; i++) { 676 + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { 677 + if (isect >= be->be_f_offset + be->be_length) 678 + break; 679 + if (isect >= be->be_f_offset) { 680 + /* We have found an extent */ 681 + dprintk("%s Get %p (%i)\n", __func__, be, 682 + atomic_read(&be->be_refcnt.refcount)); 683 + kref_get(&be->be_refcnt); 684 + if (!ret) 685 + ret = be; 686 + else if (be->be_state != PNFS_BLOCK_READ_DATA) 687 + bl_put_extent(be); 688 + else 689 + cow = be; 690 + break; 691 + } 692 + } 693 + if (ret && 694 + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) 695 + break; 696 + } 697 + spin_unlock(&bl->bl_ext_lock); 698 + if (cow_read) 699 + *cow_read = cow; 700 + print_bl_extent(ret); 701 + return ret; 702 + } 703 + 704 + /* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ 705 + static struct pnfs_block_extent * 706 + bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) 707 + { 708 + struct pnfs_block_extent *be, *ret = NULL; 709 + int i; 710 + 711 + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); 712 + for (i = 0; i < EXTENT_LISTS; i++) { 713 + if (ret) 714 + break; 715 + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { 716 + if (isect >= be->be_f_offset + be->be_length) 717 + break; 718 + if (isect >= be->be_f_offset) { 719 + /* We have found an extent */ 720 + dprintk("%s Get %p (%i)\n", __func__, be, 721 + atomic_read(&be->be_refcnt.refcount)); 722 + kref_get(&be->be_refcnt); 723 + ret = be; 724 + break; 725 + } 726 + } 727 + } 728 + print_bl_extent(ret); 729 + return ret; 730 + } 731 + 732 + int 733 + encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 734 + struct xdr_stream *xdr, 735 + const struct nfs4_layoutcommit_args *arg) 736 + { 737 + struct pnfs_block_short_extent *lce, *save; 738 + unsigned int count = 0; 739 + __be32 *p, *xdr_start; 740 + 741 + dprintk("%s enter\n", __func__); 742 + /* BUG - creation of bl_commit is buggy - need to wait for 743 + * entire block to be marked WRITTEN before it can be added. 
744 + */ 745 + spin_lock(&bl->bl_ext_lock); 746 + /* Want to adjust for possible truncate */ 747 + /* We now want to adjust argument range */ 748 + 749 + /* XDR encode the ranges found */ 750 + xdr_start = xdr_reserve_space(xdr, 8); 751 + if (!xdr_start) 752 + goto out; 753 + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { 754 + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); 755 + if (!p) 756 + break; 757 + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); 758 + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); 759 + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); 760 + p = xdr_encode_hyper(p, 0LL); 761 + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); 762 + list_del(&lce->bse_node); 763 + list_add_tail(&lce->bse_node, &bl->bl_committing); 764 + bl->bl_count--; 765 + count++; 766 + } 767 + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); 768 + xdr_start[1] = cpu_to_be32(count); 769 + out: 770 + spin_unlock(&bl->bl_ext_lock); 771 + dprintk("%s found %i ranges\n", __func__, count); 772 + return 0; 773 + } 774 + 775 + /* Helper function to set_to_rw that initialize a new extent */ 776 + static void 777 + _prep_new_extent(struct pnfs_block_extent *new, 778 + struct pnfs_block_extent *orig, 779 + sector_t offset, sector_t length, int state) 780 + { 781 + kref_init(&new->be_refcnt); 782 + /* don't need to INIT_LIST_HEAD(&new->be_node) */ 783 + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); 784 + new->be_mdev = orig->be_mdev; 785 + new->be_f_offset = offset; 786 + new->be_length = length; 787 + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; 788 + new->be_state = state; 789 + new->be_inval = orig->be_inval; 790 + } 791 + 792 + /* Tries to merge be with extent in front of it in list. 793 + * Frees storage if not used. 
794 + */ 795 + static struct pnfs_block_extent * 796 + _front_merge(struct pnfs_block_extent *be, struct list_head *head, 797 + struct pnfs_block_extent *storage) 798 + { 799 + struct pnfs_block_extent *prev; 800 + 801 + if (!storage) 802 + goto no_merge; 803 + if (&be->be_node == head || be->be_node.prev == head) 804 + goto no_merge; 805 + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); 806 + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || 807 + !extents_consistent(prev, be)) 808 + goto no_merge; 809 + _prep_new_extent(storage, prev, prev->be_f_offset, 810 + prev->be_length + be->be_length, prev->be_state); 811 + list_replace(&prev->be_node, &storage->be_node); 812 + bl_put_extent(prev); 813 + list_del(&be->be_node); 814 + bl_put_extent(be); 815 + return storage; 816 + 817 + no_merge: 818 + kfree(storage); 819 + return be; 820 + } 821 + 822 + static u64 823 + set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) 824 + { 825 + u64 rv = offset + length; 826 + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; 827 + struct pnfs_block_extent *children[3]; 828 + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; 829 + int i = 0, j; 830 + 831 + dprintk("%s(%llu, %llu)\n", __func__, offset, length); 832 + /* Create storage for up to three new extents e1, e2, e3 */ 833 + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); 834 + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); 835 + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); 836 + /* BUG - we are ignoring any failure */ 837 + if (!e1 || !e2 || !e3) 838 + goto out_nosplit; 839 + 840 + spin_lock(&bl->bl_ext_lock); 841 + be = bl_find_get_extent_locked(bl, offset); 842 + rv = be->be_f_offset + be->be_length; 843 + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { 844 + spin_unlock(&bl->bl_ext_lock); 845 + goto out_nosplit; 846 + } 847 + /* Add e* to children, bumping e*'s krefs */ 848 + if (be->be_f_offset != offset) { 849 + _prep_new_extent(e1, be, be->be_f_offset, 850 + offset - be->be_f_offset, 851 + PNFS_BLOCK_INVALID_DATA); 852 + children[i++] = e1; 853 + print_bl_extent(e1); 854 + } else 855 + merge1 = e1; 856 + _prep_new_extent(e2, be, offset, 857 + min(length, be->be_f_offset + be->be_length - offset), 858 + PNFS_BLOCK_READWRITE_DATA); 859 + children[i++] = e2; 860 + print_bl_extent(e2); 861 + if (offset + length < be->be_f_offset + be->be_length) { 862 + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, 863 + be->be_f_offset + be->be_length - 864 + offset - length, 865 + PNFS_BLOCK_INVALID_DATA); 866 + children[i++] = e3; 867 + print_bl_extent(e3); 868 + } else 869 + merge2 = e3; 870 + 871 + /* Remove be from list, and insert the e* */ 872 + /* We don't get refs on e*, since this list is the base reference 873 + * set when init'ed. 874 + */ 875 + if (i < 3) 876 + children[i] = NULL; 877 + new = children[0]; 878 + list_replace(&be->be_node, &new->be_node); 879 + bl_put_extent(be); 880 + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); 881 + for (j = 1; j < i; j++) { 882 + old = new; 883 + new = children[j]; 884 + list_add(&new->be_node, &old->be_node); 885 + } 886 + if (merge2) { 887 + /* This is a HACK, should just create a _back_merge function */ 888 + new = list_entry(new->be_node.next, 889 + struct pnfs_block_extent, be_node); 890 + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); 891 + } 892 + spin_unlock(&bl->bl_ext_lock); 893 + 894 + /* Since we removed the base reference above, be is now scheduled for 895 + * destruction. 
896 + */ 897 + bl_put_extent(be); 898 + dprintk("%s returns %llu after split\n", __func__, rv); 899 + return rv; 900 + 901 + out_nosplit: 902 + kfree(e1); 903 + kfree(e2); 904 + kfree(e3); 905 + dprintk("%s returns %llu without splitting\n", __func__, rv); 906 + return rv; 907 + } 908 + 909 + void 910 + clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 911 + const struct nfs4_layoutcommit_args *arg, 912 + int status) 913 + { 914 + struct pnfs_block_short_extent *lce, *save; 915 + 916 + dprintk("%s status %d\n", __func__, status); 917 + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { 918 + if (likely(!status)) { 919 + u64 offset = lce->bse_f_offset; 920 + u64 end = offset + lce->bse_length; 921 + 922 + do { 923 + offset = set_to_rw(bl, offset, end - offset); 924 + } while (offset < end); 925 + list_del(&lce->bse_node); 926 + 927 + kfree(lce); 928 + } else { 929 + list_del(&lce->bse_node); 930 + spin_lock(&bl->bl_ext_lock); 931 + add_to_commitlist(bl, lce); 932 + spin_unlock(&bl->bl_ext_lock); 933 + } 934 + } 935 + }
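The extent tracking above repeatedly snaps sector ranges to the tree's step size with normalize()/normalize_up() before tagging them EXTENT_INITIALIZED or EXTENT_WRITTEN. Below is a small userspace restatement of those two helpers with a worked example; plain modulo replaces the kernel's do_div(), otherwise the arithmetic is the same.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Largest t <= s with t % base == 0 (same contract as the kernel helper) */
static sector_t normalize(sector_t s, int base)
{
	return s - (s % base);
}

/* Smallest step-aligned value >= s */
static sector_t normalize_up(sector_t s, int base)
{
	return normalize(s + base - 1, base);
}

int main(void)
{
	/* With an 8-sector step, a request over sectors [10, 26) is widened
	 * to [8, 32); that aligned range is what _preload_range() and the
	 * pre/post-area scans in bl_mark_sectors_init() operate on. */
	sector_t start = normalize(10, 8);            /* 8  */
	sector_t end   = normalize_up(10 + 16, 8);    /* 32 */

	printf("aligned range: [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}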
+7 -4
fs/nfs/client.c
··· 105 105 .nrvers = ARRAY_SIZE(nfs_version), 106 106 .version = nfs_version, 107 107 .stats = &nfs_rpcstat, 108 - .pipe_dir_name = "/nfs", 108 + .pipe_dir_name = NFS_PIPE_DIRNAME, 109 109 }; 110 110 111 111 struct rpc_stat nfs_rpcstat = { ··· 904 904 /* 905 905 * Load up the server record from information gained in an fsinfo record 906 906 */ 907 - static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) 907 + static void nfs_server_set_fsinfo(struct nfs_server *server, 908 + struct nfs_fh *mntfh, 909 + struct nfs_fsinfo *fsinfo) 908 910 { 909 911 unsigned long max_rpc_payload; 910 912 ··· 936 934 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 937 935 server->wsize = NFS_MAX_FILE_IO_SIZE; 938 936 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 939 - set_pnfs_layoutdriver(server, fsinfo->layouttype); 937 + server->pnfs_blksize = fsinfo->blksize; 938 + set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); 940 939 941 940 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 942 941 ··· 983 980 if (error < 0) 984 981 goto out_error; 985 982 986 - nfs_server_set_fsinfo(server, &fsinfo); 983 + nfs_server_set_fsinfo(server, mntfh, &fsinfo); 987 984 988 985 /* Get some general file system info */ 989 986 if (server->namelen == 0) {
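nfs_server_set_fsinfo() now records fsinfo->blksize in server->pnfs_blksize and passes the mount filehandle down to set_pnfs_layoutdriver(), which (in the pnfs.c hunk below) forwards it to the layout driver's new ->set_layoutdriver hook. The sketch below is a hypothetical hook, shown only to illustrate what that plumbing makes available at mount time; it is not the block driver's actual implementation.

/* Hypothetical ->set_layoutdriver hook. Its only point is to show why
 * the mount filehandle and fsinfo->blksize are threaded through
 * nfs_server_set_fsinfo(): a block-class driver needs both before it
 * can talk to its devices. */
static int example_set_layoutdriver(struct nfs_server *server,
				    const struct nfs_fh *mntfh)
{
	/* server->pnfs_blksize was just copied from fsinfo->blksize */
	if (server->pnfs_blksize == 0) {
		dprintk("%s: no layout block size advertised\n", __func__);
		return -EINVAL;
	}
	/* mntfh would be used here for device discovery (GETDEVICELIST) */
	return 0;
}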
+34 -23
fs/nfs/dir.c
··· 134 134 135 135 #endif /* CONFIG_NFS_V4 */ 136 136 137 - static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) 137 + static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) 138 138 { 139 139 struct nfs_open_dir_context *ctx; 140 140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 141 141 if (ctx != NULL) { 142 142 ctx->duped = 0; 143 + ctx->attr_gencount = NFS_I(dir)->attr_gencount; 143 144 ctx->dir_cookie = 0; 144 145 ctx->dup_cookie = 0; 145 146 ctx->cred = get_rpccred(cred); 146 - } else 147 - ctx = ERR_PTR(-ENOMEM); 148 - return ctx; 147 + return ctx; 148 + } 149 + return ERR_PTR(-ENOMEM); 149 150 } 150 151 151 152 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) ··· 174 173 cred = rpc_lookup_cred(); 175 174 if (IS_ERR(cred)) 176 175 return PTR_ERR(cred); 177 - ctx = alloc_nfs_open_dir_context(cred); 176 + ctx = alloc_nfs_open_dir_context(inode, cred); 178 177 if (IS_ERR(ctx)) { 179 178 res = PTR_ERR(ctx); 180 179 goto out; ··· 324 323 { 325 324 loff_t diff = desc->file->f_pos - desc->current_index; 326 325 unsigned int index; 327 - struct nfs_open_dir_context *ctx = desc->file->private_data; 328 326 329 327 if (diff < 0) 330 328 goto out_eof; ··· 336 336 index = (unsigned int)diff; 337 337 *desc->dir_cookie = array->array[index].cookie; 338 338 desc->cache_entry_index = index; 339 - ctx->duped = 0; 340 339 return 0; 341 340 out_eof: 342 341 desc->eof = 1; ··· 348 349 int i; 349 350 loff_t new_pos; 350 351 int status = -EAGAIN; 351 - struct nfs_open_dir_context *ctx = desc->file->private_data; 352 352 353 353 for (i = 0; i < array->size; i++) { 354 354 if (array->array[i].cookie == *desc->dir_cookie) { 355 + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); 356 + struct nfs_open_dir_context *ctx = desc->file->private_data; 357 + 355 358 new_pos = desc->current_index + i; 356 - if (new_pos < desc->file->f_pos) { 359 + if (ctx->attr_gencount != nfsi->attr_gencount 360 + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 361 + ctx->duped = 0; 362 + ctx->attr_gencount = nfsi->attr_gencount; 363 + } else if (new_pos < desc->file->f_pos) { 364 + if (ctx->duped > 0 365 + && ctx->dup_cookie == *desc->dir_cookie) { 366 + if (printk_ratelimit()) { 367 + pr_notice("NFS: directory %s/%s contains a readdir loop." 368 + "Please contact your server vendor. " 369 + "The file: %s has duplicate cookie %llu\n", 370 + desc->file->f_dentry->d_parent->d_name.name, 371 + desc->file->f_dentry->d_name.name, 372 + array->array[i].string.name, 373 + *desc->dir_cookie); 374 + } 375 + status = -ELOOP; 376 + goto out; 377 + } 357 378 ctx->dup_cookie = *desc->dir_cookie; 358 - ctx->duped = 1; 379 + ctx->duped = -1; 359 380 } 360 381 desc->file->f_pos = new_pos; 361 382 desc->cache_entry_index = i; ··· 387 368 if (*desc->dir_cookie == array->last_cookie) 388 369 desc->eof = 1; 389 370 } 371 + out: 390 372 return status; 391 373 } 392 374 ··· 760 740 struct nfs_cache_array *array = NULL; 761 741 struct nfs_open_dir_context *ctx = file->private_data; 762 742 763 - if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { 764 - if (printk_ratelimit()) { 765 - pr_notice("NFS: directory %s/%s contains a readdir loop. " 766 - "Please contact your server vendor. 
" 767 - "Offending cookie: %llu\n", 768 - file->f_dentry->d_parent->d_name.name, 769 - file->f_dentry->d_name.name, 770 - *desc->dir_cookie); 771 - } 772 - res = -ELOOP; 773 - goto out; 774 - } 775 - 776 743 array = nfs_readdir_get_array(desc->page); 777 744 if (IS_ERR(array)) { 778 745 res = PTR_ERR(array); ··· 781 774 *desc->dir_cookie = array->array[i+1].cookie; 782 775 else 783 776 *desc->dir_cookie = array->last_cookie; 777 + if (ctx->duped != 0) 778 + ctx->duped = 1; 784 779 } 785 780 if (array->eof_index >= 0) 786 781 desc->eof = 1; ··· 814 805 struct page *page = NULL; 815 806 int status; 816 807 struct inode *inode = desc->file->f_path.dentry->d_inode; 808 + struct nfs_open_dir_context *ctx = desc->file->private_data; 817 809 818 810 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 819 811 (unsigned long long)*desc->dir_cookie); ··· 828 818 desc->page_index = 0; 829 819 desc->last_cookie = *desc->dir_cookie; 830 820 desc->page = page; 821 + ctx->duped = 0; 831 822 832 823 status = nfs_readdir_xdr_to_array(desc, page, inode); 833 824 if (status < 0)
+1 -1
fs/nfs/nfs4_fs.h
··· 318 318 extern const u32 nfs4_fattr_bitmap[2]; 319 319 extern const u32 nfs4_statfs_bitmap[2]; 320 320 extern const u32 nfs4_pathconf_bitmap[2]; 321 - extern const u32 nfs4_fsinfo_bitmap[2]; 321 + extern const u32 nfs4_fsinfo_bitmap[3]; 322 322 extern const u32 nfs4_fs_locations_bitmap[2]; 323 323 324 324 /* nfs4renewd.c */
+1 -1
fs/nfs/nfs4filelayout.c
··· 170 170 171 171 pnfs_set_layoutcommit(wdata); 172 172 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, 173 - (unsigned long) wdata->lseg->pls_end_pos); 173 + (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); 174 174 } 175 175 176 176 /*
+59 -3
fs/nfs/nfs4proc.c
··· 140 140 0 141 141 }; 142 142 143 - const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 143 + const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE 144 144 | FATTR4_WORD0_MAXREAD 145 145 | FATTR4_WORD0_MAXWRITE 146 146 | FATTR4_WORD0_LEASE_TIME, 147 147 FATTR4_WORD1_TIME_DELTA 148 - | FATTR4_WORD1_FS_LAYOUT_TYPES 148 + | FATTR4_WORD1_FS_LAYOUT_TYPES, 149 + FATTR4_WORD2_LAYOUT_BLKSIZE 149 150 }; 150 151 151 152 const u32 nfs4_fs_locations_bitmap[2] = { ··· 5835 5834 return status; 5836 5835 } 5837 5836 5837 + /* 5838 + * Retrieve the list of Data Server devices from the MDS. 5839 + */ 5840 + static int _nfs4_getdevicelist(struct nfs_server *server, 5841 + const struct nfs_fh *fh, 5842 + struct pnfs_devicelist *devlist) 5843 + { 5844 + struct nfs4_getdevicelist_args args = { 5845 + .fh = fh, 5846 + .layoutclass = server->pnfs_curr_ld->id, 5847 + }; 5848 + struct nfs4_getdevicelist_res res = { 5849 + .devlist = devlist, 5850 + }; 5851 + struct rpc_message msg = { 5852 + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], 5853 + .rpc_argp = &args, 5854 + .rpc_resp = &res, 5855 + }; 5856 + int status; 5857 + 5858 + dprintk("--> %s\n", __func__); 5859 + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, 5860 + &res.seq_res, 0); 5861 + dprintk("<-- %s status=%d\n", __func__, status); 5862 + return status; 5863 + } 5864 + 5865 + int nfs4_proc_getdevicelist(struct nfs_server *server, 5866 + const struct nfs_fh *fh, 5867 + struct pnfs_devicelist *devlist) 5868 + { 5869 + struct nfs4_exception exception = { }; 5870 + int err; 5871 + 5872 + do { 5873 + err = nfs4_handle_exception(server, 5874 + _nfs4_getdevicelist(server, fh, devlist), 5875 + &exception); 5876 + } while (exception.retry); 5877 + 5878 + dprintk("%s: err=%d, num_devs=%u\n", __func__, 5879 + err, devlist->num_devs); 5880 + 5881 + return err; 5882 + } 5883 + EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); 5884 + 5838 5885 static int 5839 5886 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5840 5887 { ··· 5961 5912 static void nfs4_layoutcommit_release(void *calldata) 5962 5913 { 5963 5914 struct nfs4_layoutcommit_data *data = calldata; 5915 + struct pnfs_layout_segment *lseg, *tmp; 5964 5916 5917 + pnfs_cleanup_layoutcommit(data); 5965 5918 /* Matched by references in pnfs_set_layoutcommit */ 5966 - put_lseg(data->lseg); 5919 + list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { 5920 + list_del_init(&lseg->pls_lc_list); 5921 + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, 5922 + &lseg->pls_flags)) 5923 + put_lseg(lseg); 5924 + } 5967 5925 put_rpccred(data->cred); 5968 5926 kfree(data); 5969 5927 }
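nfs4_proc_getdevicelist() is exported for layout drivers; it takes the mount filehandle and fills a caller-supplied struct pnfs_devicelist (defined in the pnfs.h hunk below) with up to NFS4_PNFS_GETDEVLIST_MAXNUM device IDs. The caller below is a hedged sketch, not code from this series; note that the decode path does not yet follow the cookie, so only the first batch is fetched (see the eof TODO in nfs4xdr.c).

/* Hypothetical caller of the new nfs4_proc_getdevicelist() export. */
static int example_enumerate_devices(struct nfs_server *server,
				     const struct nfs_fh *mntfh)
{
	struct pnfs_devicelist *dlist;
	unsigned int i;
	int rc;

	dlist = kzalloc(sizeof(*dlist), GFP_KERNEL);
	if (!dlist)
		return -ENOMEM;

	rc = nfs4_proc_getdevicelist(server, mntfh, dlist);
	if (rc == 0) {
		for (i = 0; i < dlist->num_devs; i++) {
			/* each dev_id[i] would next be resolved with a
			 * GETDEVICEINFO call (nfs4_proc_getdeviceinfo) */
			dprintk("device %u of %u (eof=%u)\n",
				i, dlist->num_devs, dlist->eof);
		}
	}
	kfree(dlist);
	return rc;
}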
+215 -18
fs/nfs/nfs4xdr.c
··· 113 113 #define encode_restorefh_maxsz (op_encode_hdr_maxsz) 114 114 #define decode_restorefh_maxsz (op_decode_hdr_maxsz) 115 115 #define encode_fsinfo_maxsz (encode_getattr_maxsz) 116 - #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) 116 + /* The 5 accounts for the PNFS attributes, and assumes that at most three 117 + * layout types will be returned. 118 + */ 119 + #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ 120 + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) 117 121 #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 118 122 #define decode_renew_maxsz (op_decode_hdr_maxsz) 119 123 #define encode_setclientid_maxsz \ ··· 318 314 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 319 315 #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 320 316 #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 317 + #define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ 318 + encode_verifier_maxsz) 319 + #define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ 320 + 2 /* nfs_cookie4 gdlr_cookie */ + \ 321 + decode_verifier_maxsz \ 322 + /* verifier4 gdlr_verifier */ + \ 323 + 1 /* gdlr_deviceid_list count */ + \ 324 + XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ 325 + NFS4_DEVICEID4_SIZE) \ 326 + /* gdlr_deviceid_list */ + \ 327 + 1 /* bool gdlr_eof */) 321 328 #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ 322 329 XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) 323 330 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ··· 763 748 #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 764 749 decode_sequence_maxsz + \ 765 750 decode_reclaim_complete_maxsz) 751 + #define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ 752 + encode_sequence_maxsz + \ 753 + encode_putfh_maxsz + \ 754 + encode_getdevicelist_maxsz) 755 + #define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ 756 + decode_sequence_maxsz + \ 757 + decode_putfh_maxsz + \ 758 + decode_getdevicelist_maxsz) 766 759 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 767 760 encode_sequence_maxsz +\ 768 761 encode_getdeviceinfo_maxsz) ··· 1127 1104 hdr->replen += decode_getattr_maxsz; 1128 1105 } 1129 1106 1107 + static void 1108 + encode_getattr_three(struct xdr_stream *xdr, 1109 + uint32_t bm0, uint32_t bm1, uint32_t bm2, 1110 + struct compound_hdr *hdr) 1111 + { 1112 + __be32 *p; 1113 + 1114 + p = reserve_space(xdr, 4); 1115 + *p = cpu_to_be32(OP_GETATTR); 1116 + if (bm2) { 1117 + p = reserve_space(xdr, 16); 1118 + *p++ = cpu_to_be32(3); 1119 + *p++ = cpu_to_be32(bm0); 1120 + *p++ = cpu_to_be32(bm1); 1121 + *p = cpu_to_be32(bm2); 1122 + } else if (bm1) { 1123 + p = reserve_space(xdr, 12); 1124 + *p++ = cpu_to_be32(2); 1125 + *p++ = cpu_to_be32(bm0); 1126 + *p = cpu_to_be32(bm1); 1127 + } else { 1128 + p = reserve_space(xdr, 8); 1129 + *p++ = cpu_to_be32(1); 1130 + *p = cpu_to_be32(bm0); 1131 + } 1132 + hdr->nops++; 1133 + hdr->replen += decode_getattr_maxsz; 1134 + } 1135 + 1130 1136 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1131 1137 { 1132 1138 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], ··· 1164 1112 1165 1113 static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1166 1114 { 1167 - encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 1168 - bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); 1115 + encode_getattr_three(xdr, 1116 + bitmask[0] & nfs4_fsinfo_bitmap[0], 1117 + bitmask[1] & nfs4_fsinfo_bitmap[1], 1118 + bitmask[2] & 
nfs4_fsinfo_bitmap[2], 1119 + hdr); 1169 1120 } 1170 1121 1171 1122 static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) ··· 1910 1855 1911 1856 #ifdef CONFIG_NFS_V4_1 1912 1857 static void 1858 + encode_getdevicelist(struct xdr_stream *xdr, 1859 + const struct nfs4_getdevicelist_args *args, 1860 + struct compound_hdr *hdr) 1861 + { 1862 + __be32 *p; 1863 + nfs4_verifier dummy = { 1864 + .data = "dummmmmy", 1865 + }; 1866 + 1867 + p = reserve_space(xdr, 20); 1868 + *p++ = cpu_to_be32(OP_GETDEVICELIST); 1869 + *p++ = cpu_to_be32(args->layoutclass); 1870 + *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); 1871 + xdr_encode_hyper(p, 0ULL); /* cookie */ 1872 + encode_nfs4_verifier(xdr, &dummy); 1873 + hdr->nops++; 1874 + hdr->replen += decode_getdevicelist_maxsz; 1875 + } 1876 + 1877 + static void 1913 1878 encode_getdeviceinfo(struct xdr_stream *xdr, 1914 1879 const struct nfs4_getdeviceinfo_args *args, 1915 1880 struct compound_hdr *hdr) ··· 1991 1916 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1992 1917 /* Only whole file layouts */ 1993 1918 p = xdr_encode_hyper(p, 0); /* offset */ 1994 - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ 1919 + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ 1995 1920 *p++ = cpu_to_be32(0); /* reclaim */ 1996 1921 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1997 1922 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ··· 2679 2604 struct compound_hdr hdr = { 2680 2605 .nops = 0, 2681 2606 }; 2682 - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2607 + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; 2683 2608 2684 2609 encode_compound_hdr(xdr, req, &hdr); 2685 2610 encode_setclientid_confirm(xdr, arg, &hdr); ··· 2823 2748 struct compound_hdr hdr = { 2824 2749 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2825 2750 }; 2826 - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2751 + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; 2827 2752 2828 2753 encode_compound_hdr(xdr, req, &hdr); 2829 2754 encode_sequence(xdr, &args->la_seq_args, &hdr); ··· 2846 2771 encode_compound_hdr(xdr, req, &hdr); 2847 2772 encode_sequence(xdr, &args->seq_args, &hdr); 2848 2773 encode_reclaim_complete(xdr, args, &hdr); 2774 + encode_nops(&hdr); 2775 + } 2776 + 2777 + /* 2778 + * Encode GETDEVICELIST request 2779 + */ 2780 + static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, 2781 + struct xdr_stream *xdr, 2782 + struct nfs4_getdevicelist_args *args) 2783 + { 2784 + struct compound_hdr hdr = { 2785 + .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2786 + }; 2787 + 2788 + encode_compound_hdr(xdr, req, &hdr); 2789 + encode_sequence(xdr, &args->seq_args, &hdr); 2790 + encode_putfh(xdr, args->fh, &hdr); 2791 + encode_getdevicelist(xdr, args, &hdr); 2849 2792 encode_nops(&hdr); 2850 2793 } 2851 2794 ··· 3104 3011 goto out_overflow; 3105 3012 bmlen = be32_to_cpup(p); 3106 3013 3107 - bitmap[0] = bitmap[1] = 0; 3014 + bitmap[0] = bitmap[1] = bitmap[2] = 0; 3108 3015 p = xdr_inline_decode(xdr, (bmlen << 2)); 3109 3016 if (unlikely(!p)) 3110 3017 goto out_overflow; 3111 3018 if (bmlen > 0) { 3112 3019 bitmap[0] = be32_to_cpup(p++); 3113 - if (bmlen > 1) 3114 - bitmap[1] = be32_to_cpup(p); 3020 + if (bmlen > 1) { 3021 + bitmap[1] = be32_to_cpup(p++); 3022 + if (bmlen > 2) 3023 + bitmap[2] = be32_to_cpup(p); 3024 + } 3115 3025 } 3116 3026 return 0; 3117 3027 out_overflow: ··· 3146 3050 return ret; 3147 3051 bitmap[0] &= 
~FATTR4_WORD0_SUPPORTED_ATTRS; 3148 3052 } else 3149 - bitmask[0] = bitmask[1] = 0; 3150 - dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); 3053 + bitmask[0] = bitmask[1] = bitmask[2] = 0; 3054 + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, 3055 + bitmask[0], bitmask[1], bitmask[2]); 3151 3056 return 0; 3152 3057 } 3153 3058 ··· 4202 4105 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 4203 4106 { 4204 4107 __be32 *savep; 4205 - uint32_t attrlen, bitmap[2] = {0}; 4108 + uint32_t attrlen, bitmap[3] = {0}; 4206 4109 int status; 4207 4110 4208 4111 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ··· 4228 4131 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 4229 4132 { 4230 4133 __be32 *savep; 4231 - uint32_t attrlen, bitmap[2] = {0}; 4134 + uint32_t attrlen, bitmap[3] = {0}; 4232 4135 int status; 4233 4136 4234 4137 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ··· 4260 4163 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 4261 4164 { 4262 4165 __be32 *savep; 4263 - uint32_t attrlen, bitmap[2] = {0}; 4166 + uint32_t attrlen, bitmap[3] = {0}; 4264 4167 int status; 4265 4168 4266 4169 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ··· 4400 4303 { 4401 4304 __be32 *savep; 4402 4305 uint32_t attrlen, 4403 - bitmap[2] = {0}; 4306 + bitmap[3] = {0}; 4404 4307 int status; 4405 4308 4406 4309 status = decode_op_hdr(xdr, OP_GETATTR); ··· 4486 4389 return status; 4487 4390 } 4488 4391 4392 + /* 4393 + * The prefered block size for layout directed io 4394 + */ 4395 + static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, 4396 + uint32_t *res) 4397 + { 4398 + __be32 *p; 4399 + 4400 + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); 4401 + *res = 0; 4402 + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { 4403 + p = xdr_inline_decode(xdr, 4); 4404 + if (unlikely(!p)) { 4405 + print_overflow_msg(__func__, xdr); 4406 + return -EIO; 4407 + } 4408 + *res = be32_to_cpup(p); 4409 + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; 4410 + } 4411 + return 0; 4412 + } 4413 + 4489 4414 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4490 4415 { 4491 4416 __be32 *savep; 4492 - uint32_t attrlen, bitmap[2]; 4417 + uint32_t attrlen, bitmap[3]; 4493 4418 int status; 4494 4419 4495 4420 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ··· 4538 4419 goto xdr_error; 4539 4420 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); 4540 4421 if (status != 0) 4422 + goto xdr_error; 4423 + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); 4424 + if (status) 4541 4425 goto xdr_error; 4542 4426 4543 4427 status = verify_attr_len(xdr, savep, attrlen); ··· 4961 4839 { 4962 4840 __be32 *savep; 4963 4841 uint32_t attrlen, 4964 - bitmap[2] = {0}; 4842 + bitmap[3] = {0}; 4965 4843 struct kvec *iov = req->rq_rcv_buf.head; 4966 4844 int status; 4967 4845 ··· 5390 5268 } 5391 5269 5392 5270 #if defined(CONFIG_NFS_V4_1) 5271 + /* 5272 + * TODO: Need to handle case when EOF != true; 5273 + */ 5274 + static int decode_getdevicelist(struct xdr_stream *xdr, 5275 + struct pnfs_devicelist *res) 5276 + { 5277 + __be32 *p; 5278 + int status, i; 5279 + struct nfs_writeverf verftemp; 5280 + 5281 + status = decode_op_hdr(xdr, OP_GETDEVICELIST); 5282 + if (status) 5283 + return status; 5284 + 5285 + p = xdr_inline_decode(xdr, 8 + 8 + 4); 5286 + if (unlikely(!p)) 5287 + goto out_overflow; 5288 + 5289 + /* TODO: Skip cookie for now */ 
5290 + p += 2; 5291 + 5292 + /* Read verifier */ 5293 + p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); 5294 + 5295 + res->num_devs = be32_to_cpup(p); 5296 + 5297 + dprintk("%s: num_dev %d\n", __func__, res->num_devs); 5298 + 5299 + if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { 5300 + printk(KERN_ERR "%s too many result dev_num %u\n", 5301 + __func__, res->num_devs); 5302 + return -EIO; 5303 + } 5304 + 5305 + p = xdr_inline_decode(xdr, 5306 + res->num_devs * NFS4_DEVICEID4_SIZE + 4); 5307 + if (unlikely(!p)) 5308 + goto out_overflow; 5309 + for (i = 0; i < res->num_devs; i++) 5310 + p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, 5311 + NFS4_DEVICEID4_SIZE); 5312 + res->eof = be32_to_cpup(p); 5313 + return 0; 5314 + out_overflow: 5315 + print_overflow_msg(__func__, xdr); 5316 + return -EIO; 5317 + } 5393 5318 5394 5319 static int decode_getdeviceinfo(struct xdr_stream *xdr, 5395 5320 struct pnfs_device *pdev) ··· 5599 5430 int status; 5600 5431 5601 5432 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); 5433 + res->status = status; 5602 5434 if (status) 5603 5435 return status; 5604 5436 ··· 6712 6542 } 6713 6543 6714 6544 /* 6545 + * Decode GETDEVICELIST response 6546 + */ 6547 + static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, 6548 + struct xdr_stream *xdr, 6549 + struct nfs4_getdevicelist_res *res) 6550 + { 6551 + struct compound_hdr hdr; 6552 + int status; 6553 + 6554 + dprintk("encoding getdevicelist!\n"); 6555 + 6556 + status = decode_compound_hdr(xdr, &hdr); 6557 + if (status != 0) 6558 + goto out; 6559 + status = decode_sequence(xdr, &res->seq_res, rqstp); 6560 + if (status != 0) 6561 + goto out; 6562 + status = decode_putfh(xdr); 6563 + if (status != 0) 6564 + goto out; 6565 + status = decode_getdevicelist(xdr, res->devlist); 6566 + out: 6567 + return status; 6568 + } 6569 + 6570 + /* 6715 6571 * Decode GETDEVINFO response 6716 6572 */ 6717 6573 static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, ··· 6918 6722 int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6919 6723 int plus) 6920 6724 { 6921 - uint32_t bitmap[2] = {0}; 6725 + uint32_t bitmap[3] = {0}; 6922 6726 uint32_t len; 6923 6727 __be32 *p = xdr_inline_decode(xdr, 4); 6924 6728 if (unlikely(!p)) ··· 7104 6908 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7105 6909 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7106 6910 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 6911 + PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), 7107 6912 #endif /* CONFIG_NFS_V4_1 */ 7108 6913 }; 7109 6914
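encode_getattr_three() emits the attribute request mask as a length-prefixed array: a count word of 3, 2 or 1 depending on which high words are non-zero, followed by that many big-endian mask words. This is what lets FSINFO now ask for FATTR4_WORD2_LAYOUT_BLKSIZE in a third bitmap word. The userspace sketch below restates that wire layout; htonl stands in for cpu_to_be32.

#include <arpa/inet.h>   /* htonl, standing in for cpu_to_be32 */
#include <stdint.h>
#include <stddef.h>

/* Write the XDR bitmap exactly as encode_getattr_three() does and
 * return the number of bytes consumed on the wire. */
static size_t encode_bitmap3(uint32_t *out, uint32_t bm0, uint32_t bm1,
			     uint32_t bm2)
{
	uint32_t bm[3] = { bm0, bm1, bm2 };
	size_t words = bm2 ? 3 : (bm1 ? 2 : 1);
	size_t i;

	out[0] = htonl((uint32_t)words);          /* bitmap word count */
	for (i = 0; i < words; i++)
		out[1 + i] = htonl(bm[i]);        /* mask words, big-endian */
	return (1 + words) * 4;
}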
+54 -32
fs/nfs/pnfs.c
··· 76 76 void
77 77 unset_pnfs_layoutdriver(struct nfs_server *nfss)
78 78 {
79 - 	if (nfss->pnfs_curr_ld)
79 + 	if (nfss->pnfs_curr_ld) {
80 + 		if (nfss->pnfs_curr_ld->clear_layoutdriver)
81 + 			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
80 82 		module_put(nfss->pnfs_curr_ld->owner);
83 + 	}
81 84 	nfss->pnfs_curr_ld = NULL;
82 85 }
83 86
··· 91 88  * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
92 89  */
93 90 void
94 - set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
91 + set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
92 + 		      u32 id)
95 93 {
96 94 	struct pnfs_layoutdriver_type *ld_type = NULL;
97 95
··· 119 115 		goto out_no_driver;
120 116 	}
121 117 	server->pnfs_curr_ld = ld_type;
118 + 	if (ld_type->set_layoutdriver
119 + 	    && ld_type->set_layoutdriver(server, mntfh)) {
120 + 		printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
121 + 		       __func__, id);
122 + 		module_put(ld_type->owner);
123 + 		goto out_no_driver;
124 + 	}
122 125
123 126 	dprintk("%s: pNFS module for %u set\n", __func__, id);
124 127 	return;
··· 201 190 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
202 191 {
203 192 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
193 + 	put_rpccred(lo->plh_lc_cred);
204 194 	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
205 195 }
206 196
··· 236 224 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
237 225 {
238 226 	INIT_LIST_HEAD(&lseg->pls_list);
227 + 	INIT_LIST_HEAD(&lseg->pls_lc_list);
239 228 	atomic_set(&lseg->pls_refcount, 1);
240 229 	smp_mb();
241 230 	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
··· 829 816 }
830 817
831 818 static struct pnfs_layout_hdr *
832 - alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
819 + alloc_init_layout_hdr(struct inode *ino,
820 + 		      struct nfs_open_context *ctx,
821 + 		      gfp_t gfp_flags)
833 822 {
834 823 	struct pnfs_layout_hdr *lo;
835 824
··· 843 828 	INIT_LIST_HEAD(&lo->plh_segs);
844 829 	INIT_LIST_HEAD(&lo->plh_bulk_recall);
845 830 	lo->plh_inode = ino;
831 + 	lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
846 832 	return lo;
847 833 }
848 834
849 835 static struct pnfs_layout_hdr *
850 - pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
836 + pnfs_find_alloc_layout(struct inode *ino,
837 + 		       struct nfs_open_context *ctx,
838 + 		       gfp_t gfp_flags)
851 839 {
852 840 	struct nfs_inode *nfsi = NFS_I(ino);
853 841 	struct pnfs_layout_hdr *new = NULL;
··· 865 847 		return nfsi->layout;
866 848 	}
867 849 	spin_unlock(&ino->i_lock);
868 - 	new = alloc_init_layout_hdr(ino, gfp_flags);
850 + 	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
869 851 	spin_lock(&ino->i_lock);
870 852
871 853 	if (likely(nfsi->layout == NULL))	/* Won the race? */
··· 958 940 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
959 941 		return NULL;
960 942 	spin_lock(&ino->i_lock);
961 - 	lo = pnfs_find_alloc_layout(ino, gfp_flags);
943 + 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
962 944 	if (lo == NULL) {
963 945 		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
964 946 		goto out_unlock;
··· 1368 1350 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1369 1351
1370 1352 /*
1371 - * Currently there is only one (whole file) write lseg.
1353 + * There can be multiple RW segments.
1372 1354  */
1373 - static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
1355 + static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1374 1356 {
1375 - 	struct pnfs_layout_segment *lseg, *rv = NULL;
1357 + 	struct pnfs_layout_segment *lseg;
1376 1358
1377 - 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
1378 - 		if (lseg->pls_range.iomode == IOMODE_RW)
1379 - 			rv = lseg;
1380 - 	return rv;
1359 + 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1360 + 		if (lseg->pls_range.iomode == IOMODE_RW &&
1361 + 		    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1362 + 			list_add(&lseg->pls_lc_list, listp);
1363 + 	}
1381 1364 }
1382 1365
1383 1366 void
··· 1390 1371
1391 1372 	spin_lock(&nfsi->vfs_inode.i_lock);
1392 1373 	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1393 - 		/* references matched in nfs4_layoutcommit_release */
1394 - 		get_lseg(wdata->lseg);
1395 - 		wdata->lseg->pls_lc_cred =
1396 - 			get_rpccred(wdata->args.context->state->owner->so_cred);
1397 1374 		mark_as_dirty = true;
1398 1375 		dprintk("%s: Set layoutcommit for inode %lu ",
1399 1376 			__func__, wdata->inode->i_ino);
1400 1377 	}
1401 - 	if (end_pos > wdata->lseg->pls_end_pos)
1402 - 		wdata->lseg->pls_end_pos = end_pos;
1378 + 	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
1379 + 		/* references matched in nfs4_layoutcommit_release */
1380 + 		get_lseg(wdata->lseg);
1381 + 	}
1382 + 	if (end_pos > nfsi->layout->plh_lwb)
1383 + 		nfsi->layout->plh_lwb = end_pos;
1403 1384 	spin_unlock(&nfsi->vfs_inode.i_lock);
1385 + 	dprintk("%s: lseg %p end_pos %llu\n",
1386 + 		__func__, wdata->lseg, nfsi->layout->plh_lwb);
1404 1387
1405 1388 	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1406 1389 	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
··· 1410 1389 		mark_inode_dirty_sync(wdata->inode);
1411 1390 }
1412 1391 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1392 +
1393 + void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1394 + {
1395 + 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1396 +
1397 + 	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1398 + 		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1399 + }
1413 1400
1414 1401 /*
1415 1402  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
··· 1432 1403 {
1433 1404 	struct nfs4_layoutcommit_data *data;
1434 1405 	struct nfs_inode *nfsi = NFS_I(inode);
1435 - 	struct pnfs_layout_segment *lseg;
1436 - 	struct rpc_cred *cred;
1437 1406 	loff_t end_pos;
1438 1407 	int status = 0;
1439 1408
··· 1448 1421 		goto out;
1449 1422 	}
1450 1423
1424 + 	INIT_LIST_HEAD(&data->lseg_list);
1451 1425 	spin_lock(&inode->i_lock);
1452 1426 	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1453 1427 		spin_unlock(&inode->i_lock);
1454 1428 		kfree(data);
1455 1429 		goto out;
1456 1430 	}
1457 - 	/*
1458 - 	 * Currently only one (whole file) write lseg which is referenced
1459 - 	 * in pnfs_set_layoutcommit and will be found.
1460 - 	 */
1461 - 	lseg = pnfs_list_write_lseg(inode);
1462 1431
1463 - 	end_pos = lseg->pls_end_pos;
1464 - 	cred = lseg->pls_lc_cred;
1465 - 	lseg->pls_end_pos = 0;
1466 - 	lseg->pls_lc_cred = NULL;
1432 + 	pnfs_list_write_lseg(inode, &data->lseg_list);
1433 +
1434 + 	end_pos = nfsi->layout->plh_lwb;
1435 + 	nfsi->layout->plh_lwb = 0;
1467 1436
1468 1437 	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1469 1438 		sizeof(nfsi->layout->plh_stateid.data));
1470 1439 	spin_unlock(&inode->i_lock);
1471 1440
1472 1441 	data->args.inode = inode;
1473 - 	data->lseg = lseg;
1474 - 	data->cred = cred;
1442 + 	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1475 1443 	nfs_fattr_init(&data->fattr);
1476 1444 	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1477 1445 	data->res.fattr = &data->fattr;
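With this hunk the LAYOUTCOMMIT bookkeeping moves off the single whole-file write lseg and onto the layout header (plh_lwb, plh_lc_cred) plus a per-commit list (data->lseg_list) that pnfs_list_write_lseg() fills from segments flagged NFS_LSEG_LAYOUTCOMMIT. A layout driver can inspect that list from its cleanup_layoutcommit hook. The sketch below is illustrative only, not part of this series; it assumes the fs/nfs/pnfs.h declarations added in this merge, and the example_ name is a placeholder.

/* Minimal sketch of a driver-side cleanup_layoutcommit hook: walk the
 * segments that pnfs_list_write_lseg() collected on data->lseg_list and
 * release whatever per-lseg LAYOUTCOMMIT state the driver keeps.  This
 * stub only logs each segment. */
static void example_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
	struct pnfs_layout_segment *lseg;

	list_for_each_entry(lseg, &data->lseg_list, pls_lc_list)
		dprintk("%s: lseg %p covered by this layoutcommit\n",
			__func__, lseg);
}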
+24 -4
fs/nfs/pnfs.h
··· 36 36 enum {
37 37 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
38 38 	NFS_LSEG_ROC,		/* roc bit received from server */
39 + 	NFS_LSEG_LAYOUTCOMMIT,	/* layoutcommit bit set for layoutcommit */
39 40 };
40 41
41 42 struct pnfs_layout_segment {
42 43 	struct list_head pls_list;
44 + 	struct list_head pls_lc_list;
43 45 	struct pnfs_layout_range pls_range;
44 46 	atomic_t pls_refcount;
45 47 	unsigned long pls_flags;
46 48 	struct pnfs_layout_hdr *pls_layout;
47 - 	struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
48 - 	loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
49 49 };
50 50
51 51 enum pnfs_try_status {
··· 80 80 	struct module *owner;
81 81 	unsigned flags;
82 82
83 + 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
84 + 	int (*clear_layoutdriver) (struct nfs_server *);
85 +
83 86 	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 87 	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
··· 113 110 				struct xdr_stream *xdr,
114 111 				const struct nfs4_layoutreturn_args *args);
115 112
113 + 	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
114 +
116 115 	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
117 116 				struct xdr_stream *xdr,
118 117 				const struct nfs4_layoutcommit_args *args);
··· 130 125 	unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
131 126 	u32 plh_barrier; /* ignore lower seqids */
132 127 	unsigned long plh_flags;
128 + 	loff_t plh_lwb; /* last write byte for layoutcommit */
129 + 	struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
133 130 	struct inode *plh_inode;
134 131 };
··· 144 137 	unsigned int pglen;
145 138 };
146 139
140 + #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
141 +
142 + struct pnfs_devicelist {
143 + 	unsigned int		eof;
144 + 	unsigned int		num_devs;
145 + 	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
146 + };
147 +
147 148 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
148 149 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
149 150
150 151 /* nfs4proc.c */
152 + extern int nfs4_proc_getdevicelist(struct nfs_server *server,
153 + 				const struct nfs_fh *fh,
154 + 				struct pnfs_devicelist *devlist);
151 155 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
152 156 				struct pnfs_device *dev);
153 157 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
··· 171 153 bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
172 154 bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
173 155
174 - void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
156 + void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
175 157 void unset_pnfs_layoutdriver(struct nfs_server *);
176 158 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
177 159 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
··· 197 179 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
198 180 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
199 181 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
182 + void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
200 183 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
201 184 int _pnfs_return_layout(struct inode *);
202 185 int pnfs_ld_write_done(struct nfs_write_data *);
··· 379 360 	return false;
380 361 }
381 362
382 - static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
363 + static inline void set_pnfs_layoutdriver(struct nfs_server *s,
364 + 					 const struct nfs_fh *mntfh, u32 id)
383 365 {
384 366 }
385 367
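The new set_layoutdriver/clear_layoutdriver hooks give a layout driver a per-mount setup and teardown point, invoked with the mount filehandle from set_pnfs_layoutdriver() above. Below is a rough sketch (not from this series) of a driver wiring the new hooks into its pnfs_layoutdriver_type and registering it; the .id and .name fields, LAYOUT_BLOCK_VOLUME and THIS_MODULE are assumed from elsewhere in the tree, and the example_ functions are placeholders (a fuller set_layoutdriver appears after the nfs_fs_sb.h hunk further down).

static int example_set_layoutdriver(struct nfs_server *server,
				    const struct nfs_fh *mntfh)
{
	return 0;	/* per-mount setup would go here */
}

static int example_clear_layoutdriver(struct nfs_server *server)
{
	return 0;	/* per-mount teardown would go here */
}

static struct pnfs_layoutdriver_type example_layout_type = {
	.id			= LAYOUT_BLOCK_VOLUME,
	.name			= "LAYOUT_EXAMPLE",
	.owner			= THIS_MODULE,
	.set_layoutdriver	= example_set_layoutdriver,
	.clear_layoutdriver	= example_clear_layoutdriver,
	.cleanup_layoutcommit	= example_cleanup_layoutcommit,
};

static int __init example_layout_init(void)
{
	return pnfs_register_layoutdriver(&example_layout_type);
}

static void __exit example_layout_exit(void)
{
	pnfs_unregister_layoutdriver(&example_layout_type);
}

If set_layoutdriver() returns non-zero, set_pnfs_layoutdriver() drops the module reference and falls back to non-pNFS I/O, so a driver can refuse mounts it cannot serve without breaking the mount itself.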
+2
include/linux/nfs.h
··· 29 29 #define NFS_MNT_VERSION 1
30 30 #define NFS_MNT3_VERSION 3
31 31
32 + #define NFS_PIPE_DIRNAME "/nfs"
33 +
32 34 /*
33 35  * NFS stats. The good thing with these values is that NFSv3 errors are
34 36  * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
+1
include/linux/nfs4.h
··· 566 566 	NFSPROC4_CLNT_SECINFO_NO_NAME,
567 567 	NFSPROC4_CLNT_TEST_STATEID,
568 568 	NFSPROC4_CLNT_FREE_STATEID,
569 + 	NFSPROC4_CLNT_GETDEVICELIST,
569 570 };
570 571
571 572 /* nfs41 types */
+2 -1
include/linux/nfs_fs.h
··· 99 99
100 100 struct nfs_open_dir_context {
101 101 	struct rpc_cred *cred;
102 + 	unsigned long attr_gencount;
102 103 	__u64 dir_cookie;
103 104 	__u64 dup_cookie;
104 - 	int duped;
105 + 	signed char duped;
105 106 };
106 107
107 108 /*
+3 -1
include/linux/nfs_fs_sb.h
··· 132 132 #endif
133 133
134 134 #ifdef CONFIG_NFS_V4
135 - 	u32 attr_bitmask[2];/* V4 bitmask representing the set
135 + 	u32 attr_bitmask[3];/* V4 bitmask representing the set
136 136 				of attributes supported on this
137 137 				filesystem */
138 138 	u32 cache_consistency_bitmask[2];
··· 145 145 				filesystem */
146 146 	struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
147 147 	struct rpc_wait_queue roc_rpcwaitq;
148 + 	u32 pnfs_blksize; /* layout_blksize attr */
149 + 	void *pnfs_ld_data; /* per mount point data */
148 150
149 151 	/* the following fields are protected by nfs_client->cl_lock */
150 152 	struct rb_root state_owners;
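The two new nfs_server fields carry per-mount pNFS state: pnfs_blksize caches the layout_blksize attribute returned by the server, and pnfs_ld_data is an opaque pointer owned by the layout driver. Here is a hedged sketch of how the example_set_layoutdriver/clear_layoutdriver stubs from the pnfs.h sketch above might use them; struct example_mount_data and the PAGE_SIZE fallback are invented for illustration.

struct example_mount_data {
	u32 io_blksize;		/* preferred I/O size for this mount */
};

static int example_set_layoutdriver(struct nfs_server *server,
				    const struct nfs_fh *mntfh)
{
	struct example_mount_data *md;

	md = kzalloc(sizeof(*md), GFP_KERNEL);
	if (!md)
		return -ENOMEM;
	/* pnfs_blksize was saved from the server's layout_blksize attribute */
	md->io_blksize = server->pnfs_blksize ? server->pnfs_blksize : PAGE_SIZE;
	server->pnfs_ld_data = md;
	return 0;
}

static int example_clear_layoutdriver(struct nfs_server *server)
{
	kfree(server->pnfs_ld_data);
	server->pnfs_ld_data = NULL;
	return 0;
}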
+15 -2
include/linux/nfs_xdr.h
··· 122 122 	struct timespec time_delta; /* server time granularity */
123 123 	__u32 lease_time; /* in seconds */
124 124 	__u32 layouttype; /* supported pnfs layout driver */
125 + 	__u32 blksize; /* preferred pnfs io block size */
125 126 };
126 127
127 128 struct nfs_fsstat {
··· 236 235 	gfp_t gfp_flags;
237 236 };
238 237
238 + struct nfs4_getdevicelist_args {
239 + 	const struct nfs_fh *fh;
240 + 	u32 layoutclass;
241 + 	struct nfs4_sequence_args seq_args;
242 + };
243 +
244 + struct nfs4_getdevicelist_res {
245 + 	struct pnfs_devicelist *devlist;
246 + 	struct nfs4_sequence_res seq_res;
247 + };
248 +
239 249 struct nfs4_getdeviceinfo_args {
240 250 	struct pnfs_device *pdev;
241 251 	struct nfs4_sequence_args seq_args;
··· 269 257 	struct nfs_fattr *fattr;
270 258 	const struct nfs_server *server;
271 259 	struct nfs4_sequence_res seq_res;
260 + 	int status;
272 261 };
273 262
274 263 struct nfs4_layoutcommit_data {
275 264 	struct rpc_task task;
276 265 	struct nfs_fattr fattr;
277 - 	struct pnfs_layout_segment *lseg;
266 + 	struct list_head lseg_list;
278 267 	struct rpc_cred *cred;
279 268 	struct nfs4_layoutcommit_args args;
280 269 	struct nfs4_layoutcommit_res res;
··· 956 943 };
957 944
958 945 struct nfs4_server_caps_res {
959 - 	u32 attr_bitmask[2];
946 + 	u32 attr_bitmask[3];
960 947 	u32 acl_bitmask;
961 948 	u32 has_links;
962 949 	u32 has_symlinks;
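GETDEVICELIST lets a layout driver fetch up to NFS4_PNFS_GETDEVLIST_MAXNUM device IDs in one round trip, with the reply's eof field indicating whether the server has more. Below is a rough usage sketch (not from this series) built on the nfs4_proc_getdevicelist() helper and struct pnfs_devicelist declared in fs/nfs/pnfs.h above; example_fetch_devicelist() is a made-up caller such as a driver's set_layoutdriver hook.

static int example_fetch_devicelist(struct nfs_server *server,
				    const struct nfs_fh *mntfh)
{
	struct pnfs_devicelist *dlist;
	unsigned int i;
	int err;

	dlist = kzalloc(sizeof(*dlist), GFP_KERNEL);
	if (!dlist)
		return -ENOMEM;

	err = nfs4_proc_getdevicelist(server, mntfh, dlist);
	if (err)
		goto out;

	/* At most NFS4_PNFS_GETDEVLIST_MAXNUM IDs come back per call;
	 * dlist->eof says whether the server has reported them all. */
	for (i = 0; i < dlist->num_devs; i++)
		dprintk("%s: got device id %u of %u (eof %u)\n",
			__func__, i + 1, dlist->num_devs, dlist->eof);
out:
	kfree(dlist);
	return err;
}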