at v2.6.39 1374 lines 34 kB view raw
1/* 2 * Copyright (C) 2005, 2006 3 * Avishay Traeger (avishay@gmail.com) 4 * Copyright (C) 2008, 2009 5 * Boaz Harrosh <bharrosh@panasas.com> 6 * 7 * Copyrights for code taken from ext2: 8 * Copyright (C) 1992, 1993, 1994, 1995 9 * Remy Card (card@masi.ibp.fr) 10 * Laboratoire MASI - Institut Blaise Pascal 11 * Universite Pierre et Marie Curie (Paris VI) 12 * from 13 * linux/fs/minix/inode.c 14 * Copyright (C) 1991, 1992 Linus Torvalds 15 * 16 * This file is part of exofs. 17 * 18 * exofs is free software; you can redistribute it and/or modify 19 * it under the terms of the GNU General Public License as published by 20 * the Free Software Foundation. Since it is based on ext2, and the only 21 * valid version of GPL for the Linux kernel is version 2, the only valid 22 * version of GPL for exofs is version 2. 23 * 24 * exofs is distributed in the hope that it will be useful, 25 * but WITHOUT ANY WARRANTY; without even the implied warranty of 26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 * GNU General Public License for more details. 28 * 29 * You should have received a copy of the GNU General Public License 30 * along with exofs; if not, write to the Free Software 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 32 */ 33 34#include <linux/slab.h> 35 36#include "exofs.h" 37 38#define EXOFS_DBGMSG2(M...) do {} while (0) 39 40enum { BIO_MAX_PAGES_KMALLOC = 41 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), 42 MAX_PAGES_KMALLOC = 43 PAGE_SIZE / sizeof(struct page *), 44}; 45 46unsigned exofs_max_io_pages(struct exofs_layout *layout, 47 unsigned expected_pages) 48{ 49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 50 51 /* TODO: easily support bio chaining */ 52 pages = min_t(unsigned, pages, 53 layout->group_width * BIO_MAX_PAGES_KMALLOC); 54 return pages; 55} 56 57struct page_collect { 58 struct exofs_sb_info *sbi; 59 struct inode *inode; 60 unsigned expected_pages; 61 struct exofs_io_state *ios; 62 63 struct page **pages; 64 unsigned alloc_pages; 65 unsigned nr_pages; 66 unsigned long length; 67 loff_t pg_first; /* keep 64bit also in 32-arches */ 68 bool read_4_write; /* This means two things: that the read is sync 69 * And the pages should not be unlocked. 70 */ 71}; 72 73static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 74 struct inode *inode) 75{ 76 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 77 78 pcol->sbi = sbi; 79 pcol->inode = inode; 80 pcol->expected_pages = expected_pages; 81 82 pcol->ios = NULL; 83 pcol->pages = NULL; 84 pcol->alloc_pages = 0; 85 pcol->nr_pages = 0; 86 pcol->length = 0; 87 pcol->pg_first = -1; 88 pcol->read_4_write = false; 89} 90 91static void _pcol_reset(struct page_collect *pcol) 92{ 93 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 94 95 pcol->pages = NULL; 96 pcol->alloc_pages = 0; 97 pcol->nr_pages = 0; 98 pcol->length = 0; 99 pcol->pg_first = -1; 100 pcol->ios = NULL; 101 102 /* this is probably the end of the loop but in writes 103 * it might not end here. don't be left with nothing 104 */ 105 if (!pcol->expected_pages) 106 pcol->expected_pages = MAX_PAGES_KMALLOC; 107} 108 109static int pcol_try_alloc(struct page_collect *pcol) 110{ 111 unsigned pages; 112 113 if (!pcol->ios) { /* First time allocate io_state */ 114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 115 116 if (ret) 117 return ret; 118 } 119 120 /* TODO: easily support bio chaining */ 121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); 122 123 for (; pages; pages >>= 1) { 124 pcol->pages = kmalloc(pages * sizeof(struct page *), 125 GFP_KERNEL); 126 if (likely(pcol->pages)) { 127 pcol->alloc_pages = pages; 128 return 0; 129 } 130 } 131 132 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", 133 pcol->expected_pages); 134 return -ENOMEM; 135} 136 137static void pcol_free(struct page_collect *pcol) 138{ 139 kfree(pcol->pages); 140 pcol->pages = NULL; 141 142 if (pcol->ios) { 143 exofs_put_io_state(pcol->ios); 144 pcol->ios = NULL; 145 } 146} 147 148static int pcol_add_page(struct page_collect *pcol, struct page *page, 149 unsigned len) 150{ 151 if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) 152 return -ENOMEM; 153 154 pcol->pages[pcol->nr_pages++] = page; 155 pcol->length += len; 156 return 0; 157} 158 159static int update_read_page(struct page *page, int ret) 160{ 161 if (ret == 0) { 162 /* Everything is OK */ 163 SetPageUptodate(page); 164 if (PageError(page)) 165 ClearPageError(page); 166 } else if (ret == -EFAULT) { 167 /* In this case we were trying to read something that wasn't on 168 * disk yet - return a page full of zeroes. This should be OK, 169 * because the object should be empty (if there was a write 170 * before this read, the read would be waiting with the page 171 * locked */ 172 clear_highpage(page); 173 174 SetPageUptodate(page); 175 if (PageError(page)) 176 ClearPageError(page); 177 ret = 0; /* recovered error */ 178 EXOFS_DBGMSG("recovered read error\n"); 179 } else /* Error */ 180 SetPageError(page); 181 182 return ret; 183} 184 185static void update_write_page(struct page *page, int ret) 186{ 187 if (ret) { 188 mapping_set_error(page->mapping, ret); 189 SetPageError(page); 190 } 191 end_page_writeback(page); 192} 193 194/* Called at the end of reads, to optionally unlock pages and update their 195 * status. 196 */ 197static int __readpages_done(struct page_collect *pcol) 198{ 199 int i; 200 u64 resid; 201 u64 good_bytes; 202 u64 length = 0; 203 int ret = exofs_check_io(pcol->ios, &resid); 204 205 if (likely(!ret)) 206 good_bytes = pcol->length; 207 else 208 good_bytes = pcol->length - resid; 209 210 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" 211 " length=0x%lx nr_pages=%u\n", 212 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 213 pcol->nr_pages); 214 215 for (i = 0; i < pcol->nr_pages; i++) { 216 struct page *page = pcol->pages[i]; 217 struct inode *inode = page->mapping->host; 218 int page_stat; 219 220 if (inode != pcol->inode) 221 continue; /* osd might add more pages at end */ 222 223 if (likely(length < good_bytes)) 224 page_stat = 0; 225 else 226 page_stat = ret; 227 228 EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", 229 inode->i_ino, page->index, 230 page_stat ? "bad_bytes" : "good_bytes"); 231 232 ret = update_read_page(page, page_stat); 233 if (!pcol->read_4_write) 234 unlock_page(page); 235 length += PAGE_SIZE; 236 } 237 238 pcol_free(pcol); 239 EXOFS_DBGMSG2("readpages_done END\n"); 240 return ret; 241} 242 243/* callback of async reads */ 244static void readpages_done(struct exofs_io_state *ios, void *p) 245{ 246 struct page_collect *pcol = p; 247 248 __readpages_done(pcol); 249 atomic_dec(&pcol->sbi->s_curr_pending); 250 kfree(pcol); 251} 252 253static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 254{ 255 int i; 256 257 for (i = 0; i < pcol->nr_pages; i++) { 258 struct page *page = pcol->pages[i]; 259 260 if (rw == READ) 261 update_read_page(page, ret); 262 else 263 update_write_page(page, ret); 264 265 unlock_page(page); 266 } 267} 268 269static int read_exec(struct page_collect *pcol) 270{ 271 struct exofs_i_info *oi = exofs_i(pcol->inode); 272 struct exofs_io_state *ios = pcol->ios; 273 struct page_collect *pcol_copy = NULL; 274 int ret; 275 276 if (!pcol->pages) 277 return 0; 278 279 ios->pages = pcol->pages; 280 ios->nr_pages = pcol->nr_pages; 281 ios->length = pcol->length; 282 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 283 284 if (pcol->read_4_write) { 285 exofs_oi_read(oi, pcol->ios); 286 return __readpages_done(pcol); 287 } 288 289 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 290 if (!pcol_copy) { 291 ret = -ENOMEM; 292 goto err; 293 } 294 295 *pcol_copy = *pcol; 296 ios->done = readpages_done; 297 ios->private = pcol_copy; 298 ret = exofs_oi_read(oi, ios); 299 if (unlikely(ret)) 300 goto err; 301 302 atomic_inc(&pcol->sbi->s_curr_pending); 303 304 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 305 ios->obj.id, _LLU(ios->offset), pcol->length); 306 307 /* pages ownership was passed to pcol_copy */ 308 _pcol_reset(pcol); 309 return 0; 310 311err: 312 if (!pcol->read_4_write) 313 _unlock_pcol_pages(pcol, ret, READ); 314 315 pcol_free(pcol); 316 317 kfree(pcol_copy); 318 return ret; 319} 320 321/* readpage_strip is called either directly from readpage() or by the VFS from 322 * within read_cache_pages(), to add one more page to be read. It will try to 323 * collect as many contiguous pages as posible. If a discontinuity is 324 * encountered, or it runs out of resources, it will submit the previous segment 325 * and will start a new collection. Eventually caller must submit the last 326 * segment if present. 327 */ 328static int readpage_strip(void *data, struct page *page) 329{ 330 struct page_collect *pcol = data; 331 struct inode *inode = pcol->inode; 332 struct exofs_i_info *oi = exofs_i(inode); 333 loff_t i_size = i_size_read(inode); 334 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 335 size_t len; 336 int ret; 337 338 /* FIXME: Just for debugging, will be removed */ 339 if (PageUptodate(page)) 340 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, 341 page->index); 342 343 if (page->index < end_index) 344 len = PAGE_CACHE_SIZE; 345 else if (page->index == end_index) 346 len = i_size & ~PAGE_CACHE_MASK; 347 else 348 len = 0; 349 350 if (!len || !obj_created(oi)) { 351 /* this will be out of bounds, or doesn't exist yet. 352 * Current page is cleared and the request is split 353 */ 354 clear_highpage(page); 355 356 SetPageUptodate(page); 357 if (PageError(page)) 358 ClearPageError(page); 359 360 if (!pcol->read_4_write) 361 unlock_page(page); 362 EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " 363 "read_4_write=%d index=0x%lx end_index=0x%lx " 364 "splitting\n", inode->i_ino, len, 365 pcol->read_4_write, page->index, end_index); 366 367 return read_exec(pcol); 368 } 369 370try_again: 371 372 if (unlikely(pcol->pg_first == -1)) { 373 pcol->pg_first = page->index; 374 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 375 page->index)) { 376 /* Discontinuity detected, split the request */ 377 ret = read_exec(pcol); 378 if (unlikely(ret)) 379 goto fail; 380 goto try_again; 381 } 382 383 if (!pcol->pages) { 384 ret = pcol_try_alloc(pcol); 385 if (unlikely(ret)) 386 goto fail; 387 } 388 389 if (len != PAGE_CACHE_SIZE) 390 zero_user(page, len, PAGE_CACHE_SIZE - len); 391 392 EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", 393 inode->i_ino, page->index, len); 394 395 ret = pcol_add_page(pcol, page, len); 396 if (ret) { 397 EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " 398 "this_len=0x%zx nr_pages=%u length=0x%lx\n", 399 page, len, pcol->nr_pages, pcol->length); 400 401 /* split the request, and start again with current page */ 402 ret = read_exec(pcol); 403 if (unlikely(ret)) 404 goto fail; 405 406 goto try_again; 407 } 408 409 return 0; 410 411fail: 412 /* SetPageError(page); ??? */ 413 unlock_page(page); 414 return ret; 415} 416 417static int exofs_readpages(struct file *file, struct address_space *mapping, 418 struct list_head *pages, unsigned nr_pages) 419{ 420 struct page_collect pcol; 421 int ret; 422 423 _pcol_init(&pcol, nr_pages, mapping->host); 424 425 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); 426 if (ret) { 427 EXOFS_ERR("read_cache_pages => %d\n", ret); 428 return ret; 429 } 430 431 return read_exec(&pcol); 432} 433 434static int _readpage(struct page *page, bool read_4_write) 435{ 436 struct page_collect pcol; 437 int ret; 438 439 _pcol_init(&pcol, 1, page->mapping->host); 440 441 pcol.read_4_write = read_4_write; 442 ret = readpage_strip(&pcol, page); 443 if (ret) { 444 EXOFS_ERR("_readpage => %d\n", ret); 445 return ret; 446 } 447 448 return read_exec(&pcol); 449} 450 451/* 452 * We don't need the file 453 */ 454static int exofs_readpage(struct file *file, struct page *page) 455{ 456 return _readpage(page, false); 457} 458 459/* Callback for osd_write. All writes are asynchronous */ 460static void writepages_done(struct exofs_io_state *ios, void *p) 461{ 462 struct page_collect *pcol = p; 463 int i; 464 u64 resid; 465 u64 good_bytes; 466 u64 length = 0; 467 int ret = exofs_check_io(ios, &resid); 468 469 atomic_dec(&pcol->sbi->s_curr_pending); 470 471 if (likely(!ret)) 472 good_bytes = pcol->length; 473 else 474 good_bytes = pcol->length - resid; 475 476 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" 477 " length=0x%lx nr_pages=%u\n", 478 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 479 pcol->nr_pages); 480 481 for (i = 0; i < pcol->nr_pages; i++) { 482 struct page *page = pcol->pages[i]; 483 struct inode *inode = page->mapping->host; 484 int page_stat; 485 486 if (inode != pcol->inode) 487 continue; /* osd might add more pages to a bio */ 488 489 if (likely(length < good_bytes)) 490 page_stat = 0; 491 else 492 page_stat = ret; 493 494 update_write_page(page, page_stat); 495 unlock_page(page); 496 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", 497 inode->i_ino, page->index, page_stat); 498 499 length += PAGE_SIZE; 500 } 501 502 pcol_free(pcol); 503 kfree(pcol); 504 EXOFS_DBGMSG2("writepages_done END\n"); 505} 506 507static int write_exec(struct page_collect *pcol) 508{ 509 struct exofs_i_info *oi = exofs_i(pcol->inode); 510 struct exofs_io_state *ios = pcol->ios; 511 struct page_collect *pcol_copy = NULL; 512 int ret; 513 514 if (!pcol->pages) 515 return 0; 516 517 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 518 if (!pcol_copy) { 519 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 520 ret = -ENOMEM; 521 goto err; 522 } 523 524 *pcol_copy = *pcol; 525 526 ios->pages = pcol_copy->pages; 527 ios->nr_pages = pcol_copy->nr_pages; 528 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; 529 ios->length = pcol_copy->length; 530 ios->done = writepages_done; 531 ios->private = pcol_copy; 532 533 ret = exofs_oi_write(oi, ios); 534 if (unlikely(ret)) { 535 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); 536 goto err; 537 } 538 539 atomic_inc(&pcol->sbi->s_curr_pending); 540 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 541 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), 542 pcol->length); 543 /* pages ownership was passed to pcol_copy */ 544 _pcol_reset(pcol); 545 return 0; 546 547err: 548 _unlock_pcol_pages(pcol, ret, WRITE); 549 pcol_free(pcol); 550 kfree(pcol_copy); 551 552 return ret; 553} 554 555/* writepage_strip is called either directly from writepage() or by the VFS from 556 * within write_cache_pages(), to add one more page to be written to storage. 557 * It will try to collect as many contiguous pages as possible. If a 558 * discontinuity is encountered or it runs out of resources it will submit the 559 * previous segment and will start a new collection. 560 * Eventually caller must submit the last segment if present. 561 */ 562static int writepage_strip(struct page *page, 563 struct writeback_control *wbc_unused, void *data) 564{ 565 struct page_collect *pcol = data; 566 struct inode *inode = pcol->inode; 567 struct exofs_i_info *oi = exofs_i(inode); 568 loff_t i_size = i_size_read(inode); 569 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 570 size_t len; 571 int ret; 572 573 BUG_ON(!PageLocked(page)); 574 575 ret = wait_obj_created(oi); 576 if (unlikely(ret)) 577 goto fail; 578 579 if (page->index < end_index) 580 /* in this case, the page is within the limits of the file */ 581 len = PAGE_CACHE_SIZE; 582 else { 583 len = i_size & ~PAGE_CACHE_MASK; 584 585 if (page->index > end_index || !len) { 586 /* in this case, the page is outside the limits 587 * (truncate in progress) 588 */ 589 ret = write_exec(pcol); 590 if (unlikely(ret)) 591 goto fail; 592 if (PageError(page)) 593 ClearPageError(page); 594 unlock_page(page); 595 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " 596 "outside the limits\n", 597 inode->i_ino, page->index); 598 return 0; 599 } 600 } 601 602try_again: 603 604 if (unlikely(pcol->pg_first == -1)) { 605 pcol->pg_first = page->index; 606 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 607 page->index)) { 608 /* Discontinuity detected, split the request */ 609 ret = write_exec(pcol); 610 if (unlikely(ret)) 611 goto fail; 612 613 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", 614 inode->i_ino, page->index); 615 goto try_again; 616 } 617 618 if (!pcol->pages) { 619 ret = pcol_try_alloc(pcol); 620 if (unlikely(ret)) 621 goto fail; 622 } 623 624 EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", 625 inode->i_ino, page->index, len); 626 627 ret = pcol_add_page(pcol, page, len); 628 if (unlikely(ret)) { 629 EXOFS_DBGMSG2("Failed pcol_add_page " 630 "nr_pages=%u total_length=0x%lx\n", 631 pcol->nr_pages, pcol->length); 632 633 /* split the request, next loop will start again */ 634 ret = write_exec(pcol); 635 if (unlikely(ret)) { 636 EXOFS_DBGMSG("write_exec failed => %d", ret); 637 goto fail; 638 } 639 640 goto try_again; 641 } 642 643 BUG_ON(PageWriteback(page)); 644 set_page_writeback(page); 645 646 return 0; 647 648fail: 649 EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", 650 inode->i_ino, page->index, ret); 651 set_bit(AS_EIO, &page->mapping->flags); 652 unlock_page(page); 653 return ret; 654} 655 656static int exofs_writepages(struct address_space *mapping, 657 struct writeback_control *wbc) 658{ 659 struct page_collect pcol; 660 long start, end, expected_pages; 661 int ret; 662 663 start = wbc->range_start >> PAGE_CACHE_SHIFT; 664 end = (wbc->range_end == LLONG_MAX) ? 665 start + mapping->nrpages : 666 wbc->range_end >> PAGE_CACHE_SHIFT; 667 668 if (start || end) 669 expected_pages = end - start + 1; 670 else 671 expected_pages = mapping->nrpages; 672 673 if (expected_pages < 32L) 674 expected_pages = 32L; 675 676 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " 677 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", 678 mapping->host->i_ino, wbc->range_start, wbc->range_end, 679 mapping->nrpages, start, end, expected_pages); 680 681 _pcol_init(&pcol, expected_pages, mapping->host); 682 683 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); 684 if (ret) { 685 EXOFS_ERR("write_cache_pages => %d\n", ret); 686 return ret; 687 } 688 689 return write_exec(&pcol); 690} 691 692static int exofs_writepage(struct page *page, struct writeback_control *wbc) 693{ 694 struct page_collect pcol; 695 int ret; 696 697 _pcol_init(&pcol, 1, page->mapping->host); 698 699 ret = writepage_strip(page, NULL, &pcol); 700 if (ret) { 701 EXOFS_ERR("exofs_writepage => %d\n", ret); 702 return ret; 703 } 704 705 return write_exec(&pcol); 706} 707 708/* i_mutex held using inode->i_size directly */ 709static void _write_failed(struct inode *inode, loff_t to) 710{ 711 if (to > inode->i_size) 712 truncate_pagecache(inode, to, inode->i_size); 713} 714 715int exofs_write_begin(struct file *file, struct address_space *mapping, 716 loff_t pos, unsigned len, unsigned flags, 717 struct page **pagep, void **fsdata) 718{ 719 int ret = 0; 720 struct page *page; 721 722 page = *pagep; 723 if (page == NULL) { 724 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 725 fsdata); 726 if (ret) { 727 EXOFS_DBGMSG("simple_write_begin failed\n"); 728 goto out; 729 } 730 731 page = *pagep; 732 } 733 734 /* read modify write */ 735 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 736 loff_t i_size = i_size_read(mapping->host); 737 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 738 size_t rlen; 739 740 if (page->index < end_index) 741 rlen = PAGE_CACHE_SIZE; 742 else if (page->index == end_index) 743 rlen = i_size & ~PAGE_CACHE_MASK; 744 else 745 rlen = 0; 746 747 if (!rlen) { 748 clear_highpage(page); 749 SetPageUptodate(page); 750 goto out; 751 } 752 753 ret = _readpage(page, true); 754 if (ret) { 755 /*SetPageError was done by _readpage. Is it ok?*/ 756 unlock_page(page); 757 EXOFS_DBGMSG("__readpage failed\n"); 758 } 759 } 760out: 761 if (unlikely(ret)) 762 _write_failed(mapping->host, pos + len); 763 764 return ret; 765} 766 767static int exofs_write_begin_export(struct file *file, 768 struct address_space *mapping, 769 loff_t pos, unsigned len, unsigned flags, 770 struct page **pagep, void **fsdata) 771{ 772 *pagep = NULL; 773 774 return exofs_write_begin(file, mapping, pos, len, flags, pagep, 775 fsdata); 776} 777 778static int exofs_write_end(struct file *file, struct address_space *mapping, 779 loff_t pos, unsigned len, unsigned copied, 780 struct page *page, void *fsdata) 781{ 782 struct inode *inode = mapping->host; 783 /* According to comment in simple_write_end i_mutex is held */ 784 loff_t i_size = inode->i_size; 785 int ret; 786 787 ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); 788 if (unlikely(ret)) 789 _write_failed(inode, pos + len); 790 791 /* TODO: once simple_write_end marks inode dirty remove */ 792 if (i_size != inode->i_size) 793 mark_inode_dirty(inode); 794 return ret; 795} 796 797static int exofs_releasepage(struct page *page, gfp_t gfp) 798{ 799 EXOFS_DBGMSG("page 0x%lx\n", page->index); 800 WARN_ON(1); 801 return 0; 802} 803 804static void exofs_invalidatepage(struct page *page, unsigned long offset) 805{ 806 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); 807 WARN_ON(1); 808} 809 810const struct address_space_operations exofs_aops = { 811 .readpage = exofs_readpage, 812 .readpages = exofs_readpages, 813 .writepage = exofs_writepage, 814 .writepages = exofs_writepages, 815 .write_begin = exofs_write_begin_export, 816 .write_end = exofs_write_end, 817 .releasepage = exofs_releasepage, 818 .set_page_dirty = __set_page_dirty_nobuffers, 819 .invalidatepage = exofs_invalidatepage, 820 821 /* Not implemented Yet */ 822 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */ 824 825 /* With these NULL has special meaning or default is not exported */ 826 .get_xip_mem = NULL, 827 .migratepage = NULL, 828 .launder_page = NULL, 829 .is_partially_uptodate = NULL, 830 .error_remove_page = NULL, 831}; 832 833/****************************************************************************** 834 * INODE OPERATIONS 835 *****************************************************************************/ 836 837/* 838 * Test whether an inode is a fast symlink. 839 */ 840static inline int exofs_inode_is_fast_symlink(struct inode *inode) 841{ 842 struct exofs_i_info *oi = exofs_i(inode); 843 844 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 845} 846 847const struct osd_attr g_attr_logical_length = ATTR_DEF( 848 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 849 850static int _do_truncate(struct inode *inode, loff_t newsize) 851{ 852 struct exofs_i_info *oi = exofs_i(inode); 853 int ret; 854 855 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 856 857 ret = exofs_oi_truncate(oi, (u64)newsize); 858 if (likely(!ret)) 859 truncate_setsize(inode, newsize); 860 861 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", 862 inode->i_ino, newsize, ret); 863 return ret; 864} 865 866/* 867 * Set inode attributes - update size attribute on OSD if needed, 868 * otherwise just call generic functions. 869 */ 870int exofs_setattr(struct dentry *dentry, struct iattr *iattr) 871{ 872 struct inode *inode = dentry->d_inode; 873 int error; 874 875 /* if we are about to modify an object, and it hasn't been 876 * created yet, wait 877 */ 878 error = wait_obj_created(exofs_i(inode)); 879 if (unlikely(error)) 880 return error; 881 882 error = inode_change_ok(inode, iattr); 883 if (unlikely(error)) 884 return error; 885 886 if ((iattr->ia_valid & ATTR_SIZE) && 887 iattr->ia_size != i_size_read(inode)) { 888 error = _do_truncate(inode, iattr->ia_size); 889 if (unlikely(error)) 890 return error; 891 } 892 893 setattr_copy(inode, iattr); 894 mark_inode_dirty(inode); 895 return 0; 896} 897 898static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( 899 EXOFS_APAGE_FS_DATA, 900 EXOFS_ATTR_INODE_FILE_LAYOUT, 901 0); 902static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( 903 EXOFS_APAGE_FS_DATA, 904 EXOFS_ATTR_INODE_DIR_LAYOUT, 905 0); 906 907/* 908 * Read the Linux inode info from the OSD, and return it as is. In exofs the 909 * inode info is in an application specific page/attribute of the osd-object. 910 */ 911static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 912 struct exofs_fcb *inode) 913{ 914 struct exofs_sb_info *sbi = sb->s_fs_info; 915 struct osd_attr attrs[] = { 916 [0] = g_attr_inode_data, 917 [1] = g_attr_inode_file_layout, 918 [2] = g_attr_inode_dir_layout, 919 }; 920 struct exofs_io_state *ios; 921 struct exofs_on_disk_inode_layout *layout; 922 int ret; 923 924 ret = exofs_get_io_state(&sbi->layout, &ios); 925 if (unlikely(ret)) { 926 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 927 return ret; 928 } 929 930 ios->obj.id = exofs_oi_objno(oi); 931 exofs_make_credential(oi->i_cred, &ios->obj); 932 ios->cred = oi->i_cred; 933 934 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); 935 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); 936 937 ios->in_attr = attrs; 938 ios->in_attr_len = ARRAY_SIZE(attrs); 939 940 ret = exofs_sbi_read(ios); 941 if (unlikely(ret)) { 942 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", 943 _LLU(ios->obj.id), ret); 944 memset(inode, 0, sizeof(*inode)); 945 inode->i_mode = 0040000 | (0777 & ~022); 946 /* If object is lost on target we might as well enable it's 947 * delete. 948 */ 949 if ((ret == -ENOENT) || (ret == -EINVAL)) 950 ret = 0; 951 goto out; 952 } 953 954 ret = extract_attr_from_ios(ios, &attrs[0]); 955 if (ret) { 956 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 957 goto out; 958 } 959 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); 960 memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); 961 962 ret = extract_attr_from_ios(ios, &attrs[1]); 963 if (ret) { 964 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 965 goto out; 966 } 967 if (attrs[1].len) { 968 layout = attrs[1].val_ptr; 969 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { 970 EXOFS_ERR("%s: unsupported files layout %d\n", 971 __func__, layout->gen_func); 972 ret = -ENOTSUPP; 973 goto out; 974 } 975 } 976 977 ret = extract_attr_from_ios(ios, &attrs[2]); 978 if (ret) { 979 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 980 goto out; 981 } 982 if (attrs[2].len) { 983 layout = attrs[2].val_ptr; 984 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { 985 EXOFS_ERR("%s: unsupported meta-data layout %d\n", 986 __func__, layout->gen_func); 987 ret = -ENOTSUPP; 988 goto out; 989 } 990 } 991 992out: 993 exofs_put_io_state(ios); 994 return ret; 995} 996 997static void __oi_init(struct exofs_i_info *oi) 998{ 999 init_waitqueue_head(&oi->i_wq); 1000 oi->i_flags = 0; 1001} 1002/* 1003 * Fill in an inode read from the OSD and set it up for use 1004 */ 1005struct inode *exofs_iget(struct super_block *sb, unsigned long ino) 1006{ 1007 struct exofs_i_info *oi; 1008 struct exofs_fcb fcb; 1009 struct inode *inode; 1010 int ret; 1011 1012 inode = iget_locked(sb, ino); 1013 if (!inode) 1014 return ERR_PTR(-ENOMEM); 1015 if (!(inode->i_state & I_NEW)) 1016 return inode; 1017 oi = exofs_i(inode); 1018 __oi_init(oi); 1019 1020 /* read the inode from the osd */ 1021 ret = exofs_get_inode(sb, oi, &fcb); 1022 if (ret) 1023 goto bad_inode; 1024 1025 set_obj_created(oi); 1026 1027 /* copy stuff from on-disk struct to in-memory struct */ 1028 inode->i_mode = le16_to_cpu(fcb.i_mode); 1029 inode->i_uid = le32_to_cpu(fcb.i_uid); 1030 inode->i_gid = le32_to_cpu(fcb.i_gid); 1031 inode->i_nlink = le16_to_cpu(fcb.i_links_count); 1032 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); 1033 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); 1034 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); 1035 inode->i_ctime.tv_nsec = 1036 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; 1037 oi->i_commit_size = le64_to_cpu(fcb.i_size); 1038 i_size_write(inode, oi->i_commit_size); 1039 inode->i_blkbits = EXOFS_BLKSHIFT; 1040 inode->i_generation = le32_to_cpu(fcb.i_generation); 1041 1042 oi->i_dir_start_lookup = 0; 1043 1044 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 1045 ret = -ESTALE; 1046 goto bad_inode; 1047 } 1048 1049 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1050 if (fcb.i_data[0]) 1051 inode->i_rdev = 1052 old_decode_dev(le32_to_cpu(fcb.i_data[0])); 1053 else 1054 inode->i_rdev = 1055 new_decode_dev(le32_to_cpu(fcb.i_data[1])); 1056 } else { 1057 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1058 } 1059 1060 inode->i_mapping->backing_dev_info = sb->s_bdi; 1061 if (S_ISREG(inode->i_mode)) { 1062 inode->i_op = &exofs_file_inode_operations; 1063 inode->i_fop = &exofs_file_operations; 1064 inode->i_mapping->a_ops = &exofs_aops; 1065 } else if (S_ISDIR(inode->i_mode)) { 1066 inode->i_op = &exofs_dir_inode_operations; 1067 inode->i_fop = &exofs_dir_operations; 1068 inode->i_mapping->a_ops = &exofs_aops; 1069 } else if (S_ISLNK(inode->i_mode)) { 1070 if (exofs_inode_is_fast_symlink(inode)) 1071 inode->i_op = &exofs_fast_symlink_inode_operations; 1072 else { 1073 inode->i_op = &exofs_symlink_inode_operations; 1074 inode->i_mapping->a_ops = &exofs_aops; 1075 } 1076 } else { 1077 inode->i_op = &exofs_special_inode_operations; 1078 if (fcb.i_data[0]) 1079 init_special_inode(inode, inode->i_mode, 1080 old_decode_dev(le32_to_cpu(fcb.i_data[0]))); 1081 else 1082 init_special_inode(inode, inode->i_mode, 1083 new_decode_dev(le32_to_cpu(fcb.i_data[1]))); 1084 } 1085 1086 unlock_new_inode(inode); 1087 return inode; 1088 1089bad_inode: 1090 iget_failed(inode); 1091 return ERR_PTR(ret); 1092} 1093 1094int __exofs_wait_obj_created(struct exofs_i_info *oi) 1095{ 1096 if (!obj_created(oi)) { 1097 EXOFS_DBGMSG("!obj_created\n"); 1098 BUG_ON(!obj_2bcreated(oi)); 1099 wait_event(oi->i_wq, obj_created(oi)); 1100 EXOFS_DBGMSG("wait_event done\n"); 1101 } 1102 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1103} 1104 1105/* 1106 * Callback function from exofs_new_inode(). The important thing is that we 1107 * set the obj_created flag so that other methods know that the object exists on 1108 * the OSD. 1109 */ 1110static void create_done(struct exofs_io_state *ios, void *p) 1111{ 1112 struct inode *inode = p; 1113 struct exofs_i_info *oi = exofs_i(inode); 1114 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1115 int ret; 1116 1117 ret = exofs_check_io(ios, NULL); 1118 exofs_put_io_state(ios); 1119 1120 atomic_dec(&sbi->s_curr_pending); 1121 1122 if (unlikely(ret)) { 1123 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", 1124 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1125 /*TODO: When FS is corrupted creation can fail, object already 1126 * exist. Get rid of this asynchronous creation, if exist 1127 * increment the obj counter and try the next object. Until we 1128 * succeed. All these dangling objects will be made into lost 1129 * files by chkfs.exofs 1130 */ 1131 } 1132 1133 set_obj_created(oi); 1134 1135 wake_up(&oi->i_wq); 1136} 1137 1138/* 1139 * Set up a new inode and create an object for it on the OSD 1140 */ 1141struct inode *exofs_new_inode(struct inode *dir, int mode) 1142{ 1143 struct super_block *sb; 1144 struct inode *inode; 1145 struct exofs_i_info *oi; 1146 struct exofs_sb_info *sbi; 1147 struct exofs_io_state *ios; 1148 int ret; 1149 1150 sb = dir->i_sb; 1151 inode = new_inode(sb); 1152 if (!inode) 1153 return ERR_PTR(-ENOMEM); 1154 1155 oi = exofs_i(inode); 1156 __oi_init(oi); 1157 1158 set_obj_2bcreated(oi); 1159 1160 sbi = sb->s_fs_info; 1161 1162 inode->i_mapping->backing_dev_info = sb->s_bdi; 1163 inode_init_owner(inode, dir, mode); 1164 inode->i_ino = sbi->s_nextid++; 1165 inode->i_blkbits = EXOFS_BLKSHIFT; 1166 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1167 oi->i_commit_size = inode->i_size = 0; 1168 spin_lock(&sbi->s_next_gen_lock); 1169 inode->i_generation = sbi->s_next_generation++; 1170 spin_unlock(&sbi->s_next_gen_lock); 1171 insert_inode_hash(inode); 1172 1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1174 1175 mark_inode_dirty(inode); 1176 1177 ret = exofs_get_io_state(&sbi->layout, &ios); 1178 if (unlikely(ret)) { 1179 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1180 return ERR_PTR(ret); 1181 } 1182 1183 ios->obj.id = exofs_oi_objno(oi); 1184 exofs_make_credential(oi->i_cred, &ios->obj); 1185 1186 ios->done = create_done; 1187 ios->private = inode; 1188 ios->cred = oi->i_cred; 1189 ret = exofs_sbi_create(ios); 1190 if (ret) { 1191 exofs_put_io_state(ios); 1192 return ERR_PTR(ret); 1193 } 1194 atomic_inc(&sbi->s_curr_pending); 1195 1196 return inode; 1197} 1198 1199/* 1200 * struct to pass two arguments to update_inode's callback 1201 */ 1202struct updatei_args { 1203 struct exofs_sb_info *sbi; 1204 struct exofs_fcb fcb; 1205}; 1206 1207/* 1208 * Callback function from exofs_update_inode(). 1209 */ 1210static void updatei_done(struct exofs_io_state *ios, void *p) 1211{ 1212 struct updatei_args *args = p; 1213 1214 exofs_put_io_state(ios); 1215 1216 atomic_dec(&args->sbi->s_curr_pending); 1217 1218 kfree(args); 1219} 1220 1221/* 1222 * Write the inode to the OSD. Just fill up the struct, and set the attribute 1223 * synchronously or asynchronously depending on the do_sync flag. 1224 */ 1225static int exofs_update_inode(struct inode *inode, int do_sync) 1226{ 1227 struct exofs_i_info *oi = exofs_i(inode); 1228 struct super_block *sb = inode->i_sb; 1229 struct exofs_sb_info *sbi = sb->s_fs_info; 1230 struct exofs_io_state *ios; 1231 struct osd_attr attr; 1232 struct exofs_fcb *fcb; 1233 struct updatei_args *args; 1234 int ret; 1235 1236 args = kzalloc(sizeof(*args), GFP_KERNEL); 1237 if (!args) { 1238 EXOFS_DBGMSG("Failed kzalloc of args\n"); 1239 return -ENOMEM; 1240 } 1241 1242 fcb = &args->fcb; 1243 1244 fcb->i_mode = cpu_to_le16(inode->i_mode); 1245 fcb->i_uid = cpu_to_le32(inode->i_uid); 1246 fcb->i_gid = cpu_to_le32(inode->i_gid); 1247 fcb->i_links_count = cpu_to_le16(inode->i_nlink); 1248 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 1249 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 1250 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 1251 oi->i_commit_size = i_size_read(inode); 1252 fcb->i_size = cpu_to_le64(oi->i_commit_size); 1253 fcb->i_generation = cpu_to_le32(inode->i_generation); 1254 1255 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1256 if (old_valid_dev(inode->i_rdev)) { 1257 fcb->i_data[0] = 1258 cpu_to_le32(old_encode_dev(inode->i_rdev)); 1259 fcb->i_data[1] = 0; 1260 } else { 1261 fcb->i_data[0] = 0; 1262 fcb->i_data[1] = 1263 cpu_to_le32(new_encode_dev(inode->i_rdev)); 1264 fcb->i_data[2] = 0; 1265 } 1266 } else 1267 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1268 1269 ret = exofs_get_io_state(&sbi->layout, &ios); 1270 if (unlikely(ret)) { 1271 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1272 goto free_args; 1273 } 1274 1275 attr = g_attr_inode_data; 1276 attr.val_ptr = fcb; 1277 ios->out_attr_len = 1; 1278 ios->out_attr = &attr; 1279 1280 wait_obj_created(oi); 1281 1282 if (!do_sync) { 1283 args->sbi = sbi; 1284 ios->done = updatei_done; 1285 ios->private = args; 1286 } 1287 1288 ret = exofs_oi_write(oi, ios); 1289 if (!do_sync && !ret) { 1290 atomic_inc(&sbi->s_curr_pending); 1291 goto out; /* deallocation in updatei_done */ 1292 } 1293 1294 exofs_put_io_state(ios); 1295free_args: 1296 kfree(args); 1297out: 1298 EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", 1299 inode->i_ino, do_sync, ret); 1300 return ret; 1301} 1302 1303int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1304{ 1305 /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ 1306 return exofs_update_inode(inode, 1); 1307} 1308 1309/* 1310 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1311 * do. 1312 */ 1313static void delete_done(struct exofs_io_state *ios, void *p) 1314{ 1315 struct exofs_sb_info *sbi = p; 1316 1317 exofs_put_io_state(ios); 1318 1319 atomic_dec(&sbi->s_curr_pending); 1320} 1321 1322/* 1323 * Called when the refcount of an inode reaches zero. We remove the object 1324 * from the OSD here. We make sure the object was created before we try and 1325 * delete it. 1326 */ 1327void exofs_evict_inode(struct inode *inode) 1328{ 1329 struct exofs_i_info *oi = exofs_i(inode); 1330 struct super_block *sb = inode->i_sb; 1331 struct exofs_sb_info *sbi = sb->s_fs_info; 1332 struct exofs_io_state *ios; 1333 int ret; 1334 1335 truncate_inode_pages(&inode->i_data, 0); 1336 1337 /* TODO: should do better here */ 1338 if (inode->i_nlink || is_bad_inode(inode)) 1339 goto no_delete; 1340 1341 inode->i_size = 0; 1342 end_writeback(inode); 1343 1344 /* if we are deleting an obj that hasn't been created yet, wait. 1345 * This also makes sure that create_done cannot be called with an 1346 * already evicted inode. 1347 */ 1348 wait_obj_created(oi); 1349 /* ignore the error, attempt a remove anyway */ 1350 1351 /* Now Remove the OSD objects */ 1352 ret = exofs_get_io_state(&sbi->layout, &ios); 1353 if (unlikely(ret)) { 1354 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1355 return; 1356 } 1357 1358 ios->obj.id = exofs_oi_objno(oi); 1359 ios->done = delete_done; 1360 ios->private = sbi; 1361 ios->cred = oi->i_cred; 1362 ret = exofs_sbi_remove(ios); 1363 if (ret) { 1364 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); 1365 exofs_put_io_state(ios); 1366 return; 1367 } 1368 atomic_inc(&sbi->s_curr_pending); 1369 1370 return; 1371 1372no_delete: 1373 end_writeback(inode); 1374}