Merge tag 'ceph-for-4.14-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"The highlights include:

- a large series of fixes and improvements to the snapshot-handling
code (Zheng Yan)

- individual read/write OSD requests passed down to libceph are now
limited to 16M in size to avoid hitting OSD-side limits (Zheng Yan; see the sketch after this list)

- encode MStatfs v2 message to allow for more accurate space usage
reporting (Douglas Fuller)

- switch to the new writeback error tracking infrastructure (Jeff
Layton)"

* tag 'ceph-for-4.14-rc1' of git://github.com/ceph/ceph-client: (35 commits)
ceph: stop on-going cached readdir if mds revokes FILE_SHARED cap
ceph: wait on writeback after writing snapshot data
ceph: fix capsnap dirty pages accounting
ceph: ignore wbc->range_{start,end} when write back snapshot data
ceph: fix "range cyclic" mode writepages
ceph: cleanup local variables in ceph_writepages_start()
ceph: optimize pagevec iterating in ceph_writepages_start()
ceph: make writepage_nounlock() invalidate page that beyonds EOF
ceph: properly get capsnap's size in get_oldest_context()
ceph: remove stale check in ceph_invalidatepage()
ceph: queue cap snap only when snap realm's context changes
ceph: handle race between vmtruncate and queuing cap snap
ceph: fix message order check in handle_cap_export()
ceph: fix NULL pointer dereference in ceph_flush_snaps()
ceph: adjust 36 checks for NULL pointers
ceph: delete an unnecessary return statement in update_dentry_lease()
ceph: ENOMEM pr_err in __get_or_create_frag() is redundant
ceph: check negative offsets in ceph_llseek()
ceph: more accurate statfs
ceph: properly set snap follows for cap reconnect
...

+420 -351
+1 -1
drivers/block/rbd.c
··· 3435 struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3436 struct rbd_device, lock_dwork); 3437 enum rbd_lock_state lock_state; 3438 - int ret; 3439 3440 dout("%s rbd_dev %p\n", __func__, rbd_dev); 3441 again:
··· 3435 struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3436 struct rbd_device, lock_dwork); 3437 enum rbd_lock_state lock_state; 3438 + int ret = 0; 3439 3440 dout("%s rbd_dev %p\n", __func__, rbd_dev); 3441 again:
+224 -179
fs/ceph/addr.c
··· 152 153 ceph_invalidate_fscache_page(inode, page); 154 155 if (!PagePrivate(page)) 156 return; 157 - 158 - /* 159 - * We can get non-dirty pages here due to races between 160 - * set_page_dirty and truncate_complete_page; just spit out a 161 - * warning, in case we end up with accounting problems later. 162 - */ 163 - if (!PageDirty(page)) 164 - pr_err("%p invalidatepage %p page not dirty\n", inode, page); 165 166 ClearPageChecked(page); 167 ··· 448 if (rc == 0) 449 goto out; 450 451 - if (fsc->mount_options->rsize >= PAGE_SIZE) 452 - max = (fsc->mount_options->rsize + PAGE_SIZE - 1) 453 - >> PAGE_SHIFT; 454 - 455 - dout("readpages %p file %p nr_pages %d max %d\n", inode, 456 - file, nr_pages, 457 - max); 458 while (!list_empty(page_list)) { 459 rc = start_read(inode, page_list, max); 460 if (rc < 0) ··· 463 return rc; 464 } 465 466 /* 467 * Get ref for the oldest snapc for an inode with dirty data... that is, the 468 * only snap context we are allowed to write back. 469 */ 470 - static struct ceph_snap_context *get_oldest_context(struct inode *inode, 471 - loff_t *snap_size, 472 - u64 *truncate_size, 473 - u32 *truncate_seq) 474 { 475 struct ceph_inode_info *ci = ceph_inode(inode); 476 struct ceph_snap_context *snapc = NULL; ··· 488 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 489 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 490 capsnap->context, capsnap->dirty_pages); 491 - if (capsnap->dirty_pages) { 492 - snapc = ceph_get_snap_context(capsnap->context); 493 - if (snap_size) 494 - *snap_size = capsnap->size; 495 - if (truncate_size) 496 - *truncate_size = capsnap->truncate_size; 497 - if (truncate_seq) 498 - *truncate_seq = capsnap->truncate_seq; 499 - break; 500 } 501 } 502 if (!snapc && ci->i_wrbuffer_ref_head) { 503 snapc = ceph_get_snap_context(ci->i_head_snapc); 504 dout(" head snapc %p has %d dirty pages\n", 505 snapc, ci->i_wrbuffer_ref_head); 506 - if (truncate_size) 507 - *truncate_size = ci->i_truncate_size; 508 - if (truncate_seq) 509 - *truncate_seq = ci->i_truncate_seq; 510 } 511 spin_unlock(&ci->i_ceph_lock); 512 return snapc; 513 } 514 515 /* ··· 571 struct inode *inode; 572 struct ceph_inode_info *ci; 573 struct ceph_fs_client *fsc; 574 - struct ceph_osd_client *osdc; 575 struct ceph_snap_context *snapc, *oldest; 576 loff_t page_off = page_offset(page); 577 - loff_t snap_size = -1; 578 long writeback_stat; 579 - u64 truncate_size; 580 - u32 truncate_seq; 581 int err, len = PAGE_SIZE; 582 583 dout("writepage %p idx %lu\n", page, page->index); 584 585 inode = page->mapping->host; 586 ci = ceph_inode(inode); 587 fsc = ceph_inode_to_client(inode); 588 - osdc = &fsc->client->osdc; 589 590 /* verify this is a writeable snap context */ 591 snapc = page_snap_context(page); 592 - if (snapc == NULL) { 593 dout("writepage %p page %p not dirty?\n", inode, page); 594 return 0; 595 } 596 - oldest = get_oldest_context(inode, &snap_size, 597 - &truncate_size, &truncate_seq); 598 if (snapc->seq > oldest->seq) { 599 dout("writepage %p page %p snapc %p not writeable - noop\n", 600 inode, page, snapc); ··· 601 } 602 ceph_put_snap_context(oldest); 603 604 - if (snap_size == -1) 605 - snap_size = i_size_read(inode); 606 - 607 /* is this a partial page at end of file? 
*/ 608 - if (page_off >= snap_size) { 609 - dout("%p page eof %llu\n", page, snap_size); 610 return 0; 611 } 612 613 - if (snap_size < page_off + len) 614 - len = snap_size - page_off; 615 616 - dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 617 - inode, page, page->index, page_off, len, snapc); 618 619 writeback_stat = atomic_long_inc_return(&fsc->writeback_count); 620 if (writeback_stat > ··· 620 set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); 621 622 set_page_writeback(page); 623 - err = ceph_osdc_writepages(osdc, ceph_vino(inode), 624 - &ci->i_layout, snapc, 625 - page_off, len, 626 - truncate_seq, truncate_size, 627 &inode->i_mtime, &page, 1); 628 if (err < 0) { 629 struct writeback_control tmp_wbc; ··· 784 struct ceph_inode_info *ci = ceph_inode(inode); 785 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 786 struct ceph_vino vino = ceph_vino(inode); 787 - pgoff_t index, start, end; 788 - int range_whole = 0; 789 - int should_loop = 1; 790 - pgoff_t max_pages = 0, max_pages_ever = 0; 791 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; 792 struct pagevec pvec; 793 - int done = 0; 794 int rc = 0; 795 unsigned int wsize = i_blocksize(inode); 796 struct ceph_osd_request *req = NULL; 797 - int do_sync = 0; 798 - loff_t snap_size, i_size; 799 - u64 truncate_size; 800 - u32 truncate_seq; 801 802 - /* 803 - * Include a 'sync' in the OSD request if this is a data 804 - * integrity write (e.g., O_SYNC write or fsync()), or if our 805 - * cap is being revoked. 806 - */ 807 - if ((wbc->sync_mode == WB_SYNC_ALL) || 808 - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) 809 - do_sync = 1; 810 - dout("writepages_start %p dosync=%d (mode=%s)\n", 811 - inode, do_sync, 812 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 813 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 814 ··· 807 mapping_set_error(mapping, -EIO); 808 return -EIO; /* we're in a forced umount, don't write! */ 809 } 810 - if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) 811 wsize = fsc->mount_options->wsize; 812 - if (wsize < PAGE_SIZE) 813 - wsize = PAGE_SIZE; 814 - max_pages_ever = wsize >> PAGE_SHIFT; 815 816 pagevec_init(&pvec, 0); 817 818 - /* where to start/end? */ 819 - if (wbc->range_cyclic) { 820 - start = mapping->writeback_index; /* Start from prev offset */ 821 - end = -1; 822 - dout(" cyclic, start at %lu\n", start); 823 - } else { 824 - start = wbc->range_start >> PAGE_SHIFT; 825 - end = wbc->range_end >> PAGE_SHIFT; 826 - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 827 - range_whole = 1; 828 - should_loop = 0; 829 - dout(" not cyclic, %lu to %lu\n", start, end); 830 - } 831 - index = start; 832 833 retry: 834 /* find oldest snap context with dirty data */ 835 - ceph_put_snap_context(snapc); 836 - snap_size = -1; 837 - snapc = get_oldest_context(inode, &snap_size, 838 - &truncate_size, &truncate_seq); 839 if (!snapc) { 840 /* hmm, why does writepages get called when there 841 is no dirty data? */ ··· 827 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 828 snapc, snapc->seq, snapc->num_snaps); 829 830 - i_size = i_size_read(inode); 831 - 832 - if (last_snapc && snapc != last_snapc) { 833 - /* if we switched to a newer snapc, restart our scan at the 834 - * start of the original file range. 
*/ 835 - dout(" snapc differs from last pass, restarting at %lu\n", 836 - index); 837 - index = start; 838 } 839 last_snapc = snapc; 840 841 - while (!done && index <= end) { 842 - unsigned i; 843 - int first; 844 - pgoff_t strip_unit_end = 0; 845 int num_ops = 0, op_idx; 846 - int pvec_pages, locked_pages = 0; 847 struct page **pages = NULL, **data_pages; 848 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 849 struct page *page; 850 - int want; 851 u64 offset = 0, len = 0; 852 853 - max_pages = max_pages_ever; 854 855 get_more_pages: 856 - first = -1; 857 - want = min(end - index, 858 - min((pgoff_t)PAGEVEC_SIZE, 859 - max_pages - (pgoff_t)locked_pages) - 1) 860 - + 1; 861 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, 862 PAGECACHE_TAG_DIRTY, 863 - want); 864 dout("pagevec_lookup_tag got %d\n", pvec_pages); 865 if (!pvec_pages && !locked_pages) 866 break; ··· 893 unlikely(page->mapping != mapping)) { 894 dout("!dirty or !mapping %p\n", page); 895 unlock_page(page); 896 - break; 897 } 898 - if (!wbc->range_cyclic && page->index > end) { 899 dout("end of range %p\n", page); 900 - done = 1; 901 unlock_page(page); 902 break; 903 } ··· 910 unlock_page(page); 911 break; 912 } 913 - if (wbc->sync_mode != WB_SYNC_NONE) { 914 - dout("waiting on writeback %p\n", page); 915 - wait_on_page_writeback(page); 916 - } 917 - if (page_offset(page) >= 918 - (snap_size == -1 ? i_size : snap_size)) { 919 - dout("%p page eof %llu\n", page, 920 - (snap_size == -1 ? i_size : snap_size)); 921 - done = 1; 922 unlock_page(page); 923 break; 924 } 925 if (PageWriteback(page)) { 926 - dout("%p under writeback\n", page); 927 - unlock_page(page); 928 - break; 929 } 930 931 /* only if matching snap context */ 932 pgsnapc = page_snap_context(page); 933 - if (pgsnapc->seq > snapc->seq) { 934 - dout("page snapc %p %lld > oldest %p %lld\n", 935 pgsnapc, pgsnapc->seq, snapc, snapc->seq); 936 unlock_page(page); 937 - if (!locked_pages) 938 - continue; /* keep looking for snap */ 939 - break; 940 } 941 942 if (!clear_page_dirty_for_io(page)) { 943 dout("%p !clear_page_dirty_for_io\n", page); 944 unlock_page(page); 945 - break; 946 } 947 948 /* ··· 966 break; 967 } 968 969 - num_ops = 1 + do_sync; 970 strip_unit_end = page->index + 971 ((len - 1) >> PAGE_SHIFT); 972 ··· 996 } 997 998 /* note position of first page in pvec */ 999 - if (first < 0) 1000 - first = i; 1001 dout("%p will write page %p idx %lu\n", 1002 inode, page, page->index); 1003 ··· 1006 BLK_RW_ASYNC); 1007 } 1008 1009 - pages[locked_pages] = page; 1010 - locked_pages++; 1011 len += PAGE_SIZE; 1012 } 1013 ··· 1017 if (!locked_pages) 1018 goto release_pvec_pages; 1019 if (i) { 1020 - int j; 1021 - BUG_ON(!locked_pages || first < 0); 1022 1023 if (pvec_pages && i == pvec_pages && 1024 locked_pages < max_pages) { 1025 dout("reached end pvec, trying for more\n"); 1026 - pagevec_reinit(&pvec); 1027 goto get_more_pages; 1028 } 1029 - 1030 - /* shift unused pages over in the pvec... we 1031 - * will need to release them below. 
*/ 1032 - for (j = i; j < pvec_pages; j++) { 1033 - dout(" pvec leftover page %p\n", pvec.pages[j]); 1034 - pvec.pages[j-i+first] = pvec.pages[j]; 1035 - } 1036 - pvec.nr -= i-first; 1037 } 1038 1039 new_request: ··· 1043 req = ceph_osdc_new_request(&fsc->client->osdc, 1044 &ci->i_layout, vino, 1045 offset, &len, 0, num_ops, 1046 - CEPH_OSD_OP_WRITE, 1047 - CEPH_OSD_FLAG_WRITE, 1048 - snapc, truncate_seq, 1049 - truncate_size, false); 1050 if (IS_ERR(req)) { 1051 req = ceph_osdc_new_request(&fsc->client->osdc, 1052 &ci->i_layout, vino, ··· 1054 CEPH_OSD_SLAB_OPS), 1055 CEPH_OSD_OP_WRITE, 1056 CEPH_OSD_FLAG_WRITE, 1057 - snapc, truncate_seq, 1058 - truncate_size, true); 1059 BUG_ON(IS_ERR(req)); 1060 } 1061 BUG_ON(len < page_offset(pages[locked_pages - 1]) + ··· 1071 for (i = 0; i < locked_pages; i++) { 1072 u64 cur_offset = page_offset(pages[i]); 1073 if (offset + len != cur_offset) { 1074 - if (op_idx + do_sync + 1 == req->r_num_ops) 1075 break; 1076 osd_req_op_extent_dup_last(req, op_idx, 1077 cur_offset - offset); ··· 1092 len += PAGE_SIZE; 1093 } 1094 1095 - if (snap_size != -1) { 1096 - len = min(len, snap_size - offset); 1097 } else if (i == locked_pages) { 1098 /* writepages_finish() clears writeback pages 1099 * according to the data length, so make sure 1100 * data length covers all locked pages */ 1101 u64 min_len = len + 1 - PAGE_SIZE; 1102 - len = min(len, (u64)i_size_read(inode) - offset); 1103 len = max(len, min_len); 1104 } 1105 dout("writepages got pages at %llu~%llu\n", offset, len); ··· 1109 0, !!pool, false); 1110 osd_req_op_extent_update(req, op_idx, len); 1111 1112 - if (do_sync) { 1113 - op_idx++; 1114 - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); 1115 - } 1116 BUG_ON(op_idx + 1 != req->r_num_ops); 1117 1118 pool = NULL; 1119 if (i < locked_pages) { 1120 BUG_ON(num_ops <= req->r_num_ops); 1121 num_ops -= req->r_num_ops; 1122 - num_ops += do_sync; 1123 locked_pages -= i; 1124 1125 /* allocate new pages array for next request */ ··· 1146 if (pages) 1147 goto new_request; 1148 1149 - if (wbc->nr_to_write <= 0) 1150 - done = 1; 1151 1152 release_pvec_pages: 1153 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, 1154 pvec.nr ? pvec.pages[0] : NULL); 1155 pagevec_release(&pvec); 1156 - 1157 - if (locked_pages && !done) 1158 - goto retry; 1159 } 1160 1161 if (should_loop && !done) { 1162 /* more to do; loop back to beginning of file */ 1163 dout("writepages looping back to beginning of file\n"); 1164 - should_loop = 0; 1165 index = 0; 1166 goto retry; 1167 } ··· 1199 1200 out: 1201 ceph_osdc_put_request(req); 1202 - ceph_put_snap_context(snapc); 1203 - dout("writepages done, rc = %d\n", rc); 1204 return rc; 1205 } 1206 ··· 1212 static int context_is_writeable_or_written(struct inode *inode, 1213 struct ceph_snap_context *snapc) 1214 { 1215 - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, 1216 - NULL, NULL); 1217 int ret = !oldest || snapc->seq <= oldest->seq; 1218 1219 ceph_put_snap_context(oldest); ··· 1257 * this page is already dirty in another (older) snap 1258 * context! is it writeable now? 1259 */ 1260 - oldest = get_oldest_context(inode, NULL, NULL, NULL); 1261 - 1262 if (snapc->seq > oldest->seq) { 1263 ceph_put_snap_context(oldest); 1264 dout(" page %p snapc %p not current or oldest\n",
··· 152 153 ceph_invalidate_fscache_page(inode, page); 154 155 + WARN_ON(!PageLocked(page)); 156 if (!PagePrivate(page)) 157 return; 158 159 ClearPageChecked(page); 160 ··· 455 if (rc == 0) 456 goto out; 457 458 + max = fsc->mount_options->rsize >> PAGE_SHIFT; 459 + dout("readpages %p file %p nr_pages %d max %d\n", 460 + inode, file, nr_pages, max); 461 while (!list_empty(page_list)) { 462 rc = start_read(inode, page_list, max); 463 if (rc < 0) ··· 474 return rc; 475 } 476 477 + struct ceph_writeback_ctl 478 + { 479 + loff_t i_size; 480 + u64 truncate_size; 481 + u32 truncate_seq; 482 + bool size_stable; 483 + bool head_snapc; 484 + }; 485 + 486 /* 487 * Get ref for the oldest snapc for an inode with dirty data... that is, the 488 * only snap context we are allowed to write back. 489 */ 490 + static struct ceph_snap_context * 491 + get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, 492 + struct ceph_snap_context *page_snapc) 493 { 494 struct ceph_inode_info *ci = ceph_inode(inode); 495 struct ceph_snap_context *snapc = NULL; ··· 491 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 492 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 493 capsnap->context, capsnap->dirty_pages); 494 + if (!capsnap->dirty_pages) 495 + continue; 496 + 497 + /* get i_size, truncate_{seq,size} for page_snapc? */ 498 + if (snapc && capsnap->context != page_snapc) 499 + continue; 500 + 501 + if (ctl) { 502 + if (capsnap->writing) { 503 + ctl->i_size = i_size_read(inode); 504 + ctl->size_stable = false; 505 + } else { 506 + ctl->i_size = capsnap->size; 507 + ctl->size_stable = true; 508 + } 509 + ctl->truncate_size = capsnap->truncate_size; 510 + ctl->truncate_seq = capsnap->truncate_seq; 511 + ctl->head_snapc = false; 512 } 513 + 514 + if (snapc) 515 + break; 516 + 517 + snapc = ceph_get_snap_context(capsnap->context); 518 + if (!page_snapc || 519 + page_snapc == snapc || 520 + page_snapc->seq > snapc->seq) 521 + break; 522 } 523 if (!snapc && ci->i_wrbuffer_ref_head) { 524 snapc = ceph_get_snap_context(ci->i_head_snapc); 525 dout(" head snapc %p has %d dirty pages\n", 526 snapc, ci->i_wrbuffer_ref_head); 527 + if (ctl) { 528 + ctl->i_size = i_size_read(inode); 529 + ctl->truncate_size = ci->i_truncate_size; 530 + ctl->truncate_seq = ci->i_truncate_seq; 531 + ctl->size_stable = false; 532 + ctl->head_snapc = true; 533 + } 534 } 535 spin_unlock(&ci->i_ceph_lock); 536 return snapc; 537 + } 538 + 539 + static u64 get_writepages_data_length(struct inode *inode, 540 + struct page *page, u64 start) 541 + { 542 + struct ceph_inode_info *ci = ceph_inode(inode); 543 + struct ceph_snap_context *snapc = page_snap_context(page); 544 + struct ceph_cap_snap *capsnap = NULL; 545 + u64 end = i_size_read(inode); 546 + 547 + if (snapc != ci->i_head_snapc) { 548 + bool found = false; 549 + spin_lock(&ci->i_ceph_lock); 550 + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 551 + if (capsnap->context == snapc) { 552 + if (!capsnap->writing) 553 + end = capsnap->size; 554 + found = true; 555 + break; 556 + } 557 + } 558 + spin_unlock(&ci->i_ceph_lock); 559 + WARN_ON(!found); 560 + } 561 + if (end > page_offset(page) + PAGE_SIZE) 562 + end = page_offset(page) + PAGE_SIZE; 563 + return end > start ? 
end - start : 0; 564 } 565 566 /* ··· 526 struct inode *inode; 527 struct ceph_inode_info *ci; 528 struct ceph_fs_client *fsc; 529 struct ceph_snap_context *snapc, *oldest; 530 loff_t page_off = page_offset(page); 531 long writeback_stat; 532 int err, len = PAGE_SIZE; 533 + struct ceph_writeback_ctl ceph_wbc; 534 535 dout("writepage %p idx %lu\n", page, page->index); 536 537 inode = page->mapping->host; 538 ci = ceph_inode(inode); 539 fsc = ceph_inode_to_client(inode); 540 541 /* verify this is a writeable snap context */ 542 snapc = page_snap_context(page); 543 + if (!snapc) { 544 dout("writepage %p page %p not dirty?\n", inode, page); 545 return 0; 546 } 547 + oldest = get_oldest_context(inode, &ceph_wbc, snapc); 548 if (snapc->seq > oldest->seq) { 549 dout("writepage %p page %p snapc %p not writeable - noop\n", 550 inode, page, snapc); ··· 561 } 562 ceph_put_snap_context(oldest); 563 564 /* is this a partial page at end of file? */ 565 + if (page_off >= ceph_wbc.i_size) { 566 + dout("%p page eof %llu\n", page, ceph_wbc.i_size); 567 + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 568 return 0; 569 } 570 571 + if (ceph_wbc.i_size < page_off + len) 572 + len = ceph_wbc.i_size - page_off; 573 574 + dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", 575 + inode, page, page->index, page_off, len, snapc, snapc->seq); 576 577 writeback_stat = atomic_long_inc_return(&fsc->writeback_count); 578 if (writeback_stat > ··· 582 set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); 583 584 set_page_writeback(page); 585 + err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), 586 + &ci->i_layout, snapc, page_off, len, 587 + ceph_wbc.truncate_seq, 588 + ceph_wbc.truncate_size, 589 &inode->i_mtime, &page, 1); 590 if (err < 0) { 591 struct writeback_control tmp_wbc; ··· 746 struct ceph_inode_info *ci = ceph_inode(inode); 747 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 748 struct ceph_vino vino = ceph_vino(inode); 749 + pgoff_t index, start_index, end = -1; 750 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; 751 struct pagevec pvec; 752 int rc = 0; 753 unsigned int wsize = i_blocksize(inode); 754 struct ceph_osd_request *req = NULL; 755 + struct ceph_writeback_ctl ceph_wbc; 756 + bool should_loop, range_whole = false; 757 + bool stop, done = false; 758 759 + dout("writepages_start %p (mode=%s)\n", inode, 760 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 761 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 762 ··· 783 mapping_set_error(mapping, -EIO); 784 return -EIO; /* we're in a forced umount, don't write! */ 785 } 786 + if (fsc->mount_options->wsize < wsize) 787 wsize = fsc->mount_options->wsize; 788 789 pagevec_init(&pvec, 0); 790 791 + start_index = wbc->range_cyclic ? mapping->writeback_index : 0; 792 + index = start_index; 793 794 retry: 795 /* find oldest snap context with dirty data */ 796 + snapc = get_oldest_context(inode, &ceph_wbc, NULL); 797 if (!snapc) { 798 /* hmm, why does writepages get called when there 799 is no dirty data? */ ··· 821 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 822 snapc, snapc->seq, snapc->num_snaps); 823 824 + should_loop = false; 825 + if (ceph_wbc.head_snapc && snapc != last_snapc) { 826 + /* where to start/end? 
*/ 827 + if (wbc->range_cyclic) { 828 + index = start_index; 829 + end = -1; 830 + if (index > 0) 831 + should_loop = true; 832 + dout(" cyclic, start at %lu\n", index); 833 + } else { 834 + index = wbc->range_start >> PAGE_SHIFT; 835 + end = wbc->range_end >> PAGE_SHIFT; 836 + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 837 + range_whole = true; 838 + dout(" not cyclic, %lu to %lu\n", index, end); 839 + } 840 + } else if (!ceph_wbc.head_snapc) { 841 + /* Do not respect wbc->range_{start,end}. Dirty pages 842 + * in that range can be associated with newer snapc. 843 + * They are not writeable until we write all dirty pages 844 + * associated with 'snapc' get written */ 845 + if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) 846 + should_loop = true; 847 + dout(" non-head snapc, range whole\n"); 848 } 849 + 850 + ceph_put_snap_context(last_snapc); 851 last_snapc = snapc; 852 853 + stop = false; 854 + while (!stop && index <= end) { 855 int num_ops = 0, op_idx; 856 + unsigned i, pvec_pages, max_pages, locked_pages = 0; 857 struct page **pages = NULL, **data_pages; 858 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 859 struct page *page; 860 + pgoff_t strip_unit_end = 0; 861 u64 offset = 0, len = 0; 862 863 + max_pages = wsize >> PAGE_SHIFT; 864 865 get_more_pages: 866 + pvec_pages = min_t(unsigned, PAGEVEC_SIZE, 867 + max_pages - locked_pages); 868 + if (end - index < (u64)(pvec_pages - 1)) 869 + pvec_pages = (unsigned)(end - index) + 1; 870 + 871 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, 872 PAGECACHE_TAG_DIRTY, 873 + pvec_pages); 874 dout("pagevec_lookup_tag got %d\n", pvec_pages); 875 if (!pvec_pages && !locked_pages) 876 break; ··· 871 unlikely(page->mapping != mapping)) { 872 dout("!dirty or !mapping %p\n", page); 873 unlock_page(page); 874 + continue; 875 } 876 + if (page->index > end) { 877 dout("end of range %p\n", page); 878 + /* can't be range_cyclic (1st pass) because 879 + * end == -1 in that case. 
*/ 880 + stop = true; 881 + if (ceph_wbc.head_snapc) 882 + done = true; 883 unlock_page(page); 884 break; 885 } ··· 884 unlock_page(page); 885 break; 886 } 887 + if (page_offset(page) >= ceph_wbc.i_size) { 888 + dout("%p page eof %llu\n", 889 + page, ceph_wbc.i_size); 890 + /* not done if range_cyclic */ 891 + stop = true; 892 unlock_page(page); 893 break; 894 } 895 if (PageWriteback(page)) { 896 + if (wbc->sync_mode == WB_SYNC_NONE) { 897 + dout("%p under writeback\n", page); 898 + unlock_page(page); 899 + continue; 900 + } 901 + dout("waiting on writeback %p\n", page); 902 + wait_on_page_writeback(page); 903 } 904 905 /* only if matching snap context */ 906 pgsnapc = page_snap_context(page); 907 + if (pgsnapc != snapc) { 908 + dout("page snapc %p %lld != oldest %p %lld\n", 909 pgsnapc, pgsnapc->seq, snapc, snapc->seq); 910 unlock_page(page); 911 + continue; 912 } 913 914 if (!clear_page_dirty_for_io(page)) { 915 dout("%p !clear_page_dirty_for_io\n", page); 916 unlock_page(page); 917 + continue; 918 } 919 920 /* ··· 942 break; 943 } 944 945 + num_ops = 1; 946 strip_unit_end = page->index + 947 ((len - 1) >> PAGE_SHIFT); 948 ··· 972 } 973 974 /* note position of first page in pvec */ 975 dout("%p will write page %p idx %lu\n", 976 inode, page, page->index); 977 ··· 984 BLK_RW_ASYNC); 985 } 986 987 + 988 + pages[locked_pages++] = page; 989 + pvec.pages[i] = NULL; 990 + 991 len += PAGE_SIZE; 992 } 993 ··· 993 if (!locked_pages) 994 goto release_pvec_pages; 995 if (i) { 996 + unsigned j, n = 0; 997 + /* shift unused page to beginning of pvec */ 998 + for (j = 0; j < pvec_pages; j++) { 999 + if (!pvec.pages[j]) 1000 + continue; 1001 + if (n < j) 1002 + pvec.pages[n] = pvec.pages[j]; 1003 + n++; 1004 + } 1005 + pvec.nr = n; 1006 1007 if (pvec_pages && i == pvec_pages && 1008 locked_pages < max_pages) { 1009 dout("reached end pvec, trying for more\n"); 1010 + pagevec_release(&pvec); 1011 goto get_more_pages; 1012 } 1013 } 1014 1015 new_request: ··· 1019 req = ceph_osdc_new_request(&fsc->client->osdc, 1020 &ci->i_layout, vino, 1021 offset, &len, 0, num_ops, 1022 + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, 1023 + snapc, ceph_wbc.truncate_seq, 1024 + ceph_wbc.truncate_size, false); 1025 if (IS_ERR(req)) { 1026 req = ceph_osdc_new_request(&fsc->client->osdc, 1027 &ci->i_layout, vino, ··· 1031 CEPH_OSD_SLAB_OPS), 1032 CEPH_OSD_OP_WRITE, 1033 CEPH_OSD_FLAG_WRITE, 1034 + snapc, ceph_wbc.truncate_seq, 1035 + ceph_wbc.truncate_size, true); 1036 BUG_ON(IS_ERR(req)); 1037 } 1038 BUG_ON(len < page_offset(pages[locked_pages - 1]) + ··· 1048 for (i = 0; i < locked_pages; i++) { 1049 u64 cur_offset = page_offset(pages[i]); 1050 if (offset + len != cur_offset) { 1051 + if (op_idx + 1 == req->r_num_ops) 1052 break; 1053 osd_req_op_extent_dup_last(req, op_idx, 1054 cur_offset - offset); ··· 1069 len += PAGE_SIZE; 1070 } 1071 1072 + if (ceph_wbc.size_stable) { 1073 + len = min(len, ceph_wbc.i_size - offset); 1074 } else if (i == locked_pages) { 1075 /* writepages_finish() clears writeback pages 1076 * according to the data length, so make sure 1077 * data length covers all locked pages */ 1078 u64 min_len = len + 1 - PAGE_SIZE; 1079 + len = get_writepages_data_length(inode, pages[i - 1], 1080 + offset); 1081 len = max(len, min_len); 1082 } 1083 dout("writepages got pages at %llu~%llu\n", offset, len); ··· 1085 0, !!pool, false); 1086 osd_req_op_extent_update(req, op_idx, len); 1087 1088 BUG_ON(op_idx + 1 != req->r_num_ops); 1089 1090 pool = NULL; 1091 if (i < locked_pages) { 1092 BUG_ON(num_ops <= req->r_num_ops); 
1093 num_ops -= req->r_num_ops; 1094 locked_pages -= i; 1095 1096 /* allocate new pages array for next request */ ··· 1127 if (pages) 1128 goto new_request; 1129 1130 + /* 1131 + * We stop writing back only if we are not doing 1132 + * integrity sync. In case of integrity sync we have to 1133 + * keep going until we have written all the pages 1134 + * we tagged for writeback prior to entering this loop. 1135 + */ 1136 + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) 1137 + done = stop = true; 1138 1139 release_pvec_pages: 1140 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, 1141 pvec.nr ? pvec.pages[0] : NULL); 1142 pagevec_release(&pvec); 1143 } 1144 1145 if (should_loop && !done) { 1146 /* more to do; loop back to beginning of file */ 1147 dout("writepages looping back to beginning of file\n"); 1148 + end = start_index - 1; /* OK even when start_index == 0 */ 1149 + 1150 + /* to write dirty pages associated with next snapc, 1151 + * we need to wait until current writes complete */ 1152 + if (wbc->sync_mode != WB_SYNC_NONE && 1153 + start_index == 0 && /* all dirty pages were checked */ 1154 + !ceph_wbc.head_snapc) { 1155 + struct page *page; 1156 + unsigned i, nr; 1157 + index = 0; 1158 + while ((index <= end) && 1159 + (nr = pagevec_lookup_tag(&pvec, mapping, &index, 1160 + PAGECACHE_TAG_WRITEBACK, 1161 + PAGEVEC_SIZE))) { 1162 + for (i = 0; i < nr; i++) { 1163 + page = pvec.pages[i]; 1164 + if (page_snap_context(page) != snapc) 1165 + continue; 1166 + wait_on_page_writeback(page); 1167 + } 1168 + pagevec_release(&pvec); 1169 + cond_resched(); 1170 + } 1171 + } 1172 + 1173 + start_index = 0; 1174 index = 0; 1175 goto retry; 1176 } ··· 1152 1153 out: 1154 ceph_osdc_put_request(req); 1155 + ceph_put_snap_context(last_snapc); 1156 + dout("writepages dend - startone, rc = %d\n", rc); 1157 return rc; 1158 } 1159 ··· 1165 static int context_is_writeable_or_written(struct inode *inode, 1166 struct ceph_snap_context *snapc) 1167 { 1168 + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); 1169 int ret = !oldest || snapc->seq <= oldest->seq; 1170 1171 ceph_put_snap_context(oldest); ··· 1211 * this page is already dirty in another (older) snap 1212 * context! is it writeable now? 1213 */ 1214 + oldest = get_oldest_context(inode, NULL, NULL); 1215 if (snapc->seq > oldest->seq) { 1216 ceph_put_snap_context(oldest); 1217 dout(" page %p snapc %p not current or oldest\n",
+1 -1
fs/ceph/cache.c
··· 209 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 210 211 /* No caching for filesystem */ 212 - if (fsc->fscache == NULL) 213 return; 214 215 /* Only cache for regular files that are read only */
··· 209 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 210 211 /* No caching for filesystem */ 212 + if (!fsc->fscache) 213 return; 214 215 /* Only cache for regular files that are read only */
+23 -17
fs/ceph/caps.c
··· 490 } 491 492 /* 493 - * if we are newly issued FILE_SHARED, mark dir not complete; we 494 - * don't know what happened to this directory while we didn't 495 - * have the cap. 496 */ 497 - if ((issued & CEPH_CAP_FILE_SHARED) && 498 - (had & CEPH_CAP_FILE_SHARED) == 0) { 499 - ci->i_shared_gen++; 500 if (S_ISDIR(ci->vfs_inode.i_mode)) { 501 dout(" marking %p NOT complete\n", &ci->vfs_inode); 502 __ceph_dir_clear_complete(ci); ··· 612 } 613 614 if (flags & CEPH_CAP_FLAG_AUTH) { 615 - if (ci->i_auth_cap == NULL || 616 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { 617 ci->i_auth_cap = cap; 618 cap->mds_wanted = wanted; ··· 729 struct ceph_mds_session *s = cap->session; 730 731 spin_lock(&s->s_cap_lock); 732 - if (s->s_cap_iterator == NULL) { 733 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, 734 s->s_mds); 735 list_move_tail(&cap->session_caps, &s->s_caps); ··· 1249 arg.mode = inode->i_mode; 1250 1251 arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1252 - arg.flags = 0; 1253 if (sync) 1254 arg.flags |= CEPH_CLIENT_CAPS_SYNC; 1255 ··· 1458 goto retry; 1459 } 1460 1461 __ceph_flush_snaps(ci, session); 1462 out: 1463 spin_unlock(&ci->i_ceph_lock); 1464 1465 if (psession) { 1466 *psession = session; 1467 - } else { 1468 mutex_unlock(&session->s_mutex); 1469 ceph_put_mds_session(session); 1470 } ··· 1911 (ci->i_ceph_flags & 1912 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { 1913 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 1914 - spin_lock(&mdsc->cap_dirty_lock); 1915 - oldest_flush_tid = __get_oldest_flush_tid(mdsc); 1916 - spin_unlock(&mdsc->cap_dirty_lock); 1917 - __kick_flushing_caps(mdsc, session, ci, 1918 - oldest_flush_tid); 1919 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; 1920 } 1921 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) ··· 2116 2117 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2118 2119 - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 2120 if (ret < 0) 2121 goto out; 2122 ··· 3428 tcap = __get_cap_for_mds(ci, target); 3429 if (tcap) { 3430 /* already have caps from the target */ 3431 - if (tcap->cap_id != t_cap_id || 3432 ceph_seq_cmp(tcap->seq, t_seq) < 0) { 3433 dout(" updating import cap %p mds%d\n", tcap, target); 3434 tcap->cap_id = t_cap_id;
··· 490 } 491 492 /* 493 + * If FILE_SHARED is newly issued, mark dir not complete. We don't 494 + * know what happened to this directory while we didn't have the cap. 495 + * If FILE_SHARED is being revoked, also mark dir not complete. It 496 + * stops on-going cached readdir. 497 */ 498 + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { 499 + if (issued & CEPH_CAP_FILE_SHARED) 500 + ci->i_shared_gen++; 501 if (S_ISDIR(ci->vfs_inode.i_mode)) { 502 dout(" marking %p NOT complete\n", &ci->vfs_inode); 503 __ceph_dir_clear_complete(ci); ··· 611 } 612 613 if (flags & CEPH_CAP_FLAG_AUTH) { 614 + if (!ci->i_auth_cap || 615 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { 616 ci->i_auth_cap = cap; 617 cap->mds_wanted = wanted; ··· 728 struct ceph_mds_session *s = cap->session; 729 730 spin_lock(&s->s_cap_lock); 731 + if (!s->s_cap_iterator) { 732 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, 733 s->s_mds); 734 list_move_tail(&cap->session_caps, &s->s_caps); ··· 1248 arg.mode = inode->i_mode; 1249 1250 arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1251 + if (list_empty(&ci->i_cap_snaps)) 1252 + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; 1253 + else 1254 + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; 1255 if (sync) 1256 arg.flags |= CEPH_CLIENT_CAPS_SYNC; 1257 ··· 1454 goto retry; 1455 } 1456 1457 + // make sure flushsnap messages are sent in proper order. 1458 + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 1459 + __kick_flushing_caps(mdsc, session, ci, 0); 1460 + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; 1461 + } 1462 + 1463 __ceph_flush_snaps(ci, session); 1464 out: 1465 spin_unlock(&ci->i_ceph_lock); 1466 1467 if (psession) { 1468 *psession = session; 1469 + } else if (session) { 1470 mutex_unlock(&session->s_mutex); 1471 ceph_put_mds_session(session); 1472 } ··· 1901 (ci->i_ceph_flags & 1902 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { 1903 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 1904 + __kick_flushing_caps(mdsc, session, ci, 0); 1905 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; 1906 } 1907 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) ··· 2110 2111 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2112 2113 + ret = file_write_and_wait_range(file, start, end); 2114 if (ret < 0) 2115 goto out; 2116 ··· 3422 tcap = __get_cap_for_mds(ci, target); 3423 if (tcap) { 3424 /* already have caps from the target */ 3425 + if (tcap->cap_id == t_cap_id && 3426 ceph_seq_cmp(tcap->seq, t_seq) < 0) { 3427 dout(" updating import cap %p mds%d\n", tcap, target); 3428 tcap->cap_id = t_cap_id;
+1 -1
fs/ceph/debugfs.c
··· 24 struct ceph_fs_client *fsc = s->private; 25 struct ceph_mdsmap *mdsmap; 26 27 - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) 28 return 0; 29 mdsmap = fsc->mdsc->mdsmap; 30 seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
··· 24 struct ceph_fs_client *fsc = s->private; 25 struct ceph_mdsmap *mdsmap; 26 27 + if (!fsc->mdsc || !fsc->mdsc->mdsmap) 28 return 0; 29 mdsmap = fsc->mdsc->mdsmap; 30 seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+4 -2
fs/ceph/dir.c
··· 377 } 378 /* hints to request -> mds selection code */ 379 req->r_direct_mode = USE_AUTH_MDS; 380 - req->r_direct_hash = ceph_frag_value(frag); 381 - __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); 382 if (fi->last_name) { 383 req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); 384 if (!req->r_path2) {
··· 377 } 378 /* hints to request -> mds selection code */ 379 req->r_direct_mode = USE_AUTH_MDS; 380 + if (op == CEPH_MDS_OP_READDIR) { 381 + req->r_direct_hash = ceph_frag_value(frag); 382 + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); 383 + } 384 if (fi->last_name) { 385 req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); 386 if (!req->r_path2) {
+25 -25
fs/ceph/file.c
··· 175 dout("init_file %p %p 0%o (regular)\n", inode, file, 176 inode->i_mode); 177 cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 178 - if (cf == NULL) { 179 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 180 return -ENOMEM; 181 } ··· 562 ssize_t ret; 563 size_t len = iov_iter_count(to); 564 565 - dout("sync_read on file %p %llu~%u %s\n", file, off, 566 - (unsigned)len, 567 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 568 569 if (!len) ··· 787 goto out; 788 } 789 790 - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; 791 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); 792 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); 793 ··· 799 } 800 801 req->r_ops[0] = orig_req->r_ops[0]; 802 - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 803 804 req->r_mtime = aio_req->mtime; 805 req->r_data_offset = req->r_ops[0].extent.offset; ··· 845 if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) 846 return -EROFS; 847 848 - dout("sync_direct_read_write (%s) on file %p %lld~%u\n", 849 - (write ? "write" : "read"), file, pos, (unsigned)count); 850 851 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 852 if (ret < 0) ··· 860 if (ret2 < 0) 861 dout("invalidate_inode_pages2_range returned %d\n", ret2); 862 863 - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; 864 } else { 865 flags = CEPH_OSD_FLAG_READ; 866 } ··· 873 vino = ceph_vino(inode); 874 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 875 vino, pos, &size, 0, 876 - /*include a 'startsync' command*/ 877 - write ? 2 : 1, 878 write ? CEPH_OSD_OP_WRITE : 879 CEPH_OSD_OP_READ, 880 flags, snapc, ··· 884 ret = PTR_ERR(req); 885 break; 886 } 887 888 len = size; 889 pages = dio_get_pages_alloc(iter, len, &start, &num_pages); ··· 925 truncate_inode_pages_range(inode->i_mapping, pos, 926 (pos+len) | (PAGE_SIZE - 1)); 927 928 - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 929 req->r_mtime = mtime; 930 } 931 ··· 1050 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 1051 return -EROFS; 1052 1053 - dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); 1054 1055 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 1056 if (ret < 0) ··· 1063 if (ret < 0) 1064 dout("invalidate_inode_pages2_range returned %d\n", ret); 1065 1066 - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; 1067 1068 while ((len = iov_iter_count(from)) > 0) { 1069 size_t left; ··· 1310 if (!prealloc_cf) 1311 return -ENOMEM; 1312 1313 inode_lock(inode); 1314 1315 /* We can write back this queue in page reclaim */ ··· 1342 goto out; 1343 } 1344 1345 - retry_snap: 1346 /* FIXME: not complete since it doesn't account for being at quota */ 1347 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { 1348 err = -ENOSPC; ··· 1390 &prealloc_cf); 1391 else 1392 written = ceph_sync_write(iocb, &data, pos, snapc); 1393 - if (written == -EOLDSNAPC) { 1394 - dout("aio_write %p %llx.%llx %llu~%u" 1395 - "got EOLDSNAPC, retrying\n", 1396 - inode, ceph_vinop(inode), 1397 - pos, (unsigned)count); 1398 - inode_lock(inode); 1399 - goto retry_snap; 1400 - } 1401 if (written > 0) 1402 iov_iter_advance(from, written); 1403 ceph_put_snap_context(snapc); ··· 1423 ceph_cap_string(got)); 1424 ceph_put_cap_refs(ci, got); 1425 1426 if (written >= 0) { 1427 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) 1428 iocb->ki_flags |= IOCB_DSYNC; 1429 - 1430 written = generic_write_sync(iocb, written); 1431 } 1432 ··· 1481 offset += file->f_pos; 1482 break; 1483 case SEEK_DATA: 1484 - if (offset 
>= i_size) { 1485 ret = -ENXIO; 1486 goto out; 1487 } 1488 break; 1489 case SEEK_HOLE: 1490 - if (offset >= i_size) { 1491 ret = -ENXIO; 1492 goto out; 1493 }
··· 175 dout("init_file %p %p 0%o (regular)\n", inode, file, 176 inode->i_mode); 177 cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 178 + if (!cf) { 179 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 180 return -ENOMEM; 181 } ··· 562 ssize_t ret; 563 size_t len = iov_iter_count(to); 564 565 + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, 566 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 567 568 if (!len) ··· 788 goto out; 789 } 790 791 + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 792 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); 793 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); 794 ··· 800 } 801 802 req->r_ops[0] = orig_req->r_ops[0]; 803 804 req->r_mtime = aio_req->mtime; 805 req->r_data_offset = req->r_ops[0].extent.offset; ··· 847 if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) 848 return -EROFS; 849 850 + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 851 + (write ? "write" : "read"), file, pos, (unsigned)count, 852 + snapc, snapc->seq); 853 854 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 855 if (ret < 0) ··· 861 if (ret2 < 0) 862 dout("invalidate_inode_pages2_range returned %d\n", ret2); 863 864 + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 865 } else { 866 flags = CEPH_OSD_FLAG_READ; 867 } ··· 874 vino = ceph_vino(inode); 875 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 876 vino, pos, &size, 0, 877 + 1, 878 write ? CEPH_OSD_OP_WRITE : 879 CEPH_OSD_OP_READ, 880 flags, snapc, ··· 886 ret = PTR_ERR(req); 887 break; 888 } 889 + 890 + if (write) 891 + size = min_t(u64, size, fsc->mount_options->wsize); 892 + else 893 + size = min_t(u64, size, fsc->mount_options->rsize); 894 895 len = size; 896 pages = dio_get_pages_alloc(iter, len, &start, &num_pages); ··· 922 truncate_inode_pages_range(inode->i_mapping, pos, 923 (pos+len) | (PAGE_SIZE - 1)); 924 925 req->r_mtime = mtime; 926 } 927 ··· 1048 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 1049 return -EROFS; 1050 1051 + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", 1052 + file, pos, (unsigned)count, snapc, snapc->seq); 1053 1054 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 1055 if (ret < 0) ··· 1060 if (ret < 0) 1061 dout("invalidate_inode_pages2_range returned %d\n", ret); 1062 1063 + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 1064 1065 while ((len = iov_iter_count(from)) > 0) { 1066 size_t left; ··· 1307 if (!prealloc_cf) 1308 return -ENOMEM; 1309 1310 + retry_snap: 1311 inode_lock(inode); 1312 1313 /* We can write back this queue in page reclaim */ ··· 1338 goto out; 1339 } 1340 1341 /* FIXME: not complete since it doesn't account for being at quota */ 1342 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { 1343 err = -ENOSPC; ··· 1387 &prealloc_cf); 1388 else 1389 written = ceph_sync_write(iocb, &data, pos, snapc); 1390 if (written > 0) 1391 iov_iter_advance(from, written); 1392 ceph_put_snap_context(snapc); ··· 1428 ceph_cap_string(got)); 1429 ceph_put_cap_refs(ci, got); 1430 1431 + if (written == -EOLDSNAPC) { 1432 + dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", 1433 + inode, ceph_vinop(inode), pos, (unsigned)count); 1434 + goto retry_snap; 1435 + } 1436 + 1437 if (written >= 0) { 1438 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) 1439 iocb->ki_flags |= IOCB_DSYNC; 1440 written = generic_write_sync(iocb, written); 1441 } 1442 ··· 1481 offset += file->f_pos; 1482 break; 1483 case 
SEEK_DATA: 1484 + if (offset < 0 || offset >= i_size) { 1485 ret = -ENXIO; 1486 goto out; 1487 } 1488 break; 1489 case SEEK_HOLE: 1490 + if (offset < 0 || offset >= i_size) { 1491 ret = -ENXIO; 1492 goto out; 1493 }
+28 -25
fs/ceph/inode.c
··· 52 ino_t t = ceph_vino_to_ino(vino); 53 54 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); 55 - if (inode == NULL) 56 return ERR_PTR(-ENOMEM); 57 if (inode->i_state & I_NEW) { 58 dout("get_inode created new inode %p %llx.%llx ino %llx\n", ··· 133 } 134 135 frag = kmalloc(sizeof(*frag), GFP_NOFS); 136 - if (!frag) { 137 - pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " 138 - "frag %x\n", &ci->vfs_inode, 139 - ceph_vinop(&ci->vfs_inode), f); 140 return ERR_PTR(-ENOMEM); 141 - } 142 frag->frag = f; 143 frag->split_by = 0; 144 frag->mds = -1; ··· 1067 spin_unlock(&dentry->d_lock); 1068 if (old_lease_session) 1069 ceph_put_mds_session(old_lease_session); 1070 - return; 1071 } 1072 1073 /* ··· 1173 dn = d_alloc(parent, &dname); 1174 dout("d_alloc %p '%.*s' = %p\n", parent, 1175 dname.len, dname.name, dn); 1176 - if (dn == NULL) { 1177 dput(parent); 1178 err = -ENOMEM; 1179 goto done; ··· 1473 struct dentry *dn; 1474 struct inode *in; 1475 int err = 0, skipped = 0, ret, i; 1476 - struct inode *snapdir = NULL; 1477 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1478 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1479 u32 last_hash = 0; ··· 1505 } 1506 1507 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1508 - snapdir = ceph_get_snapdir(d_inode(parent)); 1509 - parent = d_find_alias(snapdir); 1510 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", 1511 rinfo->dir_nr, parent); 1512 } else { ··· 1512 rinfo->dir_nr, parent); 1513 if (rinfo->dir_dir) 1514 ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); 1515 - } 1516 1517 - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && 1518 - !(rinfo->hash_order && last_hash)) { 1519 - /* note dir version at start of readdir so we can tell 1520 - * if any dentries get dropped */ 1521 - req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1522 - req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1523 - req->r_readdir_cache_idx = 0; 1524 } 1525 1526 cache_ctl.index = req->r_readdir_cache_idx; ··· 1562 dn = d_alloc(parent, &dname); 1563 dout("d_alloc %p '%.*s' = %p\n", parent, 1564 dname.len, dname.name, dn); 1565 - if (dn == NULL) { 1566 dout("d_alloc badness\n"); 1567 err = -ENOMEM; 1568 goto out; ··· 1646 req->r_readdir_cache_idx = cache_ctl.index; 1647 } 1648 ceph_readdir_cache_release(&cache_ctl); 1649 - if (snapdir) { 1650 - iput(snapdir); 1651 - dput(parent); 1652 - } 1653 dout("readdir_prepopulate done\n"); 1654 return err; 1655 } ··· 1833 * possibly truncate them.. so write AND block! 1834 */ 1835 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1836 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1837 inode); 1838 - spin_unlock(&ci->i_ceph_lock); 1839 filemap_write_and_wait_range(&inode->i_data, 0, 1840 inode->i_sb->s_maxbytes); 1841 goto retry;
··· 52 ino_t t = ceph_vino_to_ino(vino); 53 54 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); 55 + if (!inode) 56 return ERR_PTR(-ENOMEM); 57 if (inode->i_state & I_NEW) { 58 dout("get_inode created new inode %p %llx.%llx ino %llx\n", ··· 133 } 134 135 frag = kmalloc(sizeof(*frag), GFP_NOFS); 136 + if (!frag) 137 return ERR_PTR(-ENOMEM); 138 + 139 frag->frag = f; 140 frag->split_by = 0; 141 frag->mds = -1; ··· 1070 spin_unlock(&dentry->d_lock); 1071 if (old_lease_session) 1072 ceph_put_mds_session(old_lease_session); 1073 } 1074 1075 /* ··· 1177 dn = d_alloc(parent, &dname); 1178 dout("d_alloc %p '%.*s' = %p\n", parent, 1179 dname.len, dname.name, dn); 1180 + if (!dn) { 1181 dput(parent); 1182 err = -ENOMEM; 1183 goto done; ··· 1477 struct dentry *dn; 1478 struct inode *in; 1479 int err = 0, skipped = 0, ret, i; 1480 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1481 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1482 u32 last_hash = 0; ··· 1510 } 1511 1512 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1513 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", 1514 rinfo->dir_nr, parent); 1515 } else { ··· 1519 rinfo->dir_nr, parent); 1520 if (rinfo->dir_dir) 1521 ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); 1522 1523 + if (ceph_frag_is_leftmost(frag) && 1524 + req->r_readdir_offset == 2 && 1525 + !(rinfo->hash_order && last_hash)) { 1526 + /* note dir version at start of readdir so we can 1527 + * tell if any dentries get dropped */ 1528 + req->r_dir_release_cnt = 1529 + atomic64_read(&ci->i_release_count); 1530 + req->r_dir_ordered_cnt = 1531 + atomic64_read(&ci->i_ordered_count); 1532 + req->r_readdir_cache_idx = 0; 1533 + } 1534 } 1535 1536 cache_ctl.index = req->r_readdir_cache_idx; ··· 1566 dn = d_alloc(parent, &dname); 1567 dout("d_alloc %p '%.*s' = %p\n", parent, 1568 dname.len, dname.name, dn); 1569 + if (!dn) { 1570 dout("d_alloc badness\n"); 1571 err = -ENOMEM; 1572 goto out; ··· 1650 req->r_readdir_cache_idx = cache_ctl.index; 1651 } 1652 ceph_readdir_cache_release(&cache_ctl); 1653 dout("readdir_prepopulate done\n"); 1654 return err; 1655 } ··· 1841 * possibly truncate them.. so write AND block! 1842 */ 1843 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1844 + struct ceph_cap_snap *capsnap; 1845 + to = ci->i_truncate_size; 1846 + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 1847 + // MDS should have revoked Frw caps 1848 + WARN_ON_ONCE(capsnap->writing); 1849 + if (capsnap->dirty_pages && capsnap->size > to) 1850 + to = capsnap->size; 1851 + } 1852 + spin_unlock(&ci->i_ceph_lock); 1853 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1854 inode); 1855 + 1856 + truncate_pagecache(inode, to); 1857 + 1858 filemap_write_and_wait_range(&inode->i_data, 0, 1859 inode->i_sb->s_maxbytes); 1860 goto retry;
+22 -15
fs/ceph/mds_client.c
··· 408 { 409 struct ceph_mds_session *session; 410 411 - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) 412 return NULL; 413 session = mdsc->sessions[mds]; 414 dout("lookup_mds_session %p %d\n", session, ··· 483 484 dout("register_session realloc to %d\n", newmax); 485 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 486 - if (sa == NULL) 487 goto fail_realloc; 488 if (mdsc->sessions) { 489 memcpy(sa, mdsc->sessions, ··· 731 732 inode = NULL; 733 if (req->r_inode) { 734 - inode = req->r_inode; 735 - ihold(inode); 736 - } else if (req->r_dentry) { 737 /* ignore race with rename; old or new d_parent is okay */ 738 struct dentry *parent; 739 struct inode *dir; ··· 893 894 /* Calculate serialized length of metadata */ 895 metadata_bytes = 4; /* map length */ 896 - for (i = 0; metadata[i][0] != NULL; ++i) { 897 metadata_bytes += 8 + strlen(metadata[i][0]) + 898 strlen(metadata[i][1]); 899 metadata_key_count++; ··· 926 ceph_encode_32(&p, metadata_key_count); 927 928 /* Two length-prefixed strings for each entry in the map */ 929 - for (i = 0; metadata[i][0] != NULL; ++i) { 930 size_t const key_len = strlen(metadata[i][0]); 931 size_t const val_len = strlen(metadata[i][1]); 932 ··· 1129 1130 spin_lock(&session->s_cap_lock); 1131 p = p->next; 1132 - if (cap->ci == NULL) { 1133 dout("iterate_session_caps finishing cap %p removal\n", 1134 cap); 1135 BUG_ON(cap->session != session); ··· 1755 int len, pos; 1756 unsigned seq; 1757 1758 - if (dentry == NULL) 1759 return ERR_PTR(-EINVAL); 1760 1761 retry: ··· 1778 len--; /* no leading '/' */ 1779 1780 path = kmalloc(len+1, GFP_NOFS); 1781 - if (path == NULL) 1782 return ERR_PTR(-ENOMEM); 1783 pos = len; 1784 path[pos] = 0; /* trailing null */ ··· 2882 } 2883 2884 if (list_empty(&ci->i_cap_snaps)) { 2885 - snap_follows = 0; 2886 } else { 2887 struct ceph_cap_snap *capsnap = 2888 list_first_entry(&ci->i_cap_snaps, ··· 3140 newmap->m_epoch, oldmap->m_epoch); 3141 3142 for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { 3143 - if (mdsc->sessions[i] == NULL) 3144 continue; 3145 s = mdsc->sessions[i]; 3146 oldstate = ceph_mdsmap_get_state(oldmap, i); ··· 3287 mutex_lock(&session->s_mutex); 3288 session->s_seq++; 3289 3290 - if (inode == NULL) { 3291 dout("handle_lease no inode %llx\n", vino.ino); 3292 goto release; 3293 } ··· 3445 3446 for (i = 0; i < mdsc->max_sessions; i++) { 3447 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 3448 - if (s == NULL) 3449 continue; 3450 if (s->s_state == CEPH_MDS_SESSION_CLOSING) { 3451 dout("resending session close request for mds%d\n", ··· 3497 fsc->mdsc = mdsc; 3498 mutex_init(&mdsc->mutex); 3499 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3500 - if (mdsc->mdsmap == NULL) { 3501 kfree(mdsc); 3502 return -ENOMEM; 3503 }
··· 408 { 409 struct ceph_mds_session *session; 410 411 + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) 412 return NULL; 413 session = mdsc->sessions[mds]; 414 dout("lookup_mds_session %p %d\n", session, ··· 483 484 dout("register_session realloc to %d\n", newmax); 485 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 486 + if (!sa) 487 goto fail_realloc; 488 if (mdsc->sessions) { 489 memcpy(sa, mdsc->sessions, ··· 731 732 inode = NULL; 733 if (req->r_inode) { 734 + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { 735 + inode = req->r_inode; 736 + ihold(inode); 737 + } else { 738 + /* req->r_dentry is non-null for LSSNAP request. 739 + * fall-thru */ 740 + WARN_ON_ONCE(!req->r_dentry); 741 + } 742 + } 743 + if (!inode && req->r_dentry) { 744 /* ignore race with rename; old or new d_parent is okay */ 745 struct dentry *parent; 746 struct inode *dir; ··· 886 887 /* Calculate serialized length of metadata */ 888 metadata_bytes = 4; /* map length */ 889 + for (i = 0; metadata[i][0]; ++i) { 890 metadata_bytes += 8 + strlen(metadata[i][0]) + 891 strlen(metadata[i][1]); 892 metadata_key_count++; ··· 919 ceph_encode_32(&p, metadata_key_count); 920 921 /* Two length-prefixed strings for each entry in the map */ 922 + for (i = 0; metadata[i][0]; ++i) { 923 size_t const key_len = strlen(metadata[i][0]); 924 size_t const val_len = strlen(metadata[i][1]); 925 ··· 1122 1123 spin_lock(&session->s_cap_lock); 1124 p = p->next; 1125 + if (!cap->ci) { 1126 dout("iterate_session_caps finishing cap %p removal\n", 1127 cap); 1128 BUG_ON(cap->session != session); ··· 1748 int len, pos; 1749 unsigned seq; 1750 1751 + if (!dentry) 1752 return ERR_PTR(-EINVAL); 1753 1754 retry: ··· 1771 len--; /* no leading '/' */ 1772 1773 path = kmalloc(len+1, GFP_NOFS); 1774 + if (!path) 1775 return ERR_PTR(-ENOMEM); 1776 pos = len; 1777 path[pos] = 0; /* trailing null */ ··· 2875 } 2876 2877 if (list_empty(&ci->i_cap_snaps)) { 2878 + snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; 2879 } else { 2880 struct ceph_cap_snap *capsnap = 2881 list_first_entry(&ci->i_cap_snaps, ··· 3133 newmap->m_epoch, oldmap->m_epoch); 3134 3135 for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { 3136 + if (!mdsc->sessions[i]) 3137 continue; 3138 s = mdsc->sessions[i]; 3139 oldstate = ceph_mdsmap_get_state(oldmap, i); ··· 3280 mutex_lock(&session->s_mutex); 3281 session->s_seq++; 3282 3283 + if (!inode) { 3284 dout("handle_lease no inode %llx\n", vino.ino); 3285 goto release; 3286 } ··· 3438 3439 for (i = 0; i < mdsc->max_sessions; i++) { 3440 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 3441 + if (!s) 3442 continue; 3443 if (s->s_state == CEPH_MDS_SESSION_CLOSING) { 3444 dout("resending session close request for mds%d\n", ··· 3490 fsc->mdsc = mdsc; 3491 mutex_init(&mdsc->mutex); 3492 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3493 + if (!mdsc->mdsmap) { 3494 kfree(mdsc); 3495 return -ENOMEM; 3496 }
+3 -3
fs/ceph/mdsmap.c
··· 112 u16 mdsmap_ev; 113 114 m = kzalloc(sizeof(*m), GFP_NOFS); 115 - if (m == NULL) 116 return ERR_PTR(-ENOMEM); 117 118 ceph_decode_need(p, end, 1 + 1, bad); ··· 138 m->m_num_mds = m->m_max_mds; 139 140 m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); 141 - if (m->m_info == NULL) 142 goto nomem; 143 144 /* pick out active nodes from mds_info (state > 0) */ ··· 232 if (num_export_targets) { 233 info->export_targets = kcalloc(num_export_targets, 234 sizeof(u32), GFP_NOFS); 235 - if (info->export_targets == NULL) 236 goto nomem; 237 for (j = 0; j < num_export_targets; j++) 238 info->export_targets[j] =
··· 112 u16 mdsmap_ev; 113 114 m = kzalloc(sizeof(*m), GFP_NOFS); 115 + if (!m) 116 return ERR_PTR(-ENOMEM); 117 118 ceph_decode_need(p, end, 1 + 1, bad); ··· 138 m->m_num_mds = m->m_max_mds; 139 140 m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); 141 + if (!m->m_info) 142 goto nomem; 143 144 /* pick out active nodes from mds_info (state > 0) */ ··· 232 if (num_export_targets) { 233 info->export_targets = kcalloc(num_export_targets, 234 sizeof(u32), GFP_NOFS); 235 + if (!info->export_targets) 236 goto nomem; 237 for (j = 0; j < num_export_targets; j++) 238 info->export_targets[j] =
+16 -21
fs/ceph/snap.c
··· 299 /* 300 * build the snap context for a given realm. 301 */ 302 - static int build_snap_context(struct ceph_snap_realm *realm) 303 { 304 struct ceph_snap_realm *parent = realm->parent; 305 struct ceph_snap_context *snapc; ··· 314 */ 315 if (parent) { 316 if (!parent->cached_context) { 317 - err = build_snap_context(parent); 318 if (err) 319 goto fail; 320 } ··· 333 " (unchanged)\n", 334 realm->ino, realm, realm->cached_context, 335 realm->cached_context->seq, 336 - (unsigned int) realm->cached_context->num_snaps); 337 return 0; 338 } 339 ··· 374 realm->ino, realm, snapc, snapc->seq, 375 (unsigned int) snapc->num_snaps); 376 377 - ceph_put_snap_context(realm->cached_context); 378 realm->cached_context = snapc; 379 return 0; 380 ··· 399 /* 400 * rebuild snap context for the given realm and all of its children. 401 */ 402 - static void rebuild_snap_realms(struct ceph_snap_realm *realm) 403 { 404 struct ceph_snap_realm *child; 405 406 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); 407 - build_snap_context(realm); 408 409 list_for_each_entry(child, &realm->children, child_item) 410 - rebuild_snap_realms(child); 411 } 412 413 ··· 630 { 631 struct ceph_inode_info *ci; 632 struct inode *lastinode = NULL; 633 - struct ceph_snap_realm *child; 634 635 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); 636 637 spin_lock(&realm->inodes_with_caps_lock); 638 - list_for_each_entry(ci, &realm->inodes_with_caps, 639 - i_snap_realm_item) { 640 struct inode *inode = igrab(&ci->vfs_inode); 641 if (!inode) 642 continue; ··· 647 spin_unlock(&realm->inodes_with_caps_lock); 648 iput(lastinode); 649 650 - list_for_each_entry(child, &realm->children, child_item) { 651 - dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", 652 - realm, realm->ino, child, child->ino); 653 - list_del_init(&child->dirty_item); 654 - list_add(&child->dirty_item, &realm->dirty_item); 655 - } 656 - 657 - list_del_init(&realm->dirty_item); 658 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 659 } 660 ··· 717 if (err < 0) 718 goto fail; 719 720 - /* queue realm for cap_snap creation */ 721 - list_add(&realm->dirty_item, &dirty_realms); 722 if (realm->seq > mdsc->last_snap_seq) 723 mdsc->last_snap_seq = realm->seq; 724 ··· 735 736 /* invalidate when we reach the _end_ (root) of the trace */ 737 if (invalidate && p >= e) 738 - rebuild_snap_realms(realm); 739 740 if (!first_realm) 741 first_realm = realm; ··· 752 while (!list_empty(&dirty_realms)) { 753 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, 754 dirty_item); 755 queue_realm_cap_snaps(realm); 756 } 757
··· 299 /* 300 * build the snap context for a given realm. 301 */ 302 + static int build_snap_context(struct ceph_snap_realm *realm, 303 + struct list_head* dirty_realms) 304 { 305 struct ceph_snap_realm *parent = realm->parent; 306 struct ceph_snap_context *snapc; ··· 313 */ 314 if (parent) { 315 if (!parent->cached_context) { 316 + err = build_snap_context(parent, dirty_realms); 317 if (err) 318 goto fail; 319 } ··· 332 " (unchanged)\n", 333 realm->ino, realm, realm->cached_context, 334 realm->cached_context->seq, 335 + (unsigned int)realm->cached_context->num_snaps); 336 return 0; 337 } 338 ··· 373 realm->ino, realm, snapc, snapc->seq, 374 (unsigned int) snapc->num_snaps); 375 376 + if (realm->cached_context) { 377 + ceph_put_snap_context(realm->cached_context); 378 + /* queue realm for cap_snap creation */ 379 + list_add_tail(&realm->dirty_item, dirty_realms); 380 + } 381 realm->cached_context = snapc; 382 return 0; 383 ··· 394 /* 395 * rebuild snap context for the given realm and all of its children. 396 */ 397 + static void rebuild_snap_realms(struct ceph_snap_realm *realm, 398 + struct list_head *dirty_realms) 399 { 400 struct ceph_snap_realm *child; 401 402 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); 403 + build_snap_context(realm, dirty_realms); 404 405 list_for_each_entry(child, &realm->children, child_item) 406 + rebuild_snap_realms(child, dirty_realms); 407 } 408 409 ··· 624 { 625 struct ceph_inode_info *ci; 626 struct inode *lastinode = NULL; 627 628 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); 629 630 spin_lock(&realm->inodes_with_caps_lock); 631 + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { 632 struct inode *inode = igrab(&ci->vfs_inode); 633 if (!inode) 634 continue; ··· 643 spin_unlock(&realm->inodes_with_caps_lock); 644 iput(lastinode); 645 646 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 647 } 648 ··· 721 if (err < 0) 722 goto fail; 723 724 if (realm->seq > mdsc->last_snap_seq) 725 mdsc->last_snap_seq = realm->seq; 726 ··· 741 742 /* invalidate when we reach the _end_ (root) of the trace */ 743 if (invalidate && p >= e) 744 + rebuild_snap_realms(realm, &dirty_realms); 745 746 if (!first_realm) 747 first_realm = realm; ··· 758 while (!list_empty(&dirty_realms)) { 759 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, 760 dirty_item); 761 + list_del_init(&realm->dirty_item); 762 queue_realm_cap_snaps(realm); 763 } 764
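The fs/ceph/snap.c hunks thread a dirty_realms list through build_snap_context(), so a realm is queued for cap-snap creation only when its cached snap context is actually replaced, and the caller now drops each realm off the list before queuing its cap snaps. A rough sketch of the resulting flow, using the helpers from the diff above and the usual linux/list.h primitives (the wrapper function itself is illustrative, not part of the series):

    /* Illustrative sketch only: rebuild contexts for a realm subtree,
     * then create cap snaps just for the realms whose context changed. */
    static void rebuild_and_queue_sketch(struct ceph_snap_realm *root)
    {
            LIST_HEAD(dirty_realms);
            struct ceph_snap_realm *realm;

            /* realms whose cached context was replaced end up on dirty_realms */
            rebuild_snap_realms(root, &dirty_realms);

            while (!list_empty(&dirty_realms)) {
                    realm = list_first_entry(&dirty_realms,
                                             struct ceph_snap_realm, dirty_item);
                    list_del_init(&realm->dirty_item);
                    queue_realm_cap_snaps(realm);
            }
    }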
+42 -34
fs/ceph/super.c
··· 49 struct ceph_statfs st; 50 u64 fsid; 51 int err; 52 53 dout("statfs\n"); 54 - err = ceph_monc_do_statfs(&fsc->client->monc, &st); 55 if (err < 0) 56 return err; 57 ··· 120 Opt_rasize, 121 Opt_caps_wanted_delay_min, 122 Opt_caps_wanted_delay_max, 123 - Opt_cap_release_safety, 124 Opt_readdir_max_entries, 125 Opt_readdir_max_bytes, 126 Opt_congestion_kb, ··· 158 {Opt_rasize, "rasize=%d"}, 159 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 160 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 161 - {Opt_cap_release_safety, "cap_release_safety=%d"}, 162 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 163 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 164 {Opt_congestion_kb, "write_congestion_kb=%d"}, ··· 240 break; 241 /* misc */ 242 case Opt_wsize: 243 - fsopt->wsize = intval; 244 break; 245 case Opt_rsize: 246 - fsopt->rsize = intval; 247 break; 248 case Opt_rasize: 249 - fsopt->rasize = intval; 250 break; 251 case Opt_caps_wanted_delay_min: 252 fsopt->caps_wanted_delay_min = intval; 253 break; 254 case Opt_caps_wanted_delay_max: 255 fsopt->caps_wanted_delay_max = intval; 256 break; 257 case Opt_readdir_max_entries: 258 fsopt->max_readdir = intval; 259 break; 260 case Opt_readdir_max_bytes: 261 fsopt->max_readdir_bytes = intval; 262 break; 263 case Opt_congestion_kb: 264 fsopt->congestion_kb = intval; 265 break; 266 case Opt_dirstat: ··· 413 fsopt->sb_flags = flags; 414 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 415 416 - fsopt->rsize = CEPH_RSIZE_DEFAULT; 417 fsopt->rasize = CEPH_RASIZE_DEFAULT; 418 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 if (!fsopt->snapdir_name) { ··· 424 425 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 426 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 427 - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 428 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 429 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 430 fsopt->congestion_kb = default_congestion_kb(); ··· 529 seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); 530 if (fsopt->wsize) 531 seq_printf(m, ",wsize=%d", fsopt->wsize); 532 - if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 533 seq_printf(m, ",rsize=%d", fsopt->rsize); 534 if (fsopt->rasize != CEPH_RASIZE_DEFAULT) 535 seq_printf(m, ",rasize=%d", fsopt->rasize); ··· 541 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) 542 seq_printf(m, ",caps_wanted_delay_max=%d", 543 fsopt->caps_wanted_delay_max); 544 - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) 545 - seq_printf(m, ",cap_release_safety=%d", 546 - fsopt->cap_release_safety); 547 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) 548 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); 549 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) ··· 594 } 595 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 596 597 - if (fsopt->mds_namespace == NULL) { 598 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 599 0, true); 600 } else { ··· 615 * to be processed in parallel, limit concurrency. 
616 */ 617 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); 618 - if (fsc->wb_wq == NULL) 619 goto fail_client; 620 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); 621 - if (fsc->pg_inv_wq == NULL) 622 goto fail_wb_wq; 623 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); 624 - if (fsc->trunc_wq == NULL) 625 goto fail_pg_inv_wq; 626 627 /* set up mempools */ ··· 692 __alignof__(struct ceph_inode_info), 693 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| 694 SLAB_ACCOUNT, ceph_inode_init_once); 695 - if (ceph_inode_cachep == NULL) 696 return -ENOMEM; 697 698 ceph_cap_cachep = KMEM_CACHE(ceph_cap, 699 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 700 - if (ceph_cap_cachep == NULL) 701 goto bad_cap; 702 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, 703 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 704 - if (ceph_cap_flush_cachep == NULL) 705 goto bad_cap_flush; 706 707 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, 708 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 709 - if (ceph_dentry_cachep == NULL) 710 goto bad_dentry; 711 712 ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); 713 714 - if (ceph_file_cachep == NULL) 715 goto bad_file; 716 717 if ((error = ceph_fscache_register())) ··· 965 return err; 966 967 /* set ra_pages based on rasize mount option? */ 968 - if (fsc->mount_options->rasize >= PAGE_SIZE) 969 - sb->s_bdi->ra_pages = 970 - (fsc->mount_options->rasize + PAGE_SIZE - 1) 971 - >> PAGE_SHIFT; 972 - else 973 - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; 974 975 - if (fsc->mount_options->rsize > fsc->mount_options->rasize && 976 - fsc->mount_options->rsize >= PAGE_SIZE) 977 - sb->s_bdi->io_pages = 978 - (fsc->mount_options->rsize + PAGE_SIZE - 1) 979 - >> PAGE_SHIFT; 980 - else if (fsc->mount_options->rsize == 0) 981 - sb->s_bdi->io_pages = ULONG_MAX; 982 983 return 0; 984 }
··· 49 struct ceph_statfs st; 50 u64 fsid; 51 int err; 52 + u64 data_pool; 53 + 54 + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { 55 + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; 56 + } else { 57 + data_pool = CEPH_NOPOOL; 58 + } 59 60 dout("statfs\n"); 61 + err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); 62 if (err < 0) 63 return err; 64 ··· 113 Opt_rasize, 114 Opt_caps_wanted_delay_min, 115 Opt_caps_wanted_delay_max, 116 Opt_readdir_max_entries, 117 Opt_readdir_max_bytes, 118 Opt_congestion_kb, ··· 152 {Opt_rasize, "rasize=%d"}, 153 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 154 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 155 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 156 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 157 {Opt_congestion_kb, "write_congestion_kb=%d"}, ··· 235 break; 236 /* misc */ 237 case Opt_wsize: 238 + if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) 239 + return -EINVAL; 240 + fsopt->wsize = ALIGN(intval, PAGE_SIZE); 241 break; 242 case Opt_rsize: 243 + if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) 244 + return -EINVAL; 245 + fsopt->rsize = ALIGN(intval, PAGE_SIZE); 246 break; 247 case Opt_rasize: 248 + if (intval < 0) 249 + return -EINVAL; 250 + fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); 251 break; 252 case Opt_caps_wanted_delay_min: 253 + if (intval < 1) 254 + return -EINVAL; 255 fsopt->caps_wanted_delay_min = intval; 256 break; 257 case Opt_caps_wanted_delay_max: 258 + if (intval < 1) 259 + return -EINVAL; 260 fsopt->caps_wanted_delay_max = intval; 261 break; 262 case Opt_readdir_max_entries: 263 + if (intval < 1) 264 + return -EINVAL; 265 fsopt->max_readdir = intval; 266 break; 267 case Opt_readdir_max_bytes: 268 + if (intval < PAGE_SIZE && intval != 0) 269 + return -EINVAL; 270 fsopt->max_readdir_bytes = intval; 271 break; 272 case Opt_congestion_kb: 273 + if (intval < 1024) /* at least 1M */ 274 + return -EINVAL; 275 fsopt->congestion_kb = intval; 276 break; 277 case Opt_dirstat: ··· 392 fsopt->sb_flags = flags; 393 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 394 395 + fsopt->wsize = CEPH_MAX_WRITE_SIZE; 396 + fsopt->rsize = CEPH_MAX_READ_SIZE; 397 fsopt->rasize = CEPH_RASIZE_DEFAULT; 398 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 399 if (!fsopt->snapdir_name) { ··· 402 403 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 404 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 405 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 406 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 407 fsopt->congestion_kb = default_congestion_kb(); ··· 508 seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); 509 if (fsopt->wsize) 510 seq_printf(m, ",wsize=%d", fsopt->wsize); 511 + if (fsopt->rsize != CEPH_MAX_READ_SIZE) 512 seq_printf(m, ",rsize=%d", fsopt->rsize); 513 if (fsopt->rasize != CEPH_RASIZE_DEFAULT) 514 seq_printf(m, ",rasize=%d", fsopt->rasize); ··· 520 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) 521 seq_printf(m, ",caps_wanted_delay_max=%d", 522 fsopt->caps_wanted_delay_max); 523 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) 524 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); 525 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) ··· 576 } 577 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 578 579 + if (!fsopt->mds_namespace) { 580 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 581 0, true); 582 } else { ··· 597 * to be 
processed in parallel, limit concurrency. 598 */ 599 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); 600 + if (!fsc->wb_wq) 601 goto fail_client; 602 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); 603 + if (!fsc->pg_inv_wq) 604 goto fail_wb_wq; 605 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); 606 + if (!fsc->trunc_wq) 607 goto fail_pg_inv_wq; 608 609 /* set up mempools */ ··· 674 __alignof__(struct ceph_inode_info), 675 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| 676 SLAB_ACCOUNT, ceph_inode_init_once); 677 + if (!ceph_inode_cachep) 678 return -ENOMEM; 679 680 ceph_cap_cachep = KMEM_CACHE(ceph_cap, 681 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 682 + if (!ceph_cap_cachep) 683 goto bad_cap; 684 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, 685 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 686 + if (!ceph_cap_flush_cachep) 687 goto bad_cap_flush; 688 689 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, 690 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 691 + if (!ceph_dentry_cachep) 692 goto bad_dentry; 693 694 ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); 695 696 + if (!ceph_file_cachep) 697 goto bad_file; 698 699 if ((error = ceph_fscache_register())) ··· 947 return err; 948 949 /* set ra_pages based on rasize mount option? */ 950 + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; 951 952 + /* set io_pages based on max osd read size */ 953 + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; 954 955 return 0; 956 }
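With rsize and rasize now validated and page-aligned up front, the bdi setup above reduces to a plain byte-to-page shift. A small worked example of that arithmetic, assuming 4 KiB pages (PAGE_SHIFT == 12) and the 16 MB read cap noted with the fs/ceph/super.h hunk below; this is a stand-alone check, not kernel code:

    #include <assert.h>

    #define PAGE_SHIFT 12                           /* assumes 4 KiB pages */

    int main(void)
    {
            unsigned long rasize = 8192 * 1024;      /* CEPH_RASIZE_DEFAULT */
            unsigned long rsize = 16 * 1024 * 1024;  /* CEPH_MAX_READ_SIZE, assumed 16 MB */

            assert((rasize >> PAGE_SHIFT) == 2048);  /* sb->s_bdi->ra_pages */
            assert((rsize >> PAGE_SHIFT) == 4096);   /* sb->s_bdi->io_pages */
            return 0;
    }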
+14 -2
fs/ceph/super.h
··· 46 #define ceph_test_mount_opt(fsc, opt) \ 47 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 48 49 - #define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ 50 #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ 51 #define CEPH_MAX_READDIR_DEFAULT 1024 52 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 53 #define CEPH_SNAPDIRNAME_DEFAULT ".snap" 54 55 struct ceph_mount_options { 56 int flags; ··· 74 int rasize; /* max readahead */ 75 int congestion_kb; /* max writeback in flight */ 76 int caps_wanted_delay_min, caps_wanted_delay_max; 77 - int cap_release_safety; 78 int max_readdir; /* max readdir result (entries) */ 79 int max_readdir_bytes; /* max readdir result (bytes) */ 80
··· 46 #define ceph_test_mount_opt(fsc, opt) \ 47 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 48 49 + /* max size of osd read request, limited by libceph */ 50 + #define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN 51 + /* osd has a configurable limitation on max write size. 52 + * CEPH_MSG_MAX_DATA_LEN should be small enough. */ 53 + #define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN 54 #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ 55 #define CEPH_MAX_READDIR_DEFAULT 1024 56 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 57 #define CEPH_SNAPDIRNAME_DEFAULT ".snap" 58 + 59 + /* 60 + * Delay telling the MDS we no longer want caps, in case we reopen 61 + * the file. Delay a minimum amount of time, even if we send a cap 62 + * message for some other reason. Otherwise, take the opportunity to 63 + * update the mds to avoid sending another message later. 64 + */ 65 + #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 66 + #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 67 68 struct ceph_mount_options { 69 int flags; ··· 61 int rasize; /* max readahead */ 62 int congestion_kb; /* max writeback in flight */ 63 int caps_wanted_delay_min, caps_wanted_delay_max; 64 int max_readdir; /* max readdir result (entries) */ 65 int max_readdir_bytes; /* max readdir result (bytes) */ 66
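Both new caps resolve to libceph's per-message data limit, so an individual OSD read or write request can never be larger than what a single message carries. Assuming CEPH_MSG_MAX_DATA_LEN keeps its 16 MB definition in include/linux/ceph/libceph.h, the relationship is simply:

    /* assumed current value, from include/linux/ceph/libceph.h */
    #define CEPH_MSG_MAX_DATA_LEN   (16*1024*1024)

    #define CEPH_MAX_READ_SIZE      CEPH_MSG_MAX_DATA_LEN   /* 16 MB per OSD read  */
    #define CEPH_MAX_WRITE_SIZE     CEPH_MSG_MAX_DATA_LEN   /* 16 MB per OSD write */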
+4 -4
fs/ceph/xattr.c
··· 777 spin_unlock(&ci->i_ceph_lock); 778 779 /* security module gets xattr while filling trace */ 780 - if (current->journal_info != NULL) { 781 pr_warn_ratelimited("sync getxattr %p " 782 "during filling trace\n", inode); 783 return -EBUSY; ··· 809 810 memcpy(value, xattr->val, xattr->val_len); 811 812 - if (current->journal_info != NULL && 813 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 814 ci->i_ceph_flags |= CEPH_I_SEC_INITED; 815 out: ··· 1058 up_read(&mdsc->snap_rwsem); 1059 1060 /* security module set xattr while filling trace */ 1061 - if (current->journal_info != NULL) { 1062 pr_warn_ratelimited("sync setxattr %p " 1063 "during filling trace\n", inode); 1064 err = -EBUSY; ··· 1108 { 1109 struct ceph_inode_info *ci; 1110 bool ret; 1111 - if (in->i_security == NULL) 1112 return false; 1113 ci = ceph_inode(in); 1114 spin_lock(&ci->i_ceph_lock);
··· 777 spin_unlock(&ci->i_ceph_lock); 778 779 /* security module gets xattr while filling trace */ 780 + if (current->journal_info) { 781 pr_warn_ratelimited("sync getxattr %p " 782 "during filling trace\n", inode); 783 return -EBUSY; ··· 809 810 memcpy(value, xattr->val, xattr->val_len); 811 812 + if (current->journal_info && 813 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 814 ci->i_ceph_flags |= CEPH_I_SEC_INITED; 815 out: ··· 1058 up_read(&mdsc->snap_rwsem); 1059 1060 /* security module set xattr while filling trace */ 1061 + if (current->journal_info) { 1062 pr_warn_ratelimited("sync setxattr %p " 1063 "during filling trace\n", inode); 1064 err = -EBUSY; ··· 1108 { 1109 struct ceph_inode_info *ci; 1110 bool ret; 1111 + if (!in->i_security) 1112 return false; 1113 ci = ceph_inode(in); 1114 spin_lock(&ci->i_ceph_lock);
+5 -1
include/linux/ceph/ceph_fs.h
··· 167 struct ceph_mon_statfs { 168 struct ceph_mon_request_header monhdr; 169 struct ceph_fsid fsid; 170 } __attribute__ ((packed)); 171 172 struct ceph_statfs { ··· 671 extern const char *ceph_cap_op_name(int op); 672 673 /* flags field in client cap messages (version >= 10) */ 674 - #define CEPH_CLIENT_CAPS_SYNC (0x1) 675 676 /* 677 * caps message, used for capability callbacks, acks, requests, etc.
··· 167 struct ceph_mon_statfs { 168 struct ceph_mon_request_header monhdr; 169 struct ceph_fsid fsid; 170 + __u8 contains_data_pool; 171 + __le64 data_pool; 172 } __attribute__ ((packed)); 173 174 struct ceph_statfs { ··· 669 extern const char *ceph_cap_op_name(int op); 670 671 /* flags field in client cap messages (version >= 10) */ 672 + #define CEPH_CLIENT_CAPS_SYNC (1<<0) 673 + #define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1) 674 + #define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2) 675 676 /* 677 * caps message, used for capability callbacks, acks, requests, etc.
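The client cap-message flags grow two cap-snap related bits and are written as explicit bit shifts, so several flags can be OR'd into the same message. A hedged usage sketch (the local variable and conditions are placeholders, not code from this series):

    /* sketch: flags are independent bits, so they can be combined */
    u32 flags = 0;

    if (sync_flush)                          /* placeholder condition */
            flags |= CEPH_CLIENT_CAPS_SYNC;
    if (have_pending_capsnap)                /* placeholder condition */
            flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
    /* 'flags' then goes into the version >= 10 cap message flags field */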
-11
include/linux/ceph/libceph.h
··· 84 85 #define CEPH_AUTH_NAME_DEFAULT "guest" 86 87 - /* 88 - * Delay telling the MDS we no longer want caps, in case we reopen 89 - * the file. Delay a minimum amount of time, even if we send a cap 90 - * message for some other reason. Otherwise, take the oppotunity to 91 - * update the mds to avoid sending another message later. 92 - */ 93 - #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 94 - #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 95 - 96 - #define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) 97 - 98 /* mount state */ 99 enum { 100 CEPH_MOUNT_MOUNTING,
··· 84 85 #define CEPH_AUTH_NAME_DEFAULT "guest" 86 87 /* mount state */ 88 enum { 89 CEPH_MOUNT_MOUNTING,
+2 -2
include/linux/ceph/mon_client.h
··· 133 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 134 unsigned long timeout); 135 136 - extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 137 - struct ceph_statfs *buf); 138 139 int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, 140 u64 *newest);
··· 133 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 134 unsigned long timeout); 135 136 + int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, 137 + struct ceph_statfs *buf); 138 139 int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, 140 u64 *newest);
-1
include/linux/ceph/rados.h
··· 230 \ 231 /* fancy write */ \ 232 f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ 233 - f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ 234 f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ 235 f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \ 236 \
··· 230 \ 231 /* fancy write */ \ 232 f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ 233 f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ 234 f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \ 235 \
+5 -1
net/ceph/mon_client.c
··· 676 /* 677 * Do a synchronous statfs(). 678 */ 679 - int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 680 { 681 struct ceph_mon_generic_request *req; 682 struct ceph_mon_statfs *h; ··· 697 goto out; 698 699 req->u.st = buf; 700 701 mutex_lock(&monc->mutex); 702 register_generic_request(req); ··· 707 h->monhdr.session_mon = cpu_to_le16(-1); 708 h->monhdr.session_mon_tid = 0; 709 h->fsid = monc->monmap->fsid; 710 send_generic_request(monc, req); 711 mutex_unlock(&monc->mutex); 712
··· 676 /* 677 * Do a synchronous statfs(). 678 */ 679 + int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, 680 + struct ceph_statfs *buf) 681 { 682 struct ceph_mon_generic_request *req; 683 struct ceph_mon_statfs *h; ··· 696 goto out; 697 698 req->u.st = buf; 699 + req->request->hdr.version = cpu_to_le16(2); 700 701 mutex_lock(&monc->mutex); 702 register_generic_request(req); ··· 705 h->monhdr.session_mon = cpu_to_le16(-1); 706 h->monhdr.session_mon_tid = 0; 707 h->fsid = monc->monmap->fsid; 708 + h->contains_data_pool = (data_pool != CEPH_NOPOOL); 709 + h->data_pool = cpu_to_le64(data_pool); 710 send_generic_request(monc, req); 711 mutex_unlock(&monc->mutex); 712
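On the filesystem side the caller chooses the pool id: when the MDS map lists exactly one data pool, that pool is passed so the reported numbers can reflect it rather than the whole cluster, while CEPH_NOPOOL keeps the old cluster-wide totals (see the ceph_statfs() hunk in fs/ceph/super.c above). A minimal sketch of such a call, with an illustrative function name:

    /* sketch: how a CephFS statfs path picks the pool to query */
    static int statfs_sketch(struct ceph_fs_client *fsc, struct ceph_statfs *st)
    {
            u64 data_pool = CEPH_NOPOOL;    /* cluster-wide totals by default */

            if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1)
                    data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];

            return ceph_monc_do_statfs(&fsc->client->monc, data_pool, st);
    }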
-5
net/ceph/osd_client.c
··· 863 dst->cls.method_len = src->cls.method_len; 864 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); 865 break; 866 - case CEPH_OSD_OP_STARTSYNC: 867 - break; 868 case CEPH_OSD_OP_WATCH: 869 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 870 dst->watch.ver = cpu_to_le64(0); ··· 914 * if the file was recently truncated, we include information about its 915 * old and new size so that the object can be updated appropriately. (we 916 * avoid synchronously deleting truncated objects because it's slow.) 917 - * 918 - * if @do_sync, include a 'startsync' command so that the osd will flush 919 - * data quickly. 920 */ 921 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 922 struct ceph_file_layout *layout,
··· 863 dst->cls.method_len = src->cls.method_len; 864 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); 865 break; 866 case CEPH_OSD_OP_WATCH: 867 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 868 dst->watch.ver = cpu_to_le64(0); ··· 916 * if the file was recently truncated, we include information about its 917 * old and new size so that the object can be updated appropriately. (we 918 * avoid synchronously deleting truncated objects because it's slow.) 919 */ 920 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 921 struct ceph_file_layout *layout,