Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"The highlights are:

- automatic recovery of a blacklisted filesystem session (Zheng Yan).
This is disabled by default and can be enabled by mounting with the
new "recover_session=clean" option.

- serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is
taken to avoid serializing O_DIRECT reads and writes with each
other, this is based on the exclusion scheme from NFS.

- handle large osdmaps better in the face of fragmented memory
(myself)

- don't limit which security.* xattrs can be retrieved or set (Jeff Layton).
We were overly restrictive here, unnecessarily preventing things
like file capability sets stored in security.capability from
working.

- allow copy_file_range() within the same inode and across different
filesystems within the same cluster (Luis Henriques)"

* tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits)
ceph: call ceph_mdsc_destroy from destroy_fs_client
libceph: use ceph_kvmalloc() for osdmap arrays
libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()
ceph: allow object copies across different filesystems in the same cluster
ceph: include ceph_debug.h in cache.c
ceph: move static keyword to the front of declarations
rbd: pull rbd_img_request_create() dout out into the callers
ceph: reconnect connection if session hang in opening state
libceph: drop unused con parameter of calc_target()
ceph: use release_pages() directly
rbd: fix response length parameter for encoded strings
ceph: allow arbitrary security.* xattrs
ceph: only set CEPH_I_SEC_INITED if we got a MAC label
ceph: turn ceph_security_invalidate_secctx into static inline
ceph: add buffered/direct exclusionary locking for reads and writes
libceph: handle OSD op ceph_pagelist_append() errors
ceph: don't return a value from void function
ceph: don't freeze during write page faults
ceph: update the mtime when truncating up
ceph: fix indentation in __get_snap_name()
...

+764 -382
+14
Documentation/filesystems/ceph.txt
··· 158 158 copies. Currently, it's only used in copy_file_range, which will revert 159 159 to the default VFS implementation if this option is used. 160 160 161 + recover_session=<no|clean> 162 + Set auto reconnect mode in the case where the client is blacklisted. The 163 + available modes are "no" and "clean". The default is "no". 164 + 165 + * no: never attempt to reconnect when client detects that it has been 166 + blacklisted. Operations will generally fail after being blacklisted. 167 + 168 + * clean: client reconnects to the ceph cluster automatically when it 169 + detects that it has been blacklisted. During reconnect, client drops 170 + dirty data/metadata, invalidates page caches and writable file handles. 171 + After reconnect, file locks become stale because the MDS loses track 172 + of them. If an inode contains any stale file locks, read/write on the 173 + inode is not allowed until applications release all stale file locks. 174 + 161 175 More Information 162 176 ================ 163 177
+12 -6
drivers/block/rbd.c
··· 1754 1754 mutex_init(&img_request->state_mutex); 1755 1755 kref_init(&img_request->kref); 1756 1756 1757 - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, 1758 - obj_op_name(op_type), img_request); 1759 1757 return img_request; 1760 1758 } 1761 1759 ··· 2941 2943 2942 2944 __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2943 2945 child_img_req->obj_request = obj_req; 2946 + 2947 + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, 2948 + obj_req); 2944 2949 2945 2950 if (!rbd_img_is_write(img_req)) { 2946 2951 switch (img_req->data_type) { ··· 4878 4877 img_request->rq = rq; 4879 4878 snapc = NULL; /* img_request consumes a ref */ 4880 4879 4880 + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, 4881 + img_request, obj_op_name(op_type), offset, length); 4882 + 4881 4883 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 4882 4884 result = rbd_img_fill_nodata(img_request, offset, length); 4883 4885 else ··· 5673 5669 5674 5670 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 5675 5671 { 5672 + size_t size; 5676 5673 void *reply_buf; 5677 5674 int ret; 5678 5675 void *p; 5679 5676 5680 - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 5677 + /* Response will be an encoded string, which includes a length */ 5678 + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; 5679 + reply_buf = kzalloc(size, GFP_KERNEL); 5681 5680 if (!reply_buf) 5682 5681 return -ENOMEM; 5683 5682 5684 5683 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5685 5684 &rbd_dev->header_oloc, "get_object_prefix", 5686 - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 5685 + NULL, 0, reply_buf, size); 5687 5686 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5688 5687 if (ret < 0) 5689 5688 goto out; ··· 6703 6696 dout("rbd id object name is %s\n", oid.name); 6704 6697 6705 6698 /* Response will be an encoded string, which includes a length */ 6706 - 6707 6699 size = sizeof (__le32) + 
RBD_IMAGE_ID_LEN_MAX; 6708 6700 response = kzalloc(size, GFP_NOIO); 6709 6701 if (!response) { ··· 6714 6708 6715 6709 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6716 6710 "get_id", NULL, 0, 6717 - response, RBD_IMAGE_ID_LEN_MAX); 6711 + response, size); 6718 6712 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6719 6713 if (ret == -ENOENT) { 6720 6714 image_id = kstrdup("", GFP_KERNEL);
+1 -1
fs/ceph/Makefile
··· 6 6 obj-$(CONFIG_CEPH_FS) += ceph.o 7 7 8 8 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9 - export.o caps.o snap.o xattr.o quota.o \ 9 + export.o caps.o snap.o xattr.o quota.o io.o \ 10 10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 11 11 debugfs.o 12 12
+30 -31
fs/ceph/addr.c
··· 189 189 { 190 190 struct inode *inode = file_inode(filp); 191 191 struct ceph_inode_info *ci = ceph_inode(inode); 192 - struct ceph_osd_client *osdc = 193 - &ceph_inode_to_client(inode)->client->osdc; 192 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 194 193 int err = 0; 195 194 u64 off = page_offset(page); 196 195 u64 len = PAGE_SIZE; ··· 218 219 219 220 dout("readpage inode %p file %p page %p index %lu\n", 220 221 inode, filp, page, page->index); 221 - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 222 - off, &len, 222 + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 223 + &ci->i_layout, off, &len, 223 224 ci->i_truncate_seq, ci->i_truncate_size, 224 225 &page, 1, 0); 225 226 if (err == -ENOENT) ··· 227 228 if (err < 0) { 228 229 SetPageError(page); 229 230 ceph_fscache_readpage_cancel(inode, page); 231 + if (err == -EBLACKLISTED) 232 + fsc->blacklisted = true; 230 233 goto out; 231 234 } 232 235 if (err < PAGE_SIZE) ··· 267 266 int i; 268 267 269 268 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 269 + if (rc == -EBLACKLISTED) 270 + ceph_inode_to_client(inode)->blacklisted = true; 270 271 271 272 /* unlock all pages, zeroing any data we didn't read */ 272 273 osd_data = osd_req_op_extent_osd_data(req, 0); ··· 326 323 /* caller of readpages does not hold buffer and read caps 327 324 * (fadvise, madvise and readahead cases) */ 328 325 int want = CEPH_CAP_FILE_CACHE; 329 - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); 326 + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, 327 + true, &got); 330 328 if (ret < 0) { 331 329 dout("start_read %p, error getting cap\n", inode); 332 330 } else if (!(got & want)) { ··· 573 569 /* 574 570 * Write a single page, but leave the page locked. 
575 571 * 576 - * If we get a write error, set the page error bit, but still adjust the 572 + * If we get a write error, mark the mapping for error, but still adjust the 577 573 * dirty page accounting (i.e., page is no longer dirty). 578 574 */ 579 575 static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ··· 644 640 end_page_writeback(page); 645 641 return err; 646 642 } 643 + if (err == -EBLACKLISTED) 644 + fsc->blacklisted = true; 647 645 dout("writepage setting page/mapping error %d %p\n", 648 646 err, page); 649 - SetPageError(page); 650 647 mapping_set_error(&inode->i_data, err); 651 648 wbc->pages_skipped++; 652 649 } else { ··· 685 680 } 686 681 687 682 /* 688 - * lame release_pages helper. release_pages() isn't exported to 689 - * modules. 690 - */ 691 - static void ceph_release_pages(struct page **pages, int num) 692 - { 693 - struct pagevec pvec; 694 - int i; 695 - 696 - pagevec_init(&pvec); 697 - for (i = 0; i < num; i++) { 698 - if (pagevec_add(&pvec, pages[i]) == 0) 699 - pagevec_release(&pvec); 700 - } 701 - pagevec_release(&pvec); 702 - } 703 - 704 - /* 705 683 * async writeback completion handler. 706 684 * 707 685 * If we get an error, set the mapping error bit, but not the individual ··· 708 720 if (rc < 0) { 709 721 mapping_set_error(mapping, rc); 710 722 ceph_set_error_write(ci); 723 + if (rc == -EBLACKLISTED) 724 + fsc->blacklisted = true; 711 725 } else { 712 726 ceph_clear_error_write(ci); 713 727 } ··· 759 769 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", 760 770 inode, osd_data->length, rc >= 0 ? 
num_pages : 0); 761 771 762 - ceph_release_pages(osd_data->pages, num_pages); 772 + release_pages(osd_data->pages, num_pages); 763 773 } 764 774 765 775 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); ··· 1442 1452 want = CEPH_CAP_FILE_CACHE; 1443 1453 1444 1454 got = 0; 1445 - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1455 + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, 1456 + &got, &pinned_page); 1446 1457 if (err < 0) 1447 1458 goto out_restore; 1448 1459 ··· 1531 1540 if (!prealloc_cf) 1532 1541 return VM_FAULT_OOM; 1533 1542 1543 + sb_start_pagefault(inode->i_sb); 1534 1544 ceph_block_sigs(&oldset); 1535 1545 1536 1546 if (ci->i_inline_version != CEPH_INLINE_NONE) { ··· 1560 1568 want = CEPH_CAP_FILE_BUFFER; 1561 1569 1562 1570 got = 0; 1563 - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1571 + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, 1564 1572 &got, NULL); 1565 1573 if (err < 0) 1566 1574 goto out_free; ··· 1606 1614 ceph_put_cap_refs(ci, got); 1607 1615 out_free: 1608 1616 ceph_restore_sigs(&oldset); 1617 + sb_end_pagefault(inode->i_sb); 1609 1618 ceph_free_cap_flush(prealloc_cf); 1610 1619 if (err < 0) 1611 1620 ret = vmf_error(err); ··· 1939 1946 1940 1947 if (err >= 0 || err == -ENOENT) 1941 1948 have |= POOL_READ; 1942 - else if (err != -EPERM) 1949 + else if (err != -EPERM) { 1950 + if (err == -EBLACKLISTED) 1951 + fsc->blacklisted = true; 1943 1952 goto out_unlock; 1953 + } 1944 1954 1945 1955 if (err2 == 0 || err2 == -EEXIST) 1946 1956 have |= POOL_WRITE; 1947 1957 else if (err2 != -EPERM) { 1958 + if (err2 == -EBLACKLISTED) 1959 + fsc->blacklisted = true; 1948 1960 err = err2; 1949 1961 goto out_unlock; 1950 1962 } ··· 1987 1989 return err; 1988 1990 } 1989 1991 1990 - int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) 1992 + int ceph_pool_perm_check(struct inode *inode, int need) 1991 1993 { 1992 - s64 pool; 1994 + struct ceph_inode_info *ci = 
ceph_inode(inode); 1993 1995 struct ceph_string *pool_ns; 1996 + s64 pool; 1994 1997 int ret, flags; 1995 1998 1996 1999 if (ci->i_vino.snap != CEPH_NOSNAP) { ··· 2003 2004 return 0; 2004 2005 } 2005 2006 2006 - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), 2007 + if (ceph_test_mount_opt(ceph_inode_to_client(inode), 2007 2008 NOPOOLPERM)) 2008 2009 return 0; 2009 2010
+2
fs/ceph/cache.c
··· 6 6 * Written by Milosz Tanski (milosz@adfin.com) 7 7 */ 8 8 9 + #include <linux/ceph/ceph_debug.h> 10 + 9 11 #include "super.h" 10 12 #include "cache.h" 11 13
+88 -85
fs/ceph/caps.c
··· 458 458 } 459 459 460 460 /* 461 - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. 462 - */ 463 - static int __ceph_get_cap_mds(struct ceph_inode_info *ci) 464 - { 465 - struct ceph_cap *cap; 466 - int mds = -1; 467 - struct rb_node *p; 468 - 469 - /* prefer mds with WR|BUFFER|EXCL caps */ 470 - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 471 - cap = rb_entry(p, struct ceph_cap, ci_node); 472 - mds = cap->mds; 473 - if (cap->issued & (CEPH_CAP_FILE_WR | 474 - CEPH_CAP_FILE_BUFFER | 475 - CEPH_CAP_FILE_EXCL)) 476 - break; 477 - } 478 - return mds; 479 - } 480 - 481 - int ceph_get_cap_mds(struct inode *inode) 482 - { 483 - struct ceph_inode_info *ci = ceph_inode(inode); 484 - int mds; 485 - spin_lock(&ci->i_ceph_lock); 486 - mds = __ceph_get_cap_mds(ceph_inode(inode)); 487 - spin_unlock(&ci->i_ceph_lock); 488 - return mds; 489 - } 490 - 491 - /* 492 461 * Called under i_ceph_lock. 493 462 */ 494 463 static void __insert_cap_node(struct ceph_inode_info *ci, ··· 597 628 /* 598 629 * Add a capability under the given MDS session. 599 630 * 600 - * Caller should hold session snap_rwsem (read) and s_mutex. 631 + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock 601 632 * 602 633 * @fmode is the open file mode, if we are opening a file, otherwise 603 634 * it is < 0. 
(This is so we can atomically add the cap and add an ··· 614 645 struct ceph_cap *cap; 615 646 int mds = session->s_mds; 616 647 int actual_wanted; 648 + u32 gen; 649 + 650 + lockdep_assert_held(&ci->i_ceph_lock); 617 651 618 652 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, 619 653 session->s_mds, cap_id, ceph_cap_string(issued), seq); ··· 627 655 */ 628 656 if (fmode >= 0) 629 657 wanted |= ceph_caps_for_mode(fmode); 658 + 659 + spin_lock(&session->s_gen_ttl_lock); 660 + gen = session->s_cap_gen; 661 + spin_unlock(&session->s_gen_ttl_lock); 630 662 631 663 cap = __get_cap_for_mds(ci, mds); 632 664 if (!cap) { ··· 657 681 list_move_tail(&cap->session_caps, &session->s_caps); 658 682 spin_unlock(&session->s_cap_lock); 659 683 660 - if (cap->cap_gen < session->s_cap_gen) 684 + if (cap->cap_gen < gen) 661 685 cap->issued = cap->implemented = CEPH_CAP_PIN; 662 686 663 687 /* ··· 751 775 cap->seq = seq; 752 776 cap->issue_seq = seq; 753 777 cap->mseq = mseq; 754 - cap->cap_gen = session->s_cap_gen; 778 + cap->cap_gen = gen; 755 779 756 780 if (fmode >= 0) 757 781 __ceph_get_fmode(ci, fmode); ··· 1260 1284 * Make note of max_size reported/requested from mds, revoked caps 1261 1285 * that have now been implemented. 1262 1286 * 1263 - * Make half-hearted attempt ot to invalidate page cache if we are 1264 - * dropping RDCACHE. Note that this will leave behind locked pages 1265 - * that we'll then need to deal with elsewhere. 1266 - * 1267 1287 * Return non-zero if delayed release, or we experienced an error 1268 1288 * such that the caller should requeue + retry later. 1269 1289 * ··· 1718 1746 * Add dirty inode to the flushing list. Assigned a seq number so we 1719 1747 * can wait for caps to flush without starving. 1720 1748 * 1721 - * Called under i_ceph_lock. 1749 + * Called under i_ceph_lock. Returns the flush tid. 
1722 1750 */ 1723 - static int __mark_caps_flushing(struct inode *inode, 1751 + static u64 __mark_caps_flushing(struct inode *inode, 1724 1752 struct ceph_mds_session *session, bool wake, 1725 - u64 *flush_tid, u64 *oldest_flush_tid) 1753 + u64 *oldest_flush_tid) 1726 1754 { 1727 1755 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1728 1756 struct ceph_inode_info *ci = ceph_inode(inode); ··· 1761 1789 1762 1790 list_add_tail(&cf->i_list, &ci->i_cap_flush_list); 1763 1791 1764 - *flush_tid = cf->tid; 1765 - return flushing; 1792 + return cf->tid; 1766 1793 } 1767 1794 1768 1795 /* ··· 1999 2028 } 2000 2029 2001 2030 ack: 2002 - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 2003 - dout(" skipping %p I_NOFLUSH set\n", inode); 2004 - continue; 2005 - } 2006 - 2007 2031 if (session && session != cap->session) { 2008 2032 dout("oops, wrong session %p mutex\n", session); 2009 2033 mutex_unlock(&session->s_mutex); ··· 2046 2080 } 2047 2081 2048 2082 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 2049 - flushing = __mark_caps_flushing(inode, session, false, 2050 - &flush_tid, 2051 - &oldest_flush_tid); 2083 + flushing = ci->i_dirty_caps; 2084 + flush_tid = __mark_caps_flushing(inode, session, false, 2085 + &oldest_flush_tid); 2052 2086 } else { 2053 2087 flushing = 0; 2054 2088 flush_tid = 0; ··· 2096 2130 retry: 2097 2131 spin_lock(&ci->i_ceph_lock); 2098 2132 retry_locked: 2099 - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 2100 - spin_unlock(&ci->i_ceph_lock); 2101 - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); 2102 - goto out; 2103 - } 2104 2133 if (ci->i_dirty_caps && ci->i_auth_cap) { 2105 2134 struct ceph_cap *cap = ci->i_auth_cap; 2106 2135 int delayed; 2107 2136 2108 - if (!session || session != cap->session) { 2137 + if (session != cap->session) { 2109 2138 spin_unlock(&ci->i_ceph_lock); 2110 2139 if (session) 2111 2140 mutex_unlock(&session->s_mutex); ··· 2122 2161 goto retry_locked; 2123 2162 } 2124 2163 2125 - flushing = 
__mark_caps_flushing(inode, session, true, 2126 - &flush_tid, &oldest_flush_tid); 2164 + flushing = ci->i_dirty_caps; 2165 + flush_tid = __mark_caps_flushing(inode, session, true, 2166 + &oldest_flush_tid); 2127 2167 2128 2168 /* __send_cap drops i_ceph_lock */ 2129 2169 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, ··· 2223 2261 2224 2262 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 2225 2263 { 2264 + struct ceph_file_info *fi = file->private_data; 2226 2265 struct inode *inode = file->f_mapping->host; 2227 2266 struct ceph_inode_info *ci = ceph_inode(inode); 2228 2267 u64 flush_tid; 2229 - int ret; 2268 + int ret, err; 2230 2269 int dirty; 2231 2270 2232 2271 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2233 2272 2234 2273 ret = file_write_and_wait_range(file, start, end); 2235 - if (ret < 0) 2236 - goto out; 2237 - 2238 2274 if (datasync) 2239 2275 goto out; 2240 2276 2241 2277 dirty = try_flush_caps(inode, &flush_tid); 2242 2278 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 2243 2279 2244 - ret = unsafe_request_wait(inode); 2280 + err = unsafe_request_wait(inode); 2245 2281 2246 2282 /* 2247 2283 * only wait on non-file metadata writeback (the mds 2248 2284 * can recover size and mtime, so we don't need to 2249 2285 * wait for that) 2250 2286 */ 2251 - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 2252 - ret = wait_event_interruptible(ci->i_cap_wq, 2287 + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 2288 + err = wait_event_interruptible(ci->i_cap_wq, 2253 2289 caps_are_flushed(inode, flush_tid)); 2290 + } 2291 + 2292 + if (err < 0) 2293 + ret = err; 2294 + 2295 + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { 2296 + spin_lock(&file->f_lock); 2297 + err = errseq_check_and_advance(&ci->i_meta_err, 2298 + &fi->meta_err); 2299 + spin_unlock(&file->f_lock); 2300 + if (err < 0) 2301 + ret = err; 2254 2302 } 2255 2303 out: 2256 2304 dout("fsync %p%s result=%d\n", inode, datasync ? 
" datasync" : "", ret); ··· 2532 2560 * 2533 2561 * FIXME: how does a 0 return differ from -EAGAIN? 2534 2562 */ 2535 - static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2536 - loff_t endoff, bool nonblock, int *got) 2563 + enum { 2564 + NON_BLOCKING = 1, 2565 + CHECK_FILELOCK = 2, 2566 + }; 2567 + 2568 + static int try_get_cap_refs(struct inode *inode, int need, int want, 2569 + loff_t endoff, int flags, int *got) 2537 2570 { 2538 - struct inode *inode = &ci->vfs_inode; 2571 + struct ceph_inode_info *ci = ceph_inode(inode); 2539 2572 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2540 2573 int ret = 0; 2541 2574 int have, implemented; ··· 2552 2575 2553 2576 again: 2554 2577 spin_lock(&ci->i_ceph_lock); 2578 + 2579 + if ((flags & CHECK_FILELOCK) && 2580 + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { 2581 + dout("try_get_cap_refs %p error filelock\n", inode); 2582 + ret = -EIO; 2583 + goto out_unlock; 2584 + } 2555 2585 2556 2586 /* make sure file is actually open */ 2557 2587 file_wanted = __ceph_caps_file_wanted(ci); ··· 2621 2637 * we can not call down_read() when 2622 2638 * task isn't in TASK_RUNNING state 2623 2639 */ 2624 - if (nonblock) { 2640 + if (flags & NON_BLOCKING) { 2625 2641 ret = -EAGAIN; 2626 2642 goto out_unlock; 2627 2643 } ··· 2715 2731 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2716 2732 } 2717 2733 2718 - int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, 2734 + int ceph_try_get_caps(struct inode *inode, int need, int want, 2719 2735 bool nonblock, int *got) 2720 2736 { 2721 2737 int ret; 2722 2738 2723 2739 BUG_ON(need & ~CEPH_CAP_FILE_RD); 2724 2740 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); 2725 - ret = ceph_pool_perm_check(ci, need); 2741 + ret = ceph_pool_perm_check(inode, need); 2726 2742 if (ret < 0) 2727 2743 return ret; 2728 2744 2729 - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); 2745 + ret = 
try_get_cap_refs(inode, need, want, 0, 2746 + (nonblock ? NON_BLOCKING : 0), got); 2730 2747 return ret == -EAGAIN ? 0 : ret; 2731 2748 } 2732 2749 ··· 2736 2751 * due to a small max_size, make sure we check_max_size (and possibly 2737 2752 * ask the mds) so we don't get hung up indefinitely. 2738 2753 */ 2739 - int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2754 + int ceph_get_caps(struct file *filp, int need, int want, 2740 2755 loff_t endoff, int *got, struct page **pinned_page) 2741 2756 { 2742 - int _got, ret; 2757 + struct ceph_file_info *fi = filp->private_data; 2758 + struct inode *inode = file_inode(filp); 2759 + struct ceph_inode_info *ci = ceph_inode(inode); 2760 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 2761 + int ret, _got, flags; 2743 2762 2744 - ret = ceph_pool_perm_check(ci, need); 2763 + ret = ceph_pool_perm_check(inode, need); 2745 2764 if (ret < 0) 2746 2765 return ret; 2747 2766 2767 + if ((fi->fmode & CEPH_FILE_MODE_WR) && 2768 + fi->filp_gen != READ_ONCE(fsc->filp_gen)) 2769 + return -EBADF; 2770 + 2748 2771 while (true) { 2749 2772 if (endoff > 0) 2750 - check_max_size(&ci->vfs_inode, endoff); 2773 + check_max_size(inode, endoff); 2751 2774 2775 + flags = atomic_read(&fi->num_locks) ? 
CHECK_FILELOCK : 0; 2752 2776 _got = 0; 2753 - ret = try_get_cap_refs(ci, need, want, endoff, 2754 - false, &_got); 2777 + ret = try_get_cap_refs(inode, need, want, endoff, 2778 + flags, &_got); 2755 2779 if (ret == -EAGAIN) 2756 2780 continue; 2757 2781 if (!ret) { 2758 2782 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2759 2783 add_wait_queue(&ci->i_cap_wq, &wait); 2760 2784 2761 - while (!(ret = try_get_cap_refs(ci, need, want, endoff, 2762 - true, &_got))) { 2785 + flags |= NON_BLOCKING; 2786 + while (!(ret = try_get_cap_refs(inode, need, want, 2787 + endoff, flags, &_got))) { 2763 2788 if (signal_pending(current)) { 2764 2789 ret = -ERESTARTSYS; 2765 2790 break; ··· 2781 2786 if (ret == -EAGAIN) 2782 2787 continue; 2783 2788 } 2789 + 2790 + if ((fi->fmode & CEPH_FILE_MODE_WR) && 2791 + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { 2792 + if (ret >= 0 && _got) 2793 + ceph_put_cap_refs(ci, _got); 2794 + return -EBADF; 2795 + } 2796 + 2784 2797 if (ret < 0) { 2785 2798 if (ret == -ESTALE) { 2786 2799 /* session was killed, try renew caps */ 2787 - ret = ceph_renew_caps(&ci->vfs_inode); 2800 + ret = ceph_renew_caps(inode); 2788 2801 if (ret == 0) 2789 2802 continue; 2790 2803 } ··· 2801 2798 2802 2799 if (ci->i_inline_version != CEPH_INLINE_NONE && 2803 2800 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2804 - i_size_read(&ci->vfs_inode) > 0) { 2801 + i_size_read(inode) > 0) { 2805 2802 struct page *page = 2806 - find_get_page(ci->vfs_inode.i_mapping, 0); 2803 + find_get_page(inode->i_mapping, 0); 2807 2804 if (page) { 2808 2805 if (PageUptodate(page)) { 2809 2806 *pinned_page = page; ··· 2822 2819 * getattr request will bring inline data into 2823 2820 * page cache 2824 2821 */ 2825 - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, 2822 + ret = __ceph_do_getattr(inode, NULL, 2826 2823 CEPH_STAT_CAP_INLINE_DATA, 2827 2824 true); 2828 2825 if (ret < 0)
-1
fs/ceph/debugfs.c
··· 294 294 295 295 void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) 296 296 { 297 - return 0; 298 297 } 299 298 300 299 void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
+27 -27
fs/ceph/export.c
··· 35 35 static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, 36 36 struct inode *parent_inode) 37 37 { 38 - const static int snap_handle_length = 38 + static const int snap_handle_length = 39 39 sizeof(struct ceph_nfs_snapfh) >> 2; 40 40 struct ceph_nfs_snapfh *sfh = (void *)rawfh; 41 41 u64 snapid = ceph_snap(inode); ··· 85 85 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, 86 86 struct inode *parent_inode) 87 87 { 88 - const static int handle_length = 88 + static const int handle_length = 89 89 sizeof(struct ceph_nfs_fh) >> 2; 90 - const static int connected_handle_length = 90 + static const int connected_handle_length = 91 91 sizeof(struct ceph_nfs_confh) >> 2; 92 92 int type; 93 93 ··· 458 458 if (err < 0) 459 459 goto out; 460 460 461 - rinfo = &req->r_reply_info; 462 - for (i = 0; i < rinfo->dir_nr; i++) { 463 - rde = rinfo->dir_entries + i; 464 - BUG_ON(!rde->inode.in); 465 - if (ceph_snap(inode) == 466 - le64_to_cpu(rde->inode.in->snapid)) { 467 - memcpy(name, rde->name, rde->name_len); 468 - name[rde->name_len] = '\0'; 469 - err = 0; 470 - goto out; 471 - } 472 - } 461 + rinfo = &req->r_reply_info; 462 + for (i = 0; i < rinfo->dir_nr; i++) { 463 + rde = rinfo->dir_entries + i; 464 + BUG_ON(!rde->inode.in); 465 + if (ceph_snap(inode) == 466 + le64_to_cpu(rde->inode.in->snapid)) { 467 + memcpy(name, rde->name, rde->name_len); 468 + name[rde->name_len] = '\0'; 469 + err = 0; 470 + goto out; 471 + } 472 + } 473 473 474 - if (rinfo->dir_end) 475 - break; 474 + if (rinfo->dir_end) 475 + break; 476 476 477 - BUG_ON(rinfo->dir_nr <= 0); 478 - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); 479 - next_offset += rinfo->dir_nr; 480 - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); 481 - if (!last_name) { 482 - err = -ENOMEM; 483 - goto out; 484 - } 477 + BUG_ON(rinfo->dir_nr <= 0); 478 + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); 479 + next_offset += rinfo->dir_nr; 480 + last_name = 
kstrndup(rde->name, rde->name_len, GFP_KERNEL); 481 + if (!last_name) { 482 + err = -ENOMEM; 483 + goto out; 484 + } 485 485 486 - ceph_mdsc_put_request(req); 487 - req = NULL; 486 + ceph_mdsc_put_request(req); 487 + req = NULL; 488 488 } 489 489 err = -ENOENT; 490 490 out:
+64 -40
fs/ceph/file.c
··· 15 15 #include "super.h" 16 16 #include "mds_client.h" 17 17 #include "cache.h" 18 + #include "io.h" 18 19 19 20 static __le32 ceph_flags_sys2wire(u32 flags) 20 21 { ··· 202 201 static int ceph_init_file_info(struct inode *inode, struct file *file, 203 202 int fmode, bool isdir) 204 203 { 204 + struct ceph_inode_info *ci = ceph_inode(inode); 205 205 struct ceph_file_info *fi; 206 206 207 207 dout("%s %p %p 0%o (%s)\n", __func__, inode, file, ··· 213 211 struct ceph_dir_file_info *dfi = 214 212 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 215 213 if (!dfi) { 216 - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 214 + ceph_put_fmode(ci, fmode); /* clean up */ 217 215 return -ENOMEM; 218 216 } 219 217 ··· 224 222 } else { 225 223 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 226 224 if (!fi) { 227 - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 225 + ceph_put_fmode(ci, fmode); /* clean up */ 228 226 return -ENOMEM; 229 227 } 230 228 ··· 234 232 fi->fmode = fmode; 235 233 spin_lock_init(&fi->rw_contexts_lock); 236 234 INIT_LIST_HEAD(&fi->rw_contexts); 235 + fi->meta_err = errseq_sample(&ci->i_meta_err); 236 + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 237 237 238 238 return 0; 239 239 } ··· 699 695 ceph_release_page_vector(pages, num_pages); 700 696 } 701 697 702 - if (ret <= 0 || off >= i_size || !more) 698 + if (ret < 0) { 699 + if (ret == -EBLACKLISTED) 700 + fsc->blacklisted = true; 701 + break; 702 + } 703 + 704 + if (off >= i_size || !more) 703 705 break; 704 706 } 705 707 ··· 931 921 struct ceph_aio_request *aio_req = NULL; 932 922 int num_pages = 0; 933 923 int flags; 934 - int ret; 924 + int ret = 0; 935 925 struct timespec64 mtime = current_time(inode); 936 926 size_t count = iov_iter_count(iter); 937 927 loff_t pos = iocb->ki_pos; ··· 944 934 dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 945 935 (write ? "write" : "read"), file, pos, (unsigned)count, 946 936 snapc, snapc ? 
snapc->seq : 0); 947 - 948 - ret = filemap_write_and_wait_range(inode->i_mapping, 949 - pos, pos + count - 1); 950 - if (ret < 0) 951 - return ret; 952 937 953 938 if (write) { 954 939 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, ··· 1265 1260 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1266 1261 else 1267 1262 want = CEPH_CAP_FILE_CACHE; 1268 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1263 + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, 1264 + &got, &pinned_page); 1269 1265 if (ret < 0) 1270 1266 return ret; 1271 1267 ··· 1280 1274 1281 1275 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1282 1276 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1277 + ceph_start_io_direct(inode); 1283 1278 ret = ceph_direct_read_write(iocb, to, 1284 1279 NULL, NULL); 1280 + ceph_end_io_direct(inode); 1285 1281 if (ret >= 0 && ret < len) 1286 1282 retry_op = CHECK_EOF; 1287 1283 } else { 1284 + ceph_start_io_read(inode); 1288 1285 ret = ceph_sync_read(iocb, to, &retry_op); 1286 + ceph_end_io_read(inode); 1289 1287 } 1290 1288 } else { 1291 1289 retry_op = READ_INLINE; ··· 1300 1290 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1301 1291 ceph_cap_string(got)); 1302 1292 ceph_add_rw_context(fi, &rw_ctx); 1293 + ceph_start_io_read(inode); 1303 1294 ret = generic_file_read_iter(iocb, to); 1295 + ceph_end_io_read(inode); 1304 1296 ceph_del_rw_context(fi, &rw_ctx); 1305 1297 } 1306 1298 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", ··· 1411 1399 return -ENOMEM; 1412 1400 1413 1401 retry_snap: 1414 - inode_lock(inode); 1402 + if (iocb->ki_flags & IOCB_DIRECT) 1403 + ceph_start_io_direct(inode); 1404 + else 1405 + ceph_start_io_write(inode); 1415 1406 1416 1407 /* We can write back this queue in page reclaim */ 1417 1408 current->backing_dev_info = inode_to_bdi(inode); ··· 1472 1457 else 1473 1458 want = CEPH_CAP_FILE_BUFFER; 1474 1459 got = 0; 1475 - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos 
+ count, 1460 + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, 1476 1461 &got, NULL); 1477 1462 if (err < 0) 1478 1463 goto out; ··· 1485 1470 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1486 1471 struct ceph_snap_context *snapc; 1487 1472 struct iov_iter data; 1488 - inode_unlock(inode); 1489 1473 1490 1474 spin_lock(&ci->i_ceph_lock); 1491 1475 if (__ceph_have_pending_cap_snap(ci)) { ··· 1501 1487 1502 1488 /* we might need to revert back to that point */ 1503 1489 data = *from; 1504 - if (iocb->ki_flags & IOCB_DIRECT) 1490 + if (iocb->ki_flags & IOCB_DIRECT) { 1505 1491 written = ceph_direct_read_write(iocb, &data, snapc, 1506 1492 &prealloc_cf); 1507 - else 1493 + ceph_end_io_direct(inode); 1494 + } else { 1508 1495 written = ceph_sync_write(iocb, &data, pos, snapc); 1496 + ceph_end_io_write(inode); 1497 + } 1509 1498 if (written > 0) 1510 1499 iov_iter_advance(from, written); 1511 1500 ceph_put_snap_context(snapc); ··· 1523 1506 written = generic_perform_write(file, from, pos); 1524 1507 if (likely(written >= 0)) 1525 1508 iocb->ki_pos = pos + written; 1526 - inode_unlock(inode); 1509 + ceph_end_io_write(inode); 1527 1510 } 1528 1511 1529 1512 if (written >= 0) { ··· 1558 1541 } 1559 1542 1560 1543 goto out_unlocked; 1561 - 1562 1544 out: 1563 - inode_unlock(inode); 1545 + if (iocb->ki_flags & IOCB_DIRECT) 1546 + ceph_end_io_direct(inode); 1547 + else 1548 + ceph_end_io_write(inode); 1564 1549 out_unlocked: 1565 1550 ceph_free_cap_flush(prealloc_cf); 1566 1551 current->backing_dev_info = NULL; ··· 1800 1781 else 1801 1782 want = CEPH_CAP_FILE_BUFFER; 1802 1783 1803 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1784 + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1804 1785 if (ret < 0) 1805 1786 goto unlock; 1806 1787 ··· 1829 1810 * src_ci. Two attempts are made to obtain both caps, and an error is return if 1830 1811 * this fails; zero is returned on success. 
1831 1812 */ 1832 - static int get_rd_wr_caps(struct ceph_inode_info *src_ci, 1833 - loff_t src_endoff, int *src_got, 1834 - struct ceph_inode_info *dst_ci, 1813 + static int get_rd_wr_caps(struct file *src_filp, int *src_got, 1814 + struct file *dst_filp, 1835 1815 loff_t dst_endoff, int *dst_got) 1836 1816 { 1837 1817 int ret = 0; 1838 1818 bool retrying = false; 1839 1819 1840 1820 retry_caps: 1841 - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 1821 + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 1842 1822 dst_endoff, dst_got, NULL); 1843 1823 if (ret < 0) 1844 1824 return ret; ··· 1847 1829 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 1848 1830 * retry dance instead to try to get both capabilities. 1849 1831 */ 1850 - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 1832 + ret = ceph_try_get_caps(file_inode(src_filp), 1833 + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 1851 1834 false, src_got); 1852 1835 if (ret <= 0) { 1853 1836 /* Start by dropping dst_ci caps and getting src_ci caps */ 1854 - ceph_put_cap_refs(dst_ci, *dst_got); 1837 + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); 1855 1838 if (retrying) { 1856 1839 if (!ret) 1857 1840 /* ceph_try_get_caps masks EAGAIN */ 1858 1841 ret = -EAGAIN; 1859 1842 return ret; 1860 1843 } 1861 - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, 1862 - CEPH_CAP_FILE_SHARED, src_endoff, 1863 - src_got, NULL); 1844 + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, 1845 + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); 1864 1846 if (ret < 0) 1865 1847 return ret; 1866 1848 /*... 
drop src_ci caps too, and retry */ 1867 - ceph_put_cap_refs(src_ci, *src_got); 1849 + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); 1868 1850 retrying = true; 1869 1851 goto retry_caps; 1870 1852 } ··· 1922 1904 struct ceph_inode_info *src_ci = ceph_inode(src_inode); 1923 1905 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 1924 1906 struct ceph_cap_flush *prealloc_cf; 1907 + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); 1925 1908 struct ceph_object_locator src_oloc, dst_oloc; 1926 1909 struct ceph_object_id src_oid, dst_oid; 1927 1910 loff_t endoff = 0, size; ··· 1932 1913 int src_got = 0, dst_got = 0, err, dirty; 1933 1914 bool do_final_copy = false; 1934 1915 1935 - if (src_inode == dst_inode) 1936 - return -EINVAL; 1937 - if (src_inode->i_sb != dst_inode->i_sb) 1938 - return -EXDEV; 1916 + if (src_inode->i_sb != dst_inode->i_sb) { 1917 + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); 1918 + 1919 + if (ceph_fsid_compare(&src_fsc->client->fsid, 1920 + &dst_fsc->client->fsid)) { 1921 + dout("Copying files across clusters: src: %pU dst: %pU\n", 1922 + &src_fsc->client->fsid, &dst_fsc->client->fsid); 1923 + return -EXDEV; 1924 + } 1925 + } 1939 1926 if (ceph_snap(dst_inode) != CEPH_NOSNAP) 1940 1927 return -EROFS; 1941 1928 ··· 1953 1928 * efficient). 1954 1929 */ 1955 1930 1956 - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) 1931 + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) 1957 1932 return -EOPNOTSUPP; 1958 1933 1959 1934 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || ··· 1985 1960 * clients may have dirty data in their caches. And OSDs know nothing 1986 1961 * about caps, so they can't safely do the remote object copies. 
1987 1962 */ 1988 - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, 1989 - dst_ci, (dst_off + len), &dst_got); 1963 + err = get_rd_wr_caps(src_file, &src_got, 1964 + dst_file, (dst_off + len), &dst_got); 1990 1965 if (err < 0) { 1991 1966 dout("get_rd_wr_caps returned %d\n", err); 1992 1967 ret = -EOPNOTSUPP; ··· 2043 2018 goto out; 2044 2019 } 2045 2020 len -= ret; 2046 - err = get_rd_wr_caps(src_ci, (src_off + len), 2047 - &src_got, dst_ci, 2048 - (dst_off + len), &dst_got); 2021 + err = get_rd_wr_caps(src_file, &src_got, 2022 + dst_file, (dst_off + len), &dst_got); 2049 2023 if (err < 0) 2050 2024 goto out; 2051 2025 err = is_file_size_ok(src_inode, dst_inode, ··· 2068 2044 dst_ci->i_vino.ino, dst_objnum); 2069 2045 /* Do an object remote copy */ 2070 2046 err = ceph_osdc_copy_from( 2071 - &ceph_inode_to_client(src_inode)->client->osdc, 2047 + &src_fsc->client->osdc, 2072 2048 src_ci->i_vino.snap, 0, 2073 2049 &src_oid, &src_oloc, 2074 2050 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+29 -21
fs/ceph/inode.c
··· 515 515 516 516 ceph_fscache_inode_init(ci); 517 517 518 + ci->i_meta_err = 0; 519 + 518 520 return &ci->vfs_inode; 519 521 } 520 522 ··· 803 801 804 802 /* update inode */ 805 803 inode->i_rdev = le32_to_cpu(info->rdev); 806 - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 804 + /* directories have fl_stripe_unit set to zero */ 805 + if (le32_to_cpu(info->layout.fl_stripe_unit)) 806 + inode->i_blkbits = 807 + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 808 + else 809 + inode->i_blkbits = CEPH_BLOCK_SHIFT; 807 810 808 811 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 809 812 ··· 1989 1982 int __ceph_setattr(struct inode *inode, struct iattr *attr) 1990 1983 { 1991 1984 struct ceph_inode_info *ci = ceph_inode(inode); 1992 - const unsigned int ia_valid = attr->ia_valid; 1985 + unsigned int ia_valid = attr->ia_valid; 1993 1986 struct ceph_mds_request *req; 1994 1987 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1995 1988 struct ceph_cap_flush *prealloc_cf; ··· 2094 2087 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2095 2088 } 2096 2089 } 2090 + if (ia_valid & ATTR_SIZE) { 2091 + dout("setattr %p size %lld -> %lld\n", inode, 2092 + inode->i_size, attr->ia_size); 2093 + if ((issued & CEPH_CAP_FILE_EXCL) && 2094 + attr->ia_size > inode->i_size) { 2095 + i_size_write(inode, attr->ia_size); 2096 + inode->i_blocks = calc_inode_blocks(attr->ia_size); 2097 + ci->i_reported_size = attr->ia_size; 2098 + dirtied |= CEPH_CAP_FILE_EXCL; 2099 + ia_valid |= ATTR_MTIME; 2100 + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2101 + attr->ia_size != inode->i_size) { 2102 + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); 2103 + req->r_args.setattr.old_size = 2104 + cpu_to_le64(inode->i_size); 2105 + mask |= CEPH_SETATTR_SIZE; 2106 + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2107 + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2108 + } 2109 + } 2097 2110 if (ia_valid & ATTR_MTIME) { 2098 2111 dout("setattr %p 
mtime %lld.%ld -> %lld.%ld\n", inode, 2099 2112 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, ··· 2133 2106 &attr->ia_mtime); 2134 2107 mask |= CEPH_SETATTR_MTIME; 2135 2108 release |= CEPH_CAP_FILE_SHARED | 2136 - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2137 - } 2138 - } 2139 - if (ia_valid & ATTR_SIZE) { 2140 - dout("setattr %p size %lld -> %lld\n", inode, 2141 - inode->i_size, attr->ia_size); 2142 - if ((issued & CEPH_CAP_FILE_EXCL) && 2143 - attr->ia_size > inode->i_size) { 2144 - i_size_write(inode, attr->ia_size); 2145 - inode->i_blocks = calc_inode_blocks(attr->ia_size); 2146 - ci->i_reported_size = attr->ia_size; 2147 - dirtied |= CEPH_CAP_FILE_EXCL; 2148 - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2149 - attr->ia_size != inode->i_size) { 2150 - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); 2151 - req->r_args.setattr.old_size = 2152 - cpu_to_le64(inode->i_size); 2153 - mask |= CEPH_SETATTR_SIZE; 2154 - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2155 2109 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2156 2110 } 2157 2111 }
+163
fs/ceph/io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2016 Trond Myklebust 4 + * Copyright (c) 2019 Jeff Layton 5 + * 6 + * I/O and data path helper functionality. 7 + * 8 + * Heavily borrowed from equivalent code in fs/nfs/io.c 9 + */ 10 + 11 + #include <linux/ceph/ceph_debug.h> 12 + 13 + #include <linux/types.h> 14 + #include <linux/kernel.h> 15 + #include <linux/rwsem.h> 16 + #include <linux/fs.h> 17 + 18 + #include "super.h" 19 + #include "io.h" 20 + 21 + /* Call with exclusively locked inode->i_rwsem */ 22 + static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) 23 + { 24 + lockdep_assert_held_write(&inode->i_rwsem); 25 + 26 + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { 27 + spin_lock(&ci->i_ceph_lock); 28 + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; 29 + spin_unlock(&ci->i_ceph_lock); 30 + inode_dio_wait(inode); 31 + } 32 + } 33 + 34 + /** 35 + * ceph_start_io_read - declare the file is being used for buffered reads 36 + * @inode: file inode 37 + * 38 + * Declare that a buffered read operation is about to start, and ensure 39 + * that we block all direct I/O. 40 + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, 41 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 42 + * cannot be changed. 43 + * In practice, this means that buffered read operations are allowed to 44 + * execute in parallel, thanks to the shared lock, whereas direct I/O 45 + * operations need to wait to grab an exclusive lock in order to set 46 + * CEPH_I_ODIRECT. 47 + * Note that buffered writes and truncates both take a write lock on 48 + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. 49 + */ 50 + void 51 + ceph_start_io_read(struct inode *inode) 52 + { 53 + struct ceph_inode_info *ci = ceph_inode(inode); 54 + 55 + /* Be an optimist! 
*/ 56 + down_read(&inode->i_rwsem); 57 + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) 58 + return; 59 + up_read(&inode->i_rwsem); 60 + /* Slow path.... */ 61 + down_write(&inode->i_rwsem); 62 + ceph_block_o_direct(ci, inode); 63 + downgrade_write(&inode->i_rwsem); 64 + } 65 + 66 + /** 67 + * ceph_end_io_read - declare that the buffered read operation is done 68 + * @inode: file inode 69 + * 70 + * Declare that a buffered read operation is done, and release the shared 71 + * lock on inode->i_rwsem. 72 + */ 73 + void 74 + ceph_end_io_read(struct inode *inode) 75 + { 76 + up_read(&inode->i_rwsem); 77 + } 78 + 79 + /** 80 + * ceph_start_io_write - declare the file is being used for buffered writes 81 + * @inode: file inode 82 + * 83 + * Declare that a buffered write operation is about to start, and ensure 84 + * that we block all direct I/O. 85 + */ 86 + void 87 + ceph_start_io_write(struct inode *inode) 88 + { 89 + down_write(&inode->i_rwsem); 90 + ceph_block_o_direct(ceph_inode(inode), inode); 91 + } 92 + 93 + /** 94 + * ceph_end_io_write - declare that the buffered write operation is done 95 + * @inode: file inode 96 + * 97 + * Declare that a buffered write operation is done, and release the 98 + * lock on inode->i_rwsem. 99 + */ 100 + void 101 + ceph_end_io_write(struct inode *inode) 102 + { 103 + up_write(&inode->i_rwsem); 104 + } 105 + 106 + /* Call with exclusively locked inode->i_rwsem */ 107 + static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) 108 + { 109 + lockdep_assert_held_write(&inode->i_rwsem); 110 + 111 + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { 112 + spin_lock(&ci->i_ceph_lock); 113 + ci->i_ceph_flags |= CEPH_I_ODIRECT; 114 + spin_unlock(&ci->i_ceph_lock); 115 + /* FIXME: unmap_mapping_range? 
*/ 116 + filemap_write_and_wait(inode->i_mapping); 117 + } 118 + } 119 + 120 + /** 121 + * ceph_end_io_direct - declare the file is being used for direct i/o 122 + * @inode: file inode 123 + * 124 + * Declare that a direct I/O operation is about to start, and ensure 125 + * that we block all buffered I/O. 126 + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, 127 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 128 + * cannot be changed. 129 + * In practice, this means that direct I/O operations are allowed to 130 + * execute in parallel, thanks to the shared lock, whereas buffered I/O 131 + * operations need to wait to grab an exclusive lock in order to clear 132 + * CEPH_I_ODIRECT. 133 + * Note that buffered writes and truncates both take a write lock on 134 + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. 135 + */ 136 + void 137 + ceph_start_io_direct(struct inode *inode) 138 + { 139 + struct ceph_inode_info *ci = ceph_inode(inode); 140 + 141 + /* Be an optimist! */ 142 + down_read(&inode->i_rwsem); 143 + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) 144 + return; 145 + up_read(&inode->i_rwsem); 146 + /* Slow path.... */ 147 + down_write(&inode->i_rwsem); 148 + ceph_block_buffered(ci, inode); 149 + downgrade_write(&inode->i_rwsem); 150 + } 151 + 152 + /** 153 + * ceph_end_io_direct - declare that the direct i/o operation is done 154 + * @inode: file inode 155 + * 156 + * Declare that a direct I/O operation is done, and release the shared 157 + * lock on inode->i_rwsem. 158 + */ 159 + void 160 + ceph_end_io_direct(struct inode *inode) 161 + { 162 + up_read(&inode->i_rwsem); 163 + }
+12
fs/ceph/io.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _FS_CEPH_IO_H 3 + #define _FS_CEPH_IO_H 4 + 5 + void ceph_start_io_read(struct inode *inode); 6 + void ceph_end_io_read(struct inode *inode); 7 + void ceph_start_io_write(struct inode *inode); 8 + void ceph_end_io_write(struct inode *inode); 9 + void ceph_start_io_direct(struct inode *inode); 10 + void ceph_end_io_direct(struct inode *inode); 11 + 12 + #endif /* FS_CEPH_IO_H */
+6 -2
fs/ceph/locks.c
··· 32 32 33 33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 34 34 { 35 - struct inode *inode = file_inode(src->fl_file); 35 + struct ceph_file_info *fi = dst->fl_file->private_data; 36 + struct inode *inode = file_inode(dst->fl_file); 36 37 atomic_inc(&ceph_inode(inode)->i_filelock_ref); 38 + atomic_inc(&fi->num_locks); 37 39 } 38 40 39 41 static void ceph_fl_release_lock(struct file_lock *fl) 40 42 { 43 + struct ceph_file_info *fi = fl->fl_file->private_data; 41 44 struct inode *inode = file_inode(fl->fl_file); 42 45 struct ceph_inode_info *ci = ceph_inode(inode); 46 + atomic_dec(&fi->num_locks); 43 47 if (atomic_dec_and_test(&ci->i_filelock_ref)) { 44 48 /* clear error when all locks are released */ 45 49 spin_lock(&ci->i_ceph_lock); ··· 77 73 * window. Caller function will decrease the counter. 78 74 */ 79 75 fl->fl_ops = &ceph_fl_lock_ops; 80 - atomic_inc(&ceph_inode(inode)->i_filelock_ref); 76 + fl->fl_ops->fl_copy_lock(fl, NULL); 81 77 } 82 78 83 79 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
+82 -28
fs/ceph/mds_client.c
··· 639 639 s->s_renew_seq = 0; 640 640 INIT_LIST_HEAD(&s->s_caps); 641 641 s->s_nr_caps = 0; 642 - s->s_trim_caps = 0; 643 642 refcount_set(&s->s_ref, 1); 644 643 INIT_LIST_HEAD(&s->s_waiting); 645 644 INIT_LIST_HEAD(&s->s_unsafe); ··· 1269 1270 { 1270 1271 struct ceph_mds_request *req; 1271 1272 struct rb_node *p; 1273 + struct ceph_inode_info *ci; 1272 1274 1273 1275 dout("cleanup_session_requests mds%d\n", session->s_mds); 1274 1276 mutex_lock(&mdsc->mutex); ··· 1278 1278 struct ceph_mds_request, r_unsafe_item); 1279 1279 pr_warn_ratelimited(" dropping unsafe request %llu\n", 1280 1280 req->r_tid); 1281 + if (req->r_target_inode) { 1282 + /* dropping unsafe change of inode's attributes */ 1283 + ci = ceph_inode(req->r_target_inode); 1284 + errseq_set(&ci->i_meta_err, -EIO); 1285 + } 1286 + if (req->r_unsafe_dir) { 1287 + /* dropping unsafe directory operation */ 1288 + ci = ceph_inode(req->r_unsafe_dir); 1289 + errseq_set(&ci->i_meta_err, -EIO); 1290 + } 1281 1291 __unregister_request(mdsc, req); 1282 1292 } 1283 1293 /* zero r_attempts, so kick_requests() will re-send requests */ ··· 1380 1370 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; 1381 1371 struct ceph_inode_info *ci = ceph_inode(inode); 1382 1372 LIST_HEAD(to_remove); 1383 - bool drop = false; 1373 + bool dirty_dropped = false; 1384 1374 bool invalidate = false; 1385 1375 1386 1376 dout("removing cap %p, ci is %p, inode is %p\n", ··· 1393 1383 struct ceph_cap_flush *cf; 1394 1384 struct ceph_mds_client *mdsc = fsc->mdsc; 1395 1385 1396 - if (ci->i_wrbuffer_ref > 0 && 1397 - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) 1398 - invalidate = true; 1386 + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { 1387 + if (inode->i_data.nrpages > 0) 1388 + invalidate = true; 1389 + if (ci->i_wrbuffer_ref > 0) 1390 + mapping_set_error(&inode->i_data, -EIO); 1391 + } 1399 1392 1400 1393 while (!list_empty(&ci->i_cap_flush_list)) { 1401 1394 cf = list_first_entry(&ci->i_cap_flush_list, ··· 
1418 1405 inode, ceph_ino(inode)); 1419 1406 ci->i_dirty_caps = 0; 1420 1407 list_del_init(&ci->i_dirty_item); 1421 - drop = true; 1408 + dirty_dropped = true; 1422 1409 } 1423 1410 if (!list_empty(&ci->i_flushing_item)) { 1424 1411 pr_warn_ratelimited( ··· 1428 1415 ci->i_flushing_caps = 0; 1429 1416 list_del_init(&ci->i_flushing_item); 1430 1417 mdsc->num_cap_flushing--; 1431 - drop = true; 1418 + dirty_dropped = true; 1432 1419 } 1433 1420 spin_unlock(&mdsc->cap_dirty_lock); 1421 + 1422 + if (dirty_dropped) { 1423 + errseq_set(&ci->i_meta_err, -EIO); 1424 + 1425 + if (ci->i_wrbuffer_ref_head == 0 && 1426 + ci->i_wr_ref == 0 && 1427 + ci->i_dirty_caps == 0 && 1428 + ci->i_flushing_caps == 0) { 1429 + ceph_put_snap_context(ci->i_head_snapc); 1430 + ci->i_head_snapc = NULL; 1431 + } 1432 + } 1434 1433 1435 1434 if (atomic_read(&ci->i_filelock_ref) > 0) { 1436 1435 /* make further file lock syscall return -EIO */ ··· 1455 1430 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); 1456 1431 ci->i_prealloc_cap_flush = NULL; 1457 1432 } 1458 - 1459 - if (drop && 1460 - ci->i_wrbuffer_ref_head == 0 && 1461 - ci->i_wr_ref == 0 && 1462 - ci->i_dirty_caps == 0 && 1463 - ci->i_flushing_caps == 0) { 1464 - ceph_put_snap_context(ci->i_head_snapc); 1465 - ci->i_head_snapc = NULL; 1466 - } 1467 1433 } 1468 1434 spin_unlock(&ci->i_ceph_lock); 1469 1435 while (!list_empty(&to_remove)) { ··· 1468 1452 wake_up_all(&ci->i_cap_wq); 1469 1453 if (invalidate) 1470 1454 ceph_queue_invalidate(inode); 1471 - if (drop) 1455 + if (dirty_dropped) 1472 1456 iput(inode); 1473 1457 return 0; 1474 1458 } ··· 1721 1705 */ 1722 1706 static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) 1723 1707 { 1724 - struct ceph_mds_session *session = arg; 1708 + int *remaining = arg; 1725 1709 struct ceph_inode_info *ci = ceph_inode(inode); 1726 1710 int used, wanted, oissued, mine; 1727 1711 1728 - if (session->s_trim_caps <= 0) 1712 + if (*remaining <= 0) 1729 1713 return -1; 1730 
1714 1731 1715 spin_lock(&ci->i_ceph_lock); ··· 1762 1746 if (oissued) { 1763 1747 /* we aren't the only cap.. just remove us */ 1764 1748 __ceph_remove_cap(cap, true); 1765 - session->s_trim_caps--; 1749 + (*remaining)--; 1766 1750 } else { 1767 1751 struct dentry *dentry; 1768 1752 /* try dropping referring dentries */ ··· 1774 1758 d_prune_aliases(inode); 1775 1759 count = atomic_read(&inode->i_count); 1776 1760 if (count == 1) 1777 - session->s_trim_caps--; 1761 + (*remaining)--; 1778 1762 dout("trim_caps_cb %p cap %p pruned, count now %d\n", 1779 1763 inode, cap, count); 1780 1764 } else { ··· 1800 1784 dout("trim_caps mds%d start: %d / %d, trim %d\n", 1801 1785 session->s_mds, session->s_nr_caps, max_caps, trim_caps); 1802 1786 if (trim_caps > 0) { 1803 - session->s_trim_caps = trim_caps; 1804 - ceph_iterate_session_caps(session, trim_caps_cb, session); 1787 + int remaining = trim_caps; 1788 + 1789 + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); 1805 1790 dout("trim_caps mds%d done: %d / %d, trimmed %d\n", 1806 1791 session->s_mds, session->s_nr_caps, max_caps, 1807 - trim_caps - session->s_trim_caps); 1808 - session->s_trim_caps = 0; 1792 + trim_caps - remaining); 1809 1793 } 1810 1794 1811 1795 ceph_flush_cap_releases(mdsc, session); ··· 3031 3015 pr_err("mdsc_handle_forward decode error err=%d\n", err); 3032 3016 } 3033 3017 3034 - static int __decode_and_drop_session_metadata(void **p, void *end) 3018 + static int __decode_session_metadata(void **p, void *end, 3019 + bool *blacklisted) 3035 3020 { 3036 3021 /* map<string,string> */ 3037 3022 u32 n; 3023 + bool err_str; 3038 3024 ceph_decode_32_safe(p, end, n, bad); 3039 3025 while (n-- > 0) { 3040 3026 u32 len; 3041 3027 ceph_decode_32_safe(p, end, len, bad); 3042 3028 ceph_decode_need(p, end, len, bad); 3029 + err_str = !strncmp(*p, "error_string", len); 3043 3030 *p += len; 3044 3031 ceph_decode_32_safe(p, end, len, bad); 3045 3032 ceph_decode_need(p, end, len, bad); 3033 + if (err_str 
&& strnstr(*p, "blacklisted", len)) 3034 + *blacklisted = true; 3046 3035 *p += len; 3047 3036 } 3048 3037 return 0; ··· 3071 3050 u64 seq; 3072 3051 unsigned long features = 0; 3073 3052 int wake = 0; 3053 + bool blacklisted = false; 3074 3054 3075 3055 /* decode */ 3076 3056 ceph_decode_need(&p, end, sizeof(*h), bad); ··· 3084 3062 if (msg_version >= 3) { 3085 3063 u32 len; 3086 3064 /* version >= 2, metadata */ 3087 - if (__decode_and_drop_session_metadata(&p, end) < 0) 3065 + if (__decode_session_metadata(&p, end, &blacklisted) < 0) 3088 3066 goto bad; 3089 3067 /* version >= 3, feature bits */ 3090 3068 ceph_decode_32_safe(&p, end, len, bad); ··· 3171 3149 session->s_state = CEPH_MDS_SESSION_REJECTED; 3172 3150 cleanup_session_requests(mdsc, session); 3173 3151 remove_session_caps(session); 3152 + if (blacklisted) 3153 + mdsc->fsc->blacklisted = true; 3174 3154 wake = 2; /* for good measure */ 3175 3155 break; 3176 3156 ··· 4022 3998 mutex_unlock(&mdsc->mutex); 4023 3999 } 4024 4000 4001 + static void maybe_recover_session(struct ceph_mds_client *mdsc) 4002 + { 4003 + struct ceph_fs_client *fsc = mdsc->fsc; 4025 4004 4005 + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) 4006 + return; 4007 + 4008 + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) 4009 + return; 4010 + 4011 + if (!READ_ONCE(fsc->blacklisted)) 4012 + return; 4013 + 4014 + if (fsc->last_auto_reconnect && 4015 + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) 4016 + return; 4017 + 4018 + pr_info("auto reconnect after blacklisted\n"); 4019 + fsc->last_auto_reconnect = jiffies; 4020 + ceph_force_reconnect(fsc->sb); 4021 + } 4026 4022 4027 4023 /* 4028 4024 * delayed work -- periodically trim expired leases, renew caps with mds ··· 4088 4044 pr_info("mds%d hung\n", s->s_mds); 4089 4045 } 4090 4046 } 4091 - if (s->s_state < CEPH_MDS_SESSION_OPEN) { 4047 + if (s->s_state == CEPH_MDS_SESSION_NEW || 4048 + s->s_state == CEPH_MDS_SESSION_RESTARTING || 4049 + s->s_state == 
CEPH_MDS_SESSION_REJECTED) { 4092 4050 /* this mds is failed or recovering, just wait */ 4093 4051 ceph_put_mds_session(s); 4094 4052 continue; ··· 4117 4071 ceph_queue_cap_reclaim_work(mdsc); 4118 4072 4119 4073 ceph_trim_snapid_map(mdsc); 4074 + 4075 + maybe_recover_session(mdsc); 4120 4076 4121 4077 schedule_delayed(mdsc); 4122 4078 } ··· 4403 4355 session = __ceph_lookup_mds_session(mdsc, mds); 4404 4356 if (!session) 4405 4357 continue; 4358 + 4359 + if (session->s_state == CEPH_MDS_SESSION_REJECTED) 4360 + __unregister_session(mdsc, session); 4361 + __wake_requests(mdsc, &session->s_waiting); 4406 4362 mutex_unlock(&mdsc->mutex); 4363 + 4407 4364 mutex_lock(&session->s_mutex); 4408 4365 __close_session(mdsc, session); 4409 4366 if (session->s_state == CEPH_MDS_SESSION_CLOSING) { ··· 4417 4364 } 4418 4365 mutex_unlock(&session->s_mutex); 4419 4366 ceph_put_mds_session(session); 4367 + 4420 4368 mutex_lock(&mdsc->mutex); 4421 4369 kick_requests(mdsc, mds); 4422 4370 }
+4 -4
fs/ceph/mds_client.h
··· 148 148 CEPH_MDS_SESSION_OPENING = 2, 149 149 CEPH_MDS_SESSION_OPEN = 3, 150 150 CEPH_MDS_SESSION_HUNG = 4, 151 - CEPH_MDS_SESSION_CLOSING = 5, 152 - CEPH_MDS_SESSION_RESTARTING = 6, 153 - CEPH_MDS_SESSION_RECONNECTING = 7, 151 + CEPH_MDS_SESSION_RESTARTING = 5, 152 + CEPH_MDS_SESSION_RECONNECTING = 6, 153 + CEPH_MDS_SESSION_CLOSING = 7, 154 154 CEPH_MDS_SESSION_REJECTED = 8, 155 155 }; 156 156 ··· 176 176 spinlock_t s_cap_lock; 177 177 struct list_head s_caps; /* all caps issued by this session */ 178 178 struct ceph_cap *s_cap_iterator; 179 - int s_nr_caps, s_trim_caps; 179 + int s_nr_caps; 180 180 int s_num_cap_releases; 181 181 int s_cap_reconnect; 182 182 int s_readonly;
+47 -5
fs/ceph/super.c
··· 143 143 Opt_snapdirname, 144 144 Opt_mds_namespace, 145 145 Opt_fscache_uniq, 146 + Opt_recover_session, 146 147 Opt_last_string, 147 148 /* string args above */ 148 149 Opt_dirstat, ··· 185 184 /* int args above */ 186 185 {Opt_snapdirname, "snapdirname=%s"}, 187 186 {Opt_mds_namespace, "mds_namespace=%s"}, 187 + {Opt_recover_session, "recover_session=%s"}, 188 188 {Opt_fscache_uniq, "fsc=%s"}, 189 189 /* string args above */ 190 190 {Opt_dirstat, "dirstat"}, ··· 255 253 GFP_KERNEL); 256 254 if (!fsopt->mds_namespace) 257 255 return -ENOMEM; 256 + break; 257 + case Opt_recover_session: 258 + if (!strncmp(argstr[0].from, "no", 259 + argstr[0].to - argstr[0].from)) { 260 + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; 261 + } else if (!strncmp(argstr[0].from, "clean", 262 + argstr[0].to - argstr[0].from)) { 263 + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; 264 + } else { 265 + return -EINVAL; 266 + } 258 267 break; 259 268 case Opt_fscache_uniq: 260 269 kfree(fsopt->fscache_uniq); ··· 589 576 590 577 if (fsopt->mds_namespace) 591 578 seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 579 + 580 + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) 581 + seq_show_option(m, "recover_session", "clean"); 582 + 592 583 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) 593 584 seq_printf(m, ",wsize=%d", fsopt->wsize); 594 585 if (fsopt->rsize != CEPH_MAX_READ_SIZE) ··· 681 664 682 665 fsc->sb = NULL; 683 666 fsc->mount_state = CEPH_MOUNT_MOUNTING; 667 + fsc->filp_gen = 1; 684 668 685 669 atomic_long_set(&fsc->writeback_count, 0); 686 670 ··· 731 713 { 732 714 dout("destroy_fs_client %p\n", fsc); 733 715 716 + ceph_mdsc_destroy(fsc); 734 717 destroy_workqueue(fsc->inode_wq); 735 718 destroy_workqueue(fsc->cap_wq); 736 719 ··· 848 829 fsc->mount_state = CEPH_MOUNT_SHUTDOWN; 849 830 ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); 850 831 ceph_mdsc_force_umount(fsc->mdsc); 851 - return; 832 + fsc->filp_gen++; // invalidate open files 852 833 } 853 834 854 835 static int 
ceph_remount(struct super_block *sb, int *flags, char *data) ··· 1108 1089 } 1109 1090 1110 1091 if (ceph_sb_to_client(sb) != fsc) { 1111 - ceph_mdsc_destroy(fsc); 1112 1092 destroy_fs_client(fsc); 1113 1093 fsc = ceph_sb_to_client(sb); 1114 1094 dout("get_sb got existing client %p\n", fsc); ··· 1133 1115 goto out_final; 1134 1116 1135 1117 out: 1136 - ceph_mdsc_destroy(fsc); 1137 1118 destroy_fs_client(fsc); 1138 1119 out_final: 1139 1120 dout("ceph_mount fail %ld\n", PTR_ERR(res)); ··· 1156 1139 1157 1140 ceph_fscache_unregister_fs(fsc); 1158 1141 1159 - ceph_mdsc_destroy(fsc); 1160 - 1161 1142 destroy_fs_client(fsc); 1162 1143 free_anon_bdev(dev); 1163 1144 } ··· 1168 1153 .fs_flags = FS_RENAME_DOES_D_MOVE, 1169 1154 }; 1170 1155 MODULE_ALIAS_FS("ceph"); 1156 + 1157 + int ceph_force_reconnect(struct super_block *sb) 1158 + { 1159 + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 1160 + int err = 0; 1161 + 1162 + ceph_umount_begin(sb); 1163 + 1164 + /* Make sure all page caches get invalidated. 1165 + * see remove_session_caps_cb() */ 1166 + flush_workqueue(fsc->inode_wq); 1167 + 1168 + /* In case that we were blacklisted. This also reset 1169 + * all mon/osd connections */ 1170 + ceph_reset_client_addr(fsc->client); 1171 + 1172 + ceph_osdc_clear_abort_err(&fsc->client->osdc); 1173 + 1174 + fsc->blacklisted = false; 1175 + fsc->mount_state = CEPH_MOUNT_MOUNTED; 1176 + 1177 + if (sb->s_root) { 1178 + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, 1179 + CEPH_STAT_CAP_INODE, true); 1180 + } 1181 + return err; 1182 + } 1171 1183 1172 1184 static int __init init_ceph(void) 1173 1185 {
+32 -17
fs/ceph/super.h
··· 16 16 #include <linux/slab.h> 17 17 #include <linux/posix_acl.h> 18 18 #include <linux/refcount.h> 19 + #include <linux/security.h> 19 20 20 21 #include <linux/ceph/libceph.h> 21 22 ··· 32 31 #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ 33 32 #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 34 33 34 + #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ 35 35 #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 36 36 #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 37 37 #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ ··· 103 101 struct ceph_client *client; 104 102 105 103 unsigned long mount_state; 104 + 105 + unsigned long last_auto_reconnect; 106 + bool blacklisted; 107 + 108 + u32 filp_gen; 106 109 loff_t max_file_size; 107 110 108 111 struct ceph_mds_client *mdsc; ··· 402 395 struct fscache_cookie *fscache; 403 396 u32 i_fscache_gen; 404 397 #endif 398 + errseq_t i_meta_err; 399 + 405 400 struct inode vfs_inode; /* at end */ 406 401 }; 407 402 ··· 508 499 #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ 509 500 #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ 510 501 #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ 511 - #define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ 512 - #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 513 - #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 514 - #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 515 - #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 516 - #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ 517 - #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ 518 - #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ 519 - #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ 520 - #define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ 521 - 502 + #define 
CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ 503 + #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ 504 + #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ 505 + #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ 506 + #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ 507 + #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ 508 + #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ 509 + #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ 510 + #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ 511 + #define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ 522 512 523 513 /* 524 514 * Masks of ceph inode work. ··· 711 703 712 704 spinlock_t rw_contexts_lock; 713 705 struct list_head rw_contexts; 706 + 707 + errseq_t meta_err; 708 + u32 filp_gen; 709 + atomic_t num_locks; 714 710 }; 715 711 716 712 struct ceph_dir_file_info { ··· 854 842 } 855 843 856 844 857 - 845 + /* super.c */ 846 + extern int ceph_force_reconnect(struct super_block *sb); 858 847 /* snap.c */ 859 848 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 860 849 u64 ino); ··· 972 959 #ifdef CONFIG_CEPH_FS_SECURITY_LABEL 973 960 extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 974 961 struct ceph_acl_sec_ctx *ctx); 975 - extern void ceph_security_invalidate_secctx(struct inode *inode); 962 + static inline void ceph_security_invalidate_secctx(struct inode *inode) 963 + { 964 + security_inode_invalidate_secctx(inode); 965 + } 976 966 #else 977 967 static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 978 968 struct ceph_acl_sec_ctx *ctx) ··· 1055 1039 struct ceph_mds_session *session); 1056 1040 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, 1057 1041 int mds); 1058 - extern int ceph_get_cap_mds(struct inode *inode); 1059 1042 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, 
int caps); 1060 1043 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 1061 1044 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ··· 1073 1058 struct inode *dir, 1074 1059 int mds, int drop, int unless); 1075 1060 1076 - extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 1061 + extern int ceph_get_caps(struct file *filp, int need, int want, 1077 1062 loff_t endoff, int *got, struct page **pinned_page); 1078 - extern int ceph_try_get_caps(struct ceph_inode_info *ci, 1063 + extern int ceph_try_get_caps(struct inode *inode, 1079 1064 int need, int want, bool nonblock, int *got); 1080 1065 1081 1066 /* for counting open files by mode */ ··· 1086 1071 extern const struct address_space_operations ceph_aops; 1087 1072 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); 1088 1073 extern int ceph_uninline_data(struct file *filp, struct page *locked_page); 1089 - extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); 1074 + extern int ceph_pool_perm_check(struct inode *inode, int need); 1090 1075 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1091 1076 1092 1077 /* file.c */
+6 -70
fs/ceph/xattr.c
··· 20 20 21 21 static bool ceph_is_valid_xattr(const char *name) 22 22 { 23 - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 23 + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || 24 + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 24 25 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 25 26 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 26 27 } ··· 893 892 memcpy(value, xattr->val, xattr->val_len); 894 893 895 894 if (current->journal_info && 896 - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 895 + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && 896 + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) 897 897 ci->i_ceph_flags |= CEPH_I_SEC_INITED; 898 898 out: 899 899 spin_unlock(&ci->i_ceph_lock); ··· 905 903 { 906 904 struct inode *inode = d_inode(dentry); 907 905 struct ceph_inode_info *ci = ceph_inode(inode); 908 - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); 909 906 bool len_only = (size == 0); 910 907 u32 namelen; 911 908 int err; 912 - int i; 913 909 914 910 spin_lock(&ci->i_ceph_lock); 915 911 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, ··· 935 935 } 936 936 names = __copy_xattr_names(ci, names); 937 937 size -= namelen; 938 - } 939 - 940 - 941 - /* virtual xattr names, too */ 942 - if (vxattrs) { 943 - for (i = 0; vxattrs[i].name; i++) { 944 - size_t this_len; 945 - 946 - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) 947 - continue; 948 - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) 949 - continue; 950 - 951 - this_len = strlen(vxattrs[i].name) + 1; 952 - namelen += this_len; 953 - if (len_only) 954 - continue; 955 - 956 - if (this_len > size) { 957 - err = -ERANGE; 958 - goto out; 959 - } 960 - 961 - memcpy(names, vxattrs[i].name, this_len); 962 - names += this_len; 963 - size -= this_len; 964 - } 965 938 } 966 939 err = namelen; 967 940 out: ··· 1266 1293 ceph_pagelist_release(pagelist); 1267 
1294 return err; 1268 1295 } 1269 - 1270 - void ceph_security_invalidate_secctx(struct inode *inode) 1271 - { 1272 - security_inode_invalidate_secctx(inode); 1273 - } 1274 - 1275 - static int ceph_xattr_set_security_label(const struct xattr_handler *handler, 1276 - struct dentry *unused, struct inode *inode, 1277 - const char *key, const void *buf, 1278 - size_t buflen, int flags) 1279 - { 1280 - if (security_ismaclabel(key)) { 1281 - const char *name = xattr_full_name(handler, key); 1282 - return __ceph_setxattr(inode, name, buf, buflen, flags); 1283 - } 1284 - return -EOPNOTSUPP; 1285 - } 1286 - 1287 - static int ceph_xattr_get_security_label(const struct xattr_handler *handler, 1288 - struct dentry *unused, struct inode *inode, 1289 - const char *key, void *buf, size_t buflen) 1290 - { 1291 - if (security_ismaclabel(key)) { 1292 - const char *name = xattr_full_name(handler, key); 1293 - return __ceph_getxattr(inode, name, buf, buflen); 1294 - } 1295 - return -EOPNOTSUPP; 1296 - } 1297 - 1298 - static const struct xattr_handler ceph_security_label_handler = { 1299 - .prefix = XATTR_SECURITY_PREFIX, 1300 - .get = ceph_xattr_get_security_label, 1301 - .set = ceph_xattr_set_security_label, 1302 - }; 1303 - #endif 1304 - #endif 1296 + #endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ 1297 + #endif /* CONFIG_SECURITY */ 1305 1298 1306 1299 void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) 1307 1300 { ··· 1290 1351 #ifdef CONFIG_CEPH_FS_POSIX_ACL 1291 1352 &posix_acl_access_xattr_handler, 1292 1353 &posix_acl_default_xattr_handler, 1293 - #endif 1294 - #ifdef CONFIG_CEPH_FS_SECURITY_LABEL 1295 - &ceph_security_label_handler, 1296 1354 #endif 1297 1355 &ceph_other_xattr_handler, 1298 1356 NULL,
+1
include/linux/ceph/libceph.h
··· 293 293 struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); 294 294 u64 ceph_client_gid(struct ceph_client *client); 295 295 extern void ceph_destroy_client(struct ceph_client *client); 296 + extern void ceph_reset_client_addr(struct ceph_client *client); 296 297 extern int __ceph_open_session(struct ceph_client *client, 297 298 unsigned long started); 298 299 extern int ceph_open_session(struct ceph_client *client);
+1
include/linux/ceph/messenger.h
··· 337 337 extern void ceph_messenger_init(struct ceph_messenger *msgr, 338 338 struct ceph_entity_addr *myaddr); 339 339 extern void ceph_messenger_fini(struct ceph_messenger *msgr); 340 + extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); 340 341 341 342 extern void ceph_con_init(struct ceph_connection *con, void *private, 342 343 const struct ceph_connection_operations *ops,
+1
include/linux/ceph/mon_client.h
··· 109 109 110 110 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); 111 111 extern void ceph_monc_stop(struct ceph_mon_client *monc); 112 + extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); 112 113 113 114 enum { 114 115 CEPH_SUB_MONMAP = 0,
+2
include/linux/ceph/osd_client.h
··· 381 381 extern int ceph_osdc_init(struct ceph_osd_client *osdc, 382 382 struct ceph_client *client); 383 383 extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 384 + extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); 384 385 385 386 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 386 387 struct ceph_msg *msg); ··· 389 388 struct ceph_msg *msg); 390 389 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); 391 390 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); 391 + void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); 392 392 393 393 #define osd_req_op_data(oreq, whch, typ, fld) \ 394 394 ({ \
+31 -6
net/ceph/ceph_common.c
··· 13 13 #include <linux/nsproxy.h> 14 14 #include <linux/parser.h> 15 15 #include <linux/sched.h> 16 + #include <linux/sched/mm.h> 16 17 #include <linux/seq_file.h> 17 18 #include <linux/slab.h> 18 19 #include <linux/statfs.h> ··· 186 185 } 187 186 EXPORT_SYMBOL(ceph_compare_options); 188 187 188 + /* 189 + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are 190 + * compatible with (a superset of) GFP_KERNEL. This is because while the 191 + * actual pages are allocated with the specified flags, the page table pages 192 + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take 193 + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). 194 + * 195 + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. 196 + */ 189 197 void *ceph_kvmalloc(size_t size, gfp_t flags) 190 198 { 191 - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 192 - void *ptr = kmalloc(size, flags | __GFP_NOWARN); 193 - if (ptr) 194 - return ptr; 199 + void *p; 200 + 201 + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { 202 + p = kvmalloc(size, flags); 203 + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { 204 + unsigned int nofs_flag = memalloc_nofs_save(); 205 + p = kvmalloc(size, GFP_KERNEL); 206 + memalloc_nofs_restore(nofs_flag); 207 + } else { 208 + unsigned int noio_flag = memalloc_noio_save(); 209 + p = kvmalloc(size, GFP_KERNEL); 210 + memalloc_noio_restore(noio_flag); 195 211 } 196 212 197 - return __vmalloc(size, flags, PAGE_KERNEL); 213 + return p; 198 214 } 199 - 200 215 201 216 static int parse_fsid(const char *str, struct ceph_fsid *fsid) 202 217 { ··· 710 693 dout("destroy_client %p done\n", client); 711 694 } 712 695 EXPORT_SYMBOL(ceph_destroy_client); 696 + 697 + void ceph_reset_client_addr(struct ceph_client *client) 698 + { 699 + ceph_messenger_reset_nonce(&client->msgr); 700 + ceph_monc_reopen_session(&client->monc); 701 + ceph_osdc_reopen_osds(&client->osdc); 702 + } 703 + 
EXPORT_SYMBOL(ceph_reset_client_addr); 713 704 714 705 /* 715 706 * true if we have the mon map (and have thus joined the cluster)
+6
net/ceph/messenger.c
··· 3031 3031 } 3032 3032 3033 3033 3034 + void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) 3035 + { 3036 + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; 3037 + msgr->inst.addr.nonce = cpu_to_le32(nonce); 3038 + encode_my_addr(msgr); 3039 + } 3034 3040 3035 3041 /* 3036 3042 * initialize a new messenger instance
+7
net/ceph/mon_client.c
··· 213 213 __open_session(monc); 214 214 } 215 215 216 + void ceph_monc_reopen_session(struct ceph_mon_client *monc) 217 + { 218 + mutex_lock(&monc->mutex); 219 + reopen_session(monc); 220 + mutex_unlock(&monc->mutex); 221 + } 222 + 216 223 static void un_backoff(struct ceph_mon_client *monc) 217 224 { 218 225 monc->hunt_mult /= 2; /* reduce by 50% */
+53 -12
net/ceph/osd_client.c
··· 841 841 struct ceph_pagelist *pagelist; 842 842 size_t payload_len = 0; 843 843 size_t size; 844 + int ret; 844 845 845 846 op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); 846 847 ··· 853 852 size = strlen(class); 854 853 BUG_ON(size > (size_t) U8_MAX); 855 854 op->cls.class_len = size; 856 - ceph_pagelist_append(pagelist, class, size); 855 + ret = ceph_pagelist_append(pagelist, class, size); 856 + if (ret) 857 + goto err_pagelist_free; 857 858 payload_len += size; 858 859 859 860 op->cls.method_name = method; 860 861 size = strlen(method); 861 862 BUG_ON(size > (size_t) U8_MAX); 862 863 op->cls.method_len = size; 863 - ceph_pagelist_append(pagelist, method, size); 864 + ret = ceph_pagelist_append(pagelist, method, size); 865 + if (ret) 866 + goto err_pagelist_free; 864 867 payload_len += size; 865 868 866 869 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 867 - 868 870 op->indata_len = payload_len; 869 871 return 0; 872 + 873 + err_pagelist_free: 874 + ceph_pagelist_release(pagelist); 875 + return ret; 870 876 } 871 877 EXPORT_SYMBOL(osd_req_op_cls_init); 872 878 ··· 885 877 opcode, 0); 886 878 struct ceph_pagelist *pagelist; 887 879 size_t payload_len; 880 + int ret; 888 881 889 882 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); 890 883 ··· 895 886 896 887 payload_len = strlen(name); 897 888 op->xattr.name_len = payload_len; 898 - ceph_pagelist_append(pagelist, name, payload_len); 889 + ret = ceph_pagelist_append(pagelist, name, payload_len); 890 + if (ret) 891 + goto err_pagelist_free; 899 892 900 893 op->xattr.value_len = size; 901 - ceph_pagelist_append(pagelist, value, size); 894 + ret = ceph_pagelist_append(pagelist, value, size); 895 + if (ret) 896 + goto err_pagelist_free; 902 897 payload_len += size; 903 898 904 899 op->xattr.cmp_op = cmp_op; ··· 911 898 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 912 899 op->indata_len = payload_len; 913 900 return 0; 901 + 902 + 
err_pagelist_free: 903 + ceph_pagelist_release(pagelist); 904 + return ret; 914 905 } 915 906 EXPORT_SYMBOL(osd_req_op_xattr_init); 916 907 ··· 1505 1488 1506 1489 static enum calc_target_result calc_target(struct ceph_osd_client *osdc, 1507 1490 struct ceph_osd_request_target *t, 1508 - struct ceph_connection *con, 1509 1491 bool any_change) 1510 1492 { 1511 1493 struct ceph_pg_pool_info *pi; ··· 2288 2272 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 2289 2273 2290 2274 again: 2291 - ct_res = calc_target(osdc, &req->r_t, NULL, false); 2275 + ct_res = calc_target(osdc, &req->r_t, false); 2292 2276 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) 2293 2277 goto promote; 2294 2278 ··· 2491 2475 up_write(&osdc->lock); 2492 2476 } 2493 2477 EXPORT_SYMBOL(ceph_osdc_abort_requests); 2478 + 2479 + void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) 2480 + { 2481 + down_write(&osdc->lock); 2482 + osdc->abort_err = 0; 2483 + up_write(&osdc->lock); 2484 + } 2485 + EXPORT_SYMBOL(ceph_osdc_clear_abort_err); 2494 2486 2495 2487 static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) 2496 2488 { ··· 3111 3087 lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; 3112 3088 } 3113 3089 3114 - calc_target(osdc, &lreq->t, NULL, false); 3090 + calc_target(osdc, &lreq->t, false); 3115 3091 osd = lookup_create_osd(osdc, lreq->t.osd, true); 3116 3092 link_linger(osd, lreq); 3117 3093 ··· 3728 3704 struct ceph_osd_client *osdc = lreq->osdc; 3729 3705 enum calc_target_result ct_res; 3730 3706 3731 - ct_res = calc_target(osdc, &lreq->t, NULL, true); 3707 + ct_res = calc_target(osdc, &lreq->t, true); 3732 3708 if (ct_res == CALC_TARGET_NEED_RESEND) { 3733 3709 struct ceph_osd *osd; 3734 3710 ··· 3800 3776 n = rb_next(n); /* unlink_request(), check_pool_dne() */ 3801 3777 3802 3778 dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 3803 - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, 3804 - false); 3779 + ct_res = calc_target(osdc, 
&req->r_t, false); 3805 3780 switch (ct_res) { 3806 3781 case CALC_TARGET_NO_ACTION: 3807 3782 force_resend_writes = cleared_full || ··· 3909 3886 n = rb_next(n); 3910 3887 3911 3888 if (req->r_t.epoch < osdc->osdmap->epoch) { 3912 - ct_res = calc_target(osdc, &req->r_t, NULL, false); 3889 + ct_res = calc_target(osdc, &req->r_t, false); 3913 3890 if (ct_res == CALC_TARGET_POOL_DNE) { 3914 3891 erase_request(need_resend, req); 3915 3892 check_pool_dne(req); ··· 5108 5085 return ret; 5109 5086 } 5110 5087 EXPORT_SYMBOL(ceph_osdc_call); 5088 + 5089 + /* 5090 + * reset all osd connections 5091 + */ 5092 + void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) 5093 + { 5094 + struct rb_node *n; 5095 + 5096 + down_write(&osdc->lock); 5097 + for (n = rb_first(&osdc->osds); n; ) { 5098 + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 5099 + 5100 + n = rb_next(n); 5101 + if (!reopen_osd(osd)) 5102 + kick_osd_requests(osd); 5103 + } 5104 + up_write(&osdc->lock); 5105 + } 5111 5106 5112 5107 /* 5113 5108 * init, shutdown
+43 -26
net/ceph/osdmap.c
··· 973 973 struct ceph_pg_pool_info, node); 974 974 __remove_pg_pool(&map->pg_pools, pi); 975 975 } 976 - kfree(map->osd_state); 977 - kfree(map->osd_weight); 978 - kfree(map->osd_addr); 979 - kfree(map->osd_primary_affinity); 980 - kfree(map->crush_workspace); 976 + kvfree(map->osd_state); 977 + kvfree(map->osd_weight); 978 + kvfree(map->osd_addr); 979 + kvfree(map->osd_primary_affinity); 980 + kvfree(map->crush_workspace); 981 981 kfree(map); 982 982 } 983 983 ··· 986 986 * 987 987 * The new elements are properly initialized. 988 988 */ 989 - static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 989 + static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) 990 990 { 991 991 u32 *state; 992 992 u32 *weight; 993 993 struct ceph_entity_addr *addr; 994 + u32 to_copy; 994 995 int i; 995 996 996 - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); 997 - if (!state) 997 + dout("%s old %u new %u\n", __func__, map->max_osd, max); 998 + if (max == map->max_osd) 999 + return 0; 1000 + 1001 + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); 1002 + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); 1003 + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); 1004 + if (!state || !weight || !addr) { 1005 + kvfree(state); 1006 + kvfree(weight); 1007 + kvfree(addr); 998 1008 return -ENOMEM; 1009 + } 1010 + 1011 + to_copy = min(map->max_osd, max); 1012 + if (map->osd_state) { 1013 + memcpy(state, map->osd_state, to_copy * sizeof(*state)); 1014 + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); 1015 + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); 1016 + kvfree(map->osd_state); 1017 + kvfree(map->osd_weight); 1018 + kvfree(map->osd_addr); 1019 + } 1020 + 999 1021 map->osd_state = state; 1000 - 1001 - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); 1002 - if (!weight) 1003 - return -ENOMEM; 1004 1022 map->osd_weight = weight; 1005 - 1006 - addr = krealloc(map->osd_addr, 
max*sizeof(*addr), GFP_NOFS); 1007 - if (!addr) 1008 - return -ENOMEM; 1009 1023 map->osd_addr = addr; 1010 - 1011 1024 for (i = map->max_osd; i < max; i++) { 1012 1025 map->osd_state[i] = 0; 1013 1026 map->osd_weight[i] = CEPH_OSD_OUT; ··· 1030 1017 if (map->osd_primary_affinity) { 1031 1018 u32 *affinity; 1032 1019 1033 - affinity = krealloc(map->osd_primary_affinity, 1034 - max*sizeof(*affinity), GFP_NOFS); 1020 + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), 1021 + GFP_NOFS); 1035 1022 if (!affinity) 1036 1023 return -ENOMEM; 1037 - map->osd_primary_affinity = affinity; 1038 1024 1025 + memcpy(affinity, map->osd_primary_affinity, 1026 + to_copy * sizeof(*affinity)); 1027 + kvfree(map->osd_primary_affinity); 1028 + 1029 + map->osd_primary_affinity = affinity; 1039 1030 for (i = map->max_osd; i < max; i++) 1040 1031 map->osd_primary_affinity[i] = 1041 1032 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; ··· 1060 1043 1061 1044 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); 1062 1045 dout("%s work_size %zu bytes\n", __func__, work_size); 1063 - workspace = kmalloc(work_size, GFP_NOIO); 1046 + workspace = ceph_kvmalloc(work_size, GFP_NOIO); 1064 1047 if (!workspace) { 1065 1048 crush_destroy(crush); 1066 1049 return -ENOMEM; ··· 1069 1052 1070 1053 if (map->crush) 1071 1054 crush_destroy(map->crush); 1072 - kfree(map->crush_workspace); 1055 + kvfree(map->crush_workspace); 1073 1056 map->crush = crush; 1074 1057 map->crush_workspace = workspace; 1075 1058 return 0; ··· 1315 1298 if (!map->osd_primary_affinity) { 1316 1299 int i; 1317 1300 1318 - map->osd_primary_affinity = kmalloc_array(map->max_osd, 1319 - sizeof(u32), 1320 - GFP_NOFS); 1301 + map->osd_primary_affinity = ceph_kvmalloc( 1302 + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), 1303 + GFP_NOFS); 1321 1304 if (!map->osd_primary_affinity) 1322 1305 return -ENOMEM; 1323 1306 ··· 1338 1321 1339 1322 ceph_decode_32_safe(p, end, len, e_inval); 1340 1323 if (len == 0) { 1341 - 
kfree(map->osd_primary_affinity); 1324 + kvfree(map->osd_primary_affinity); 1342 1325 map->osd_primary_affinity = NULL; 1343 1326 return 0; 1344 1327 }