Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ceph-for-4.16-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"Things have been very quiet on the rbd side, as work continues on the
big ticket items slated for the next merge window.

On the CephFS side we have a large number of cap handling
improvements, a fix for our long-standing abuse of ->journal_info in
ceph_readpages() and yet another dentry pointer management patch"

* tag 'ceph-for-4.16-rc1' of git://github.com/ceph/ceph-client:
ceph: improving efficiency of syncfs
libceph: check kstrndup() return value
ceph: try to allocate enough memory for reserved caps
ceph: fix race of queuing delayed caps
ceph: delete unreachable code in ceph_check_caps()
ceph: limit rate of cap import/export error messages
ceph: fix incorrect snaprealm when adding caps
ceph: fix un-balanced fsc->writeback_count update
ceph: track read contexts in ceph_file_info
ceph: avoid dereferencing invalid pointer during cached readdir
ceph: use atomic_t for ceph_inode_info::i_shared_gen
ceph: cleanup traceless reply handling for rename
ceph: voluntarily drop Fx cap for readdir request
ceph: properly drop caps for setattr request
ceph: voluntarily drop Lx cap for link/rename requests
ceph: voluntarily drop Ax cap for requests that create new inode
rbd: whitelist RBD_FEATURE_OPERATIONS feature bit
rbd: don't NULL out ->obj_request in rbd_img_obj_parent_read_full()
rbd: use kmem_cache_zalloc() in rbd_img_request_create()
rbd: obj_request->completion is unused
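
The ->journal_info cleanup mentioned in the pull message replaces the old trick of stashing a file pointer in current->journal_info around page-cache reads with a small per-file list of read/write contexts. A condensed sketch of the new pattern, pieced together from the fs/ceph/super.h and fs/ceph/file.c hunks below (cap accounting and error handling omitted, so treat it as an illustration rather than a complete excerpt):

    /* From fs/ceph/super.h: one context per task doing I/O on the file. */
    struct ceph_rw_context {
            struct list_head list;          /* linked on ceph_file_info->rw_contexts */
            struct task_struct *thread;     /* task that registered the context */
            int caps;                       /* caps held while the I/O is in flight */
    };

    /* Reader side, as in ceph_read_iter()/ceph_filemap_fault(): 'got' is the
     * cap mask returned by ceph_get_caps() earlier in the function.  Registering
     * a context lets start_read()/ceph_readpages(), which run in the same task,
     * find it via ceph_find_rw_context() instead of peeking at
     * current->journal_info. */
    CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
    ceph_add_rw_context(fi, &rw_ctx);
    ret = generic_file_read_iter(iocb, to);
    ceph_del_rw_context(fi, &rw_ctx);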

+323 -145
+6 -16
drivers/block/rbd.c
···
 #define RBD_FEATURE_STRIPINGV2          (1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK      (1ULL<<2)
 #define RBD_FEATURE_DATA_POOL           (1ULL<<7)
+#define RBD_FEATURE_OPERATIONS          (1ULL<<8)
 
 #define RBD_FEATURES_ALL        (RBD_FEATURE_LAYERING |         \
                                  RBD_FEATURE_STRIPINGV2 |       \
                                  RBD_FEATURE_EXCLUSIVE_LOCK |   \
-                                 RBD_FEATURE_DATA_POOL)
+                                 RBD_FEATURE_DATA_POOL |        \
+                                 RBD_FEATURE_OPERATIONS)
 
 /* Features supported by this (client software) implementation. */
 
···
         int                     result;
 
         rbd_obj_callback_t      callback;
-        struct completion       completion;
 
         struct kref             kref;
 };
···
 {
         dout("%s: obj %p cb %p\n", __func__, obj_request,
              obj_request->callback);
-        if (obj_request->callback)
-                obj_request->callback(obj_request);
-        else
-                complete_all(&obj_request->completion);
+        obj_request->callback(obj_request);
 }
 
 static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
···
         obj_request->which = BAD_WHICH;
         obj_request->type = type;
         INIT_LIST_HEAD(&obj_request->links);
-        init_completion(&obj_request->completion);
         kref_init(&obj_request->kref);
 
         dout("%s %p\n", __func__, obj_request);
···
 {
         struct rbd_img_request *img_request;
 
-        img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
+        img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
         if (!img_request)
                 return NULL;
 
-        img_request->rq = NULL;
         img_request->rbd_dev = rbd_dev;
         img_request->offset = offset;
         img_request->length = length;
-        img_request->flags = 0;
         if (op_type == OBJ_OP_DISCARD) {
                 img_request_discard_set(img_request);
                 img_request->snapc = snapc;
···
         }
         if (rbd_dev_parent_get(rbd_dev))
                 img_request_layered_set(img_request);
+
         spin_lock_init(&img_request->completion_lock);
-        img_request->next_completion = 0;
-        img_request->callback = NULL;
-        img_request->result = 0;
-        img_request->obj_request_count = 0;
         INIT_LIST_HEAD(&img_request->obj_requests);
         kref_init(&img_request->kref);
 
···
 
         parent_request->copyup_pages = NULL;
         parent_request->copyup_page_count = 0;
-        parent_request->obj_request = NULL;
-        rbd_obj_request_put(obj_request);
 out_err:
         if (pages)
                 ceph_release_page_vector(pages, page_count);
+18 -10
fs/ceph/addr.c
···
  * start an async read(ahead) operation.  return nr_pages we submitted
  * a read for on success, or negative error code.
  */
-static int start_read(struct inode *inode, struct list_head *page_list, int max)
+static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
+                      struct list_head *page_list, int max)
 {
         struct ceph_osd_client *osdc =
                 &ceph_inode_to_client(inode)->client->osdc;
···
         int got = 0;
         int ret = 0;
 
-        if (!current->journal_info) {
+        if (!rw_ctx) {
                 /* caller of readpages does not hold buffer and read caps
                  * (fadvise, madvise and readahead cases) */
                 int want = CEPH_CAP_FILE_CACHE;
···
 {
         struct inode *inode = file_inode(file);
         struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+        struct ceph_file_info *ci = file->private_data;
+        struct ceph_rw_context *rw_ctx;
         int rc = 0;
         int max = 0;
 
···
         if (rc == 0)
                 goto out;
 
+        rw_ctx = ceph_find_rw_context(ci);
         max = fsc->mount_options->rsize >> PAGE_SHIFT;
-        dout("readpages %p file %p nr_pages %d max %d\n",
-             inode, file, nr_pages, max);
+        dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
+             inode, file, rw_ctx, nr_pages, max);
         while (!list_empty(page_list)) {
-                rc = start_read(inode, page_list, max);
+                rc = start_read(inode, rw_ctx, page_list, max);
                 if (rc < 0)
                         goto out;
         }
···
         struct ceph_fs_client *fsc;
         struct ceph_snap_context *snapc, *oldest;
         loff_t page_off = page_offset(page);
-        long writeback_stat;
         int err, len = PAGE_SIZE;
         struct ceph_writeback_ctl ceph_wbc;
 
···
         dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
              inode, page, page->index, page_off, len, snapc, snapc->seq);
 
-        writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
-        if (writeback_stat >
+        if (atomic_long_inc_return(&fsc->writeback_count) >
             CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
                 set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
···
         end_page_writeback(page);
         ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
         ceph_put_snap_context(snapc);   /* page's reference */
+
+        if (atomic_long_dec_return(&fsc->writeback_count) <
+            CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+                clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+
         return err;
 }
 
···
 
         if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
             ci->i_inline_version == CEPH_INLINE_NONE) {
-                current->journal_info = vma->vm_file;
+                CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+                ceph_add_rw_context(fi, &rw_ctx);
                 ret = filemap_fault(vmf);
-                current->journal_info = NULL;
+                ceph_del_rw_context(fi, &rw_ctx);
         } else
                 ret = -EAGAIN;
 
+117 -53
fs/ceph/caps.c
···
         spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+/*
+ * Called under mdsc->mutex.
+ */
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
                       struct ceph_cap_reservation *ctx, int need)
 {
-        int i;
+        int i, j;
         struct ceph_cap *cap;
         int have;
         int alloc = 0;
+        int max_caps;
+        bool trimmed = false;
+        struct ceph_mds_session *s;
         LIST_HEAD(newcaps);
 
         dout("reserve caps ctx=%p need=%d\n", ctx, need);
···
         spin_unlock(&mdsc->caps_list_lock);
 
         for (i = have; i < need; i++) {
+retry:
                 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-                if (!cap)
-                        break;
+                if (!cap) {
+                        if (!trimmed) {
+                                for (j = 0; j < mdsc->max_sessions; j++) {
+                                        s = __ceph_lookup_mds_session(mdsc, j);
+                                        if (!s)
+                                                continue;
+                                        mutex_unlock(&mdsc->mutex);
+
+                                        mutex_lock(&s->s_mutex);
+                                        max_caps = s->s_nr_caps - (need - i);
+                                        ceph_trim_caps(mdsc, s, max_caps);
+                                        mutex_unlock(&s->s_mutex);
+
+                                        ceph_put_mds_session(s);
+                                        mutex_lock(&mdsc->mutex);
+                                }
+                                trimmed = true;
+                                goto retry;
+                        } else {
+                                pr_warn("reserve caps ctx=%p ENOMEM "
+                                        "need=%d got=%d\n",
+                                        ctx, need, have + alloc);
+                                goto out_nomem;
+                        }
+                }
                 list_add(&cap->caps_item, &newcaps);
                 alloc++;
         }
-        /* we didn't manage to reserve as much as we needed */
-        if (have + alloc != need)
-                pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
-                        ctx, need, have + alloc);
+        BUG_ON(have + alloc != need);
 
         spin_lock(&mdsc->caps_list_lock);
         mdsc->caps_total_count += alloc;
···
         dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
              ctx, mdsc->caps_total_count, mdsc->caps_use_count,
              mdsc->caps_reserve_count, mdsc->caps_avail_count);
+        return 0;
+
+out_nomem:
+        while (!list_empty(&newcaps)) {
+                cap = list_first_entry(&newcaps,
+                                struct ceph_cap, caps_item);
+                list_del(&cap->caps_item);
+                kmem_cache_free(ceph_cap_cachep, cap);
+        }
+
+        spin_lock(&mdsc->caps_list_lock);
+        mdsc->caps_avail_count += have;
+        mdsc->caps_reserve_count -= have;
+        BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+                                         mdsc->caps_reserve_count +
+                                         mdsc->caps_avail_count);
+        spin_unlock(&mdsc->caps_list_lock);
+        return -ENOMEM;
 }
 
 int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
···
          */
         if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
                 if (issued & CEPH_CAP_FILE_SHARED)
-                        ci->i_shared_gen++;
+                        atomic_inc(&ci->i_shared_gen);
                 if (S_ISDIR(ci->vfs_inode.i_mode)) {
                         dout(" marking %p NOT complete\n", &ci->vfs_inode);
                         __ceph_dir_clear_complete(ci);
···
                 }
         }
 
-        if (!ci->i_snap_realm) {
+        if (!ci->i_snap_realm ||
+            ((flags & CEPH_CAP_FLAG_AUTH) &&
+             realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
                 /*
                  * add this inode to the appropriate snap realm
                  */
                 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
                                                                realmino);
                 if (realm) {
+                        struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+                        if (oldrealm) {
+                                spin_lock(&oldrealm->inodes_with_caps_lock);
+                                list_del_init(&ci->i_snap_realm_item);
+                                spin_unlock(&oldrealm->inodes_with_caps_lock);
+                        }
+
                         spin_lock(&realm->inodes_with_caps_lock);
                         ci->i_snap_realm = realm;
                         list_add(&ci->i_snap_realm_item,
                                  &realm->inodes_with_caps);
                         spin_unlock(&realm->inodes_with_caps_lock);
+
+                        if (oldrealm)
+                                ceph_put_snap_realm(mdsc, oldrealm);
                 } else {
                         pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
                                realmino);
···
 /*
  * called under i_ceph_lock
  */
+static int __ceph_is_single_caps(struct ceph_inode_info *ci)
+{
+        return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
+}
+
 static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 {
         return !RB_EMPTY_ROOT(&ci->i_caps);
···
         int mds = -1;   /* keep track of how far we've gone through i_caps list
                            to avoid an infinite loop on retry */
         struct rb_node *p;
-        int delayed = 0, sent = 0, num;
-        bool is_delayed = flags & CHECK_CAPS_NODELAY;
+        int delayed = 0, sent = 0;
+        bool no_delay = flags & CHECK_CAPS_NODELAY;
         bool queue_invalidate = false;
-        bool force_requeue = false;
         bool tried_invalidate = false;
 
         /* if we are unmounting, flush any unused caps immediately. */
         if (mdsc->stopping)
-                is_delayed = true;
+                no_delay = true;
 
         spin_lock(&ci->i_ceph_lock);
 
         if (ci->i_ceph_flags & CEPH_I_FLUSH)
                 flags |= CHECK_CAPS_FLUSH;
+
+        if (!(flags & CHECK_CAPS_AUTHONLY) ||
+            (ci->i_auth_cap && __ceph_is_single_caps(ci)))
+                __cap_delay_cancel(mdsc, ci);
 
         goto retry_locked;
 retry:
···
          * have cached pages, but don't want them, then try to invalidate.
          * If we fail, it's because pages are locked.... try again later.
          */
-        if ((!is_delayed || mdsc->stopping) &&
+        if ((!no_delay || mdsc->stopping) &&
             !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
             !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
             inode->i_data.nrpages &&            /* have cached pages */
···
             !tried_invalidate) {
                 dout("check_caps trying to invalidate on %p\n", inode);
                 if (try_nonblocking_invalidate(inode) < 0) {
-                        if (revoking & (CEPH_CAP_FILE_CACHE|
-                                        CEPH_CAP_FILE_LAZYIO)) {
-                                dout("check_caps queuing invalidate\n");
-                                queue_invalidate = true;
-                                ci->i_rdcache_revoking = ci->i_rdcache_gen;
-                        } else {
-                                dout("check_caps failed to invalidate pages\n");
-                                /* we failed to invalidate pages.  check these
-                                   caps again later. */
-                                force_requeue = true;
-                                __cap_set_timeouts(mdsc, ci);
-                        }
+                        dout("check_caps queuing invalidate\n");
+                        queue_invalidate = true;
+                        ci->i_rdcache_revoking = ci->i_rdcache_gen;
                 }
                 tried_invalidate = true;
                 goto retry_locked;
         }
 
-        num = 0;
         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                 cap = rb_entry(p, struct ceph_cap, ci_node);
-                num++;
 
                 /* avoid looping forever */
                 if (mds >= cap->mds ||
···
                     cap->mds_wanted == want)
                         continue;     /* nope, all good */
 
-                if (is_delayed)
+                if (no_delay)
                         goto ack;
 
                 /* delay? */
···
                 goto retry; /* retake i_ceph_lock and restart our cap scan. */
         }
 
-        /*
-         * Reschedule delayed caps release if we delayed anything,
-         * otherwise cancel.
-         */
-        if (delayed && is_delayed)
-                force_requeue = true;   /* __send_cap delayed release; requeue */
-        if (!delayed && !is_delayed)
-                __cap_delay_cancel(mdsc, ci);
-        else if (!is_delayed || force_requeue)
+        /* Reschedule delayed caps release if we delayed anything */
+        if (delayed)
                 __cap_delay_requeue(mdsc, ci);
 
         spin_unlock(&ci->i_ceph_lock);
···
         u64 flush_tid;
         int err = 0;
         int dirty;
-        int wait = wbc->sync_mode == WB_SYNC_ALL;
+        int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
 
         dout("write_inode %p wait=%d\n", inode, wait);
         if (wait) {
···
          */
 
         issued = cap->issued;
-        WARN_ON(issued != cap->implemented);
+        if (issued != cap->implemented)
+                pr_err_ratelimited("handle_cap_export: issued != implemented: "
+                                "ino (%llx.%llx) mds%d seq %d mseq %d "
+                                "issued %s implemented %s\n",
+                                ceph_vinop(inode), mds, cap->seq, cap->mseq,
+                                ceph_cap_string(issued),
+                                ceph_cap_string(cap->implemented));
+
 
         tcap = __get_cap_for_mds(ci, target);
         if (tcap) {
···
                 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
                     (ocap->seq != le32_to_cpu(ph->seq) ||
                      ocap->mseq != le32_to_cpu(ph->mseq))) {
-                        pr_err("handle_cap_import: mismatched seq/mseq: "
-                               "ino (%llx.%llx) mds%d seq %d mseq %d "
-                               "importer mds%d has peer seq %d mseq %d\n",
-                               ceph_vinop(inode), peer, ocap->seq,
-                               ocap->mseq, mds, le32_to_cpu(ph->seq),
-                               le32_to_cpu(ph->mseq));
+                        pr_err_ratelimited("handle_cap_import: "
+                                        "mismatched seq/mseq: ino (%llx.%llx) "
+                                        "mds%d seq %d mseq %d importer mds%d "
+                                        "has peer seq %d mseq %d\n",
+                                        ceph_vinop(inode), peer, ocap->seq,
+                                        ocap->mseq, mds, le32_to_cpu(ph->seq),
+                                        le32_to_cpu(ph->mseq));
                 }
                 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
         }
···
 
         cap = __get_cap_for_mds(ci, mds);
         if (cap && __cap_is_valid(cap)) {
-                if (force ||
-                    ((cap->issued & drop) &&
-                     (cap->issued & unless) == 0)) {
-                        if ((cap->issued & drop) &&
-                            (cap->issued & unless) == 0) {
+                unless &= cap->issued;
+                if (unless) {
+                        if (unless & CEPH_CAP_AUTH_EXCL)
+                                drop &= ~CEPH_CAP_AUTH_SHARED;
+                        if (unless & CEPH_CAP_LINK_EXCL)
+                                drop &= ~CEPH_CAP_LINK_SHARED;
+                        if (unless & CEPH_CAP_XATTR_EXCL)
+                                drop &= ~CEPH_CAP_XATTR_SHARED;
+                        if (unless & CEPH_CAP_FILE_EXCL)
+                                drop &= ~CEPH_CAP_FILE_SHARED;
+                }
+
+                if (force || (cap->issued & drop)) {
+                        if (cap->issued & drop) {
                                 int wanted = __ceph_caps_wanted(ci);
                                 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
                                         wanted |= cap->mds_wanted;
···
                         *p += sizeof(*rel);
                         ret = 1;
                 } else {
-                        dout("encode_inode_release %p cap %p %s\n",
+                        dout("encode_inode_release %p cap %p %s (noop)\n",
                              inode, cap, ceph_cap_string(cap->issued));
                 }
         }
+45 -34
fs/ceph/dir.c
···
  * the MDS if/when the directory is modified).
  */
 static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
-                            u32 shared_gen)
+                            int shared_gen)
 {
         struct ceph_file_info *fi = file->private_data;
         struct dentry *parent = file->f_path.dentry;
···
         u64 idx = 0;
         int err = 0;
 
-        dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
+        dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
 
         /* search start position */
         if (ctx->pos > 2) {
···
                         goto out;
                 }
 
-                di = ceph_dentry(dentry);
                 spin_lock(&dentry->d_lock);
-                if (di->lease_shared_gen == shared_gen &&
-                    d_really_is_positive(dentry) &&
-                    fpos_cmp(ctx->pos, di->offset) <= 0) {
+                di = ceph_dentry(dentry);
+                if (d_unhashed(dentry) ||
+                    d_really_is_negative(dentry) ||
+                    di->lease_shared_gen != shared_gen) {
+                        spin_unlock(&dentry->d_lock);
+                        dput(dentry);
+                        err = -EAGAIN;
+                        goto out;
+                }
+                if (fpos_cmp(ctx->pos, di->offset) <= 0) {
                         emit_dentry = true;
                 }
                 spin_unlock(&dentry->d_lock);
···
             ceph_snap(inode) != CEPH_SNAPDIR &&
             __ceph_dir_is_complete_ordered(ci) &&
             __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
-                u32 shared_gen = ci->i_shared_gen;
+                int shared_gen = atomic_read(&ci->i_shared_gen);
                 spin_unlock(&ci->i_ceph_lock);
                 err = __dcache_readdir(file, ctx, shared_gen);
                 if (err != -EAGAIN)
···
         if (op == CEPH_MDS_OP_READDIR) {
                 req->r_direct_hash = ceph_frag_value(frag);
                 __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+                req->r_inode_drop = CEPH_CAP_FILE_EXCL;
         }
         if (fi->last_name) {
                 req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
···
                 spin_unlock(&ci->i_ceph_lock);
                 dout(" dir %p complete, -ENOENT\n", dir);
                 d_add(dentry, NULL);
-                di->lease_shared_gen = ci->i_shared_gen;
+                di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
                 return NULL;
         }
         spin_unlock(&ci->i_ceph_lock);
···
         set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
         req->r_args.mknod.mode = cpu_to_le32(mode);
         req->r_args.mknod.rdev = cpu_to_le32(rdev);
-        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+        req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
         if (acls.pagelist) {
                 req->r_pagelist = acls.pagelist;
···
         set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
         req->r_dentry = dget(dentry);
         req->r_num_caps = 2;
-        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+        req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
         err = ceph_mdsc_do_request(mdsc, dir, req);
         if (!err && !req->r_reply_info.head->is_dentry)
···
         req->r_parent = dir;
         set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
         req->r_args.mkdir.mode = cpu_to_le32(mode);
-        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+        req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
         if (acls.pagelist) {
                 req->r_pagelist = acls.pagelist;
···
         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
         /* release LINK_SHARED on source inode (mds will lock it) */
-        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
         err = ceph_mdsc_do_request(mdsc, dir, req);
         if (err) {
                 d_drop(dentry);
···
         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
         /* release LINK_RDCACHE on source inode (mds will lock it) */
-        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
         if (d_really_is_positive(new_dentry))
                 req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
         err = ceph_mdsc_do_request(mdsc, old_dir, req);
···
                  * do_request, above).  If there is no trace, we need
                  * to do it here.
                  */
-
-                /* d_move screws up sibling dentries' offsets */
-                ceph_dir_clear_complete(old_dir);
-                ceph_dir_clear_complete(new_dir);
-
                 d_move(old_dentry, new_dentry);
-
-                /* ensure target dentry is invalidated, despite
-                   rehashing bug in vfs_rename_dir */
-                ceph_invalidate_dentry_lease(new_dentry);
         }
         ceph_mdsc_put_request(req);
         return err;
···
         int valid = 0;
 
         spin_lock(&ci->i_ceph_lock);
-        if (ci->i_shared_gen == di->lease_shared_gen)
+        if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
                 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
         spin_unlock(&ci->i_ceph_lock);
         dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
-             dir, (unsigned)ci->i_shared_gen, dentry,
-             (unsigned)di->lease_shared_gen, valid);
+             dir, (unsigned)atomic_read(&ci->i_shared_gen),
+             dentry, (unsigned)di->lease_shared_gen, valid);
         return valid;
 }
 
···
  */
 static void ceph_d_prune(struct dentry *dentry)
 {
-        dout("ceph_d_prune %p\n", dentry);
+        struct ceph_inode_info *dir_ci;
+        struct ceph_dentry_info *di;
+
+        dout("ceph_d_prune %pd %p\n", dentry, dentry);
 
         /* do we have a valid parent? */
         if (IS_ROOT(dentry))
                 return;
 
-        /* if we are not hashed, we don't affect dir's completeness */
-        if (d_unhashed(dentry))
+        /* we hold d_lock, so d_parent is stable */
+        dir_ci = ceph_inode(d_inode(dentry->d_parent));
+        if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
                 return;
 
-        if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
+        /* who calls d_delete() should also disable dcache readdir */
+        if (d_really_is_negative(dentry))
                 return;
 
-        /*
-         * we hold d_lock, so d_parent is stable, and d_fsdata is never
-         * cleared until d_release
-         */
-        ceph_dir_clear_complete(d_inode(dentry->d_parent));
+        /* d_fsdata does not get cleared until d_release */
+        if (!d_unhashed(dentry)) {
+                __ceph_dir_clear_complete(dir_ci);
+                return;
+        }
+
+        /* Disable dcache readdir just in case that someone called d_drop()
+         * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
+         * properly (dcache readdir is still enabled) */
+        di = ceph_dentry(dentry);
+        if (di->offset > 0 &&
+            di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
+                __ceph_dir_clear_ordered(dir_ci);
 }
 
 /*
+9 -3
fs/ceph/file.c
···
                 return -ENOMEM;
         }
         cf->fmode = fmode;
+
+        spin_lock_init(&cf->rw_contexts_lock);
+        INIT_LIST_HEAD(&cf->rw_contexts);
+
         cf->next_offset = 2;
         cf->readdir_cache_idx = -1;
         file->private_data = cf;
···
         req->r_dentry = dget(dentry);
         req->r_num_caps = 2;
         if (flags & O_CREAT) {
-                req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+                req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
                 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
                 if (acls.pagelist) {
                         req->r_pagelist = acls.pagelist;
···
                 ceph_mdsc_put_request(cf->last_readdir);
         kfree(cf->last_name);
         kfree(cf->dir_info);
+        WARN_ON(!list_empty(&cf->rw_contexts));
         kmem_cache_free(ceph_file_cachep, cf);
 
         /* wake up anyone waiting for caps on this inode */
···
                         retry_op = READ_INLINE;
                 }
         } else {
+                CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
                 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
                      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
                      ceph_cap_string(got));
-                current->journal_info = filp;
+                ceph_add_rw_context(fi, &rw_ctx);
                 ret = generic_file_read_iter(iocb, to);
-                current->journal_info = NULL;
+                ceph_del_rw_context(fi, &rw_ctx);
         }
         dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
              inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+42 -14
fs/ceph/inode.c
···
         ci->i_wrbuffer_ref = 0;
         ci->i_wrbuffer_ref_head = 0;
         atomic_set(&ci->i_filelock_ref, 0);
-        ci->i_shared_gen = 0;
+        atomic_set(&ci->i_shared_gen, 0);
         ci->i_rdcache_gen = 0;
         ci->i_rdcache_revoking = 0;
 
···
         if (ceph_snap(dir) != CEPH_NOSNAP)
                 goto out_unlock;
 
-        di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+        di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
 
         if (duration == 0)
                 goto out_unlock;
···
         struct dentry *realdn;
 
         BUG_ON(d_inode(dn));
+
+        if (S_ISDIR(in->i_mode)) {
+                /* If inode is directory, d_splice_alias() below will remove
+                 * 'realdn' from its origin parent. We need to ensure that
+                 * origin parent's readdir cache will not reference 'realdn'
+                 */
+                realdn = d_find_any_alias(in);
+                if (realdn) {
+                        struct ceph_dentry_info *di = ceph_dentry(realdn);
+                        spin_lock(&realdn->d_lock);
+
+                        realdn->d_op->d_prune(realdn);
+
+                        di->time = jiffies;
+                        di->lease_shared_gen = 0;
+                        di->offset = 0;
+
+                        spin_unlock(&realdn->d_lock);
+                        dput(realdn);
+                }
+        }
 
         /* dn must be unhashed */
         if (!d_unhashed(dn))
···
         if (!rinfo->head->is_target) {
                 dout("fill_trace null dentry\n");
                 if (d_really_is_positive(dn)) {
-                        ceph_dir_clear_ordered(dir);
                         dout("d_delete %p\n", dn);
+                        ceph_dir_clear_ordered(dir);
                         d_delete(dn);
                 } else if (have_lease) {
                         if (d_unhashed(dn))
···
                         dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
                              dn, d_inode(dn), ceph_vinop(d_inode(dn)),
                              ceph_vinop(in));
-                        ceph_dir_clear_ordered(dir);
                         d_invalidate(dn);
                         have_lease = false;
                 }
···
                 } else if (d_really_is_positive(dn) &&
                            (ceph_ino(d_inode(dn)) != tvino.ino ||
                             ceph_snap(d_inode(dn)) != tvino.snap)) {
+                        struct ceph_dentry_info *di = ceph_dentry(dn);
                         dout(" dn %p points to wrong inode %p\n",
                              dn, d_inode(dn));
-                        __ceph_dir_clear_ordered(ci);
+
+                        spin_lock(&dn->d_lock);
+                        if (di->offset > 0 &&
+                            di->lease_shared_gen ==
+                            atomic_read(&ci->i_shared_gen)) {
+                                __ceph_dir_clear_ordered(ci);
+                                di->offset = 0;
+                        }
+                        spin_unlock(&dn->d_lock);
+
                         d_delete(dn);
                         dput(dn);
                         goto retry_lookup;
···
                                 &req->r_caps_reservation);
                 if (ret < 0) {
                         pr_err("fill_inode badness on %p\n", in);
-                        if (d_really_is_positive(dn))
-                                __ceph_dir_clear_ordered(ci);
-                        else
+                        if (d_really_is_negative(dn))
                                 iput(in);
                         d_drop(dn);
                         err = ret;
···
                         ceph_encode_timespec(&req->r_args.setattr.atime,
                                              &attr->ia_atime);
                         mask |= CEPH_SETATTR_ATIME;
-                        release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
-                                CEPH_CAP_FILE_WR;
+                        release |= CEPH_CAP_FILE_SHARED |
+                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
                 }
         }
         if (ia_valid & ATTR_MTIME) {
···
                         ceph_encode_timespec(&req->r_args.setattr.mtime,
                                              &attr->ia_mtime);
                         mask |= CEPH_SETATTR_MTIME;
-                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
-                                CEPH_CAP_FILE_WR;
+                        release |= CEPH_CAP_FILE_SHARED |
+                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
                 }
         }
         if (ia_valid & ATTR_SIZE) {
···
                         req->r_args.setattr.old_size =
                                 cpu_to_le64(inode->i_size);
                         mask |= CEPH_SETATTR_SIZE;
-                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
-                                CEPH_CAP_FILE_WR;
+                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
                 }
         }
 
+23 -10
fs/ceph/mds_client.c
···
                                   struct ceph_mds_request *req,
                                   struct inode *dir)
 {
+        int ret = 0;
+
         req->r_tid = ++mdsc->last_tid;
-        if (req->r_num_caps)
-                ceph_reserve_caps(mdsc, &req->r_caps_reservation,
-                                  req->r_num_caps);
+        if (req->r_num_caps) {
+                ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+                                        req->r_num_caps);
+                if (ret < 0) {
+                        pr_err("__register_request %p "
+                               "failed to reserve caps: %d\n", req, ret);
+                        /* set req->r_err to fail early from __do_request */
+                        req->r_err = ret;
+                        return;
+                }
+        }
         dout("__register_request %p tid %lld\n", req, req->r_tid);
         ceph_mdsc_get_request(req);
         insert_request(&mdsc->request_tree, req);
···
 /*
  * Trim session cap count down to some max number.
  */
-static int trim_caps(struct ceph_mds_client *mdsc,
-                     struct ceph_mds_session *session,
-                     int max_caps)
+int ceph_trim_caps(struct ceph_mds_client *mdsc,
+                   struct ceph_mds_session *session,
+                   int max_caps)
 {
         int trim_caps = session->s_nr_caps - max_caps;
 
···
  */
 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
 {
-        struct inode *inode = req->r_parent;
+        struct inode *dir = req->r_parent;
+        struct inode *old_dir = req->r_old_dentry_dir;
 
-        dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
+        dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
 
-        ceph_dir_clear_complete(inode);
+        ceph_dir_clear_complete(dir);
+        if (old_dir)
+                ceph_dir_clear_complete(old_dir);
         if (req->r_dentry)
                 ceph_invalidate_dentry_lease(req->r_dentry);
         if (req->r_old_dentry)
···
                 break;
 
         case CEPH_SESSION_RECALL_STATE:
-                trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+                ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
                 break;
 
         case CEPH_SESSION_FLUSHMSG:
+3
fs/ceph/mds_client.h
···
 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
                                           struct ceph_mds_session *session);
 
+extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
+                          struct ceph_mds_session *session,
+                          int max_caps);
 #endif
+6 -2
fs/ceph/snap.c
···
         /*
          * Move the inode to the new realm
          */
-        spin_lock(&realm->inodes_with_caps_lock);
+        oldrealm = ci->i_snap_realm;
+        spin_lock(&oldrealm->inodes_with_caps_lock);
         list_del_init(&ci->i_snap_realm_item);
+        spin_unlock(&oldrealm->inodes_with_caps_lock);
+
+        spin_lock(&realm->inodes_with_caps_lock);
         list_add(&ci->i_snap_realm_item,
                  &realm->inodes_with_caps);
-        oldrealm = ci->i_snap_realm;
         ci->i_snap_realm = realm;
         spin_unlock(&realm->inodes_with_caps_lock);
+
         spin_unlock(&ci->i_ceph_lock);
 
         ceph_get_snap_realm(mdsc, realm);
+50 -3
fs/ceph/super.h
···
  */
 struct ceph_dentry_info {
         struct ceph_mds_session *lease_session;
-        u32 lease_gen, lease_shared_gen;
+        int lease_shared_gen;
+        u32 lease_gen;
         u32 lease_seq;
         unsigned long lease_renew_after, lease_renew_from;
         struct list_head lru;
···
         int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
         int i_wrbuffer_ref, i_wrbuffer_ref_head;
         atomic_t i_filelock_ref;
-        u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
+        atomic_t i_shared_gen;  /* increment each time we get FILE_SHARED */
         u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
         u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
 
···
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
 extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
                              struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
                                struct ceph_cap_reservation *ctx);
···
         short fmode;     /* initialized on open */
         short flags;     /* CEPH_F_* */
 
+        spinlock_t rw_contexts_lock;
+        struct list_head rw_contexts;
+
         /* readdir: position within the dir */
         u32 frag;
         struct ceph_mds_request *last_readdir;
···
         char *dir_info;
         int dir_info_len;
 };
+
+struct ceph_rw_context {
+        struct list_head list;
+        struct task_struct *thread;
+        int caps;
+};
+
+#define CEPH_DEFINE_RW_CONTEXT(_name, _caps)    \
+        struct ceph_rw_context _name = {        \
+                .thread = current,              \
+                .caps = _caps,                  \
+        }
+
+static inline void ceph_add_rw_context(struct ceph_file_info *cf,
+                                       struct ceph_rw_context *ctx)
+{
+        spin_lock(&cf->rw_contexts_lock);
+        list_add(&ctx->list, &cf->rw_contexts);
+        spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline void ceph_del_rw_context(struct ceph_file_info *cf,
+                                       struct ceph_rw_context *ctx)
+{
+        spin_lock(&cf->rw_contexts_lock);
+        list_del(&ctx->list);
+        spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline struct ceph_rw_context*
+ceph_find_rw_context(struct ceph_file_info *cf)
+{
+        struct ceph_rw_context *ctx, *found = NULL;
+        spin_lock(&cf->rw_contexts_lock);
+        list_for_each_entry(ctx, &cf->rw_contexts, list) {
+                if (ctx->thread == current) {
+                        found = ctx;
+                        break;
+                }
+        }
+        spin_unlock(&cf->rw_contexts_lock);
+        return found;
+}
 
 struct ceph_readdir_cache_control {
         struct page *page;
+4
net/ceph/ceph_common.c
···
                 opt->name = kstrndup(argstr[0].from,
                                       argstr[0].to-argstr[0].from,
                                       GFP_KERNEL);
+                if (!opt->name) {
+                        err = -ENOMEM;
+                        goto out;
+                }
                 break;
         case Opt_secret:
                 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);