Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"A big pile of assorted fixes and improvements for the filesystem with
nothing in particular standing out, except perhaps that the fact that
the MDS never really maintained atime was made official and thus it's
no longer updated on the client either.

We also have a MAINTAINERS update: Jeff is transitioning his
filesystem maintainership duties to Xiubo"

* tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client: (23 commits)
MAINTAINERS: move myself from ceph "Maintainer" to "Reviewer"
ceph: fix decoding of client session messages flags
ceph: switch TASK_INTERRUPTIBLE to TASK_KILLABLE
ceph: remove redundant variable ino
ceph: try to queue a writeback if revoking fails
ceph: fix statfs for subdir mounts
ceph: fix possible deadlock when holding Fwb to get inline_data
ceph: redirty the page for writepage on failure
ceph: try to choose the auth MDS if possible for getattr
ceph: disable updating the atime since cephfs won't maintain it
ceph: flush the mdlog for filesystem sync
ceph: rename unsafe_request_wait()
libceph: use swap() macro instead of taking tmp variable
ceph: fix statx AT_STATX_DONT_SYNC vs AT_STATX_FORCE_SYNC check
ceph: no need to invalidate the fscache twice
ceph: replace usage of found with dedicated list iterator variable
ceph: use dedicated list iterator variable
ceph: update the dlease for the hashed dentry when removing
ceph: stop retrying the request when exceeding 256 times
ceph: stop forwarding the request when exceeding 256 times
...

+257 -107
+2 -2
MAINTAINERS
··· 4566 4566 4567 4567 CEPH COMMON CODE (LIBCEPH) 4568 4568 M: Ilya Dryomov <idryomov@gmail.com> 4569 - M: Jeff Layton <jlayton@kernel.org> 4570 4569 M: Xiubo Li <xiubli@redhat.com> 4570 + R: Jeff Layton <jlayton@kernel.org> 4571 4571 L: ceph-devel@vger.kernel.org 4572 4572 S: Supported 4573 4573 W: http://ceph.com/ ··· 4577 4577 F: net/ceph/ 4578 4578 4579 4579 CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) 4580 - M: Jeff Layton <jlayton@kernel.org> 4581 4580 M: Xiubo Li <xiubli@redhat.com> 4582 4581 M: Ilya Dryomov <idryomov@gmail.com> 4582 + R: Jeff Layton <jlayton@kernel.org> 4583 4583 L: ceph-devel@vger.kernel.org 4584 4584 S: Supported 4585 4585 W: http://ceph.com/
+6 -7
drivers/block/rbd.c
··· 756 756 */ 757 757 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 758 758 { 759 - struct rbd_client *client_node; 760 - bool found = false; 759 + struct rbd_client *rbdc = NULL, *iter; 761 760 762 761 if (ceph_opts->flags & CEPH_OPT_NOSHARE) 763 762 return NULL; 764 763 765 764 spin_lock(&rbd_client_list_lock); 766 - list_for_each_entry(client_node, &rbd_client_list, node) { 767 - if (!ceph_compare_options(ceph_opts, client_node->client)) { 768 - __rbd_get_client(client_node); 765 + list_for_each_entry(iter, &rbd_client_list, node) { 766 + if (!ceph_compare_options(ceph_opts, iter->client)) { 767 + __rbd_get_client(iter); 769 768 770 - found = true; 769 + rbdc = iter; 771 770 break; 772 771 } 773 772 } 774 773 spin_unlock(&rbd_client_list_lock); 775 774 776 - return found ? client_node : NULL; 775 + return rbdc; 777 776 } 778 777 779 778 /*
+29 -21
fs/ceph/addr.c
··· 256 256 struct iov_iter iter; 257 257 ssize_t err = 0; 258 258 size_t len; 259 + int mode; 259 260 260 261 __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); 261 262 __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); ··· 265 264 goto out; 266 265 267 266 /* We need to fetch the inline data. */ 268 - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 267 + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); 268 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); 269 269 if (IS_ERR(req)) { 270 270 err = PTR_ERR(req); 271 271 goto out; ··· 606 604 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, 607 605 ceph_wbc.truncate_seq, ceph_wbc.truncate_size, 608 606 true); 609 - if (IS_ERR(req)) 607 + if (IS_ERR(req)) { 608 + redirty_page_for_writepage(wbc, page); 610 609 return PTR_ERR(req); 610 + } 611 611 612 612 set_page_writeback(page); 613 613 if (caching) ··· 1648 1644 struct inode *inode = file_inode(file); 1649 1645 struct ceph_inode_info *ci = ceph_inode(inode); 1650 1646 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1651 - struct ceph_osd_request *req; 1647 + struct ceph_osd_request *req = NULL; 1652 1648 struct ceph_cap_flush *prealloc_cf; 1653 1649 struct folio *folio = NULL; 1654 1650 u64 inline_version = CEPH_INLINE_NONE; 1655 1651 struct page *pages[1]; 1656 1652 int err = 0; 1657 1653 u64 len; 1658 - 1659 - prealloc_cf = ceph_alloc_cap_flush(); 1660 - if (!prealloc_cf) 1661 - return -ENOMEM; 1662 - 1663 - folio = read_mapping_folio(inode->i_mapping, 0, file); 1664 - if (IS_ERR(folio)) { 1665 - err = PTR_ERR(folio); 1666 - goto out; 1667 - } 1668 - 1669 - folio_lock(folio); 1670 1654 1671 1655 spin_lock(&ci->i_ceph_lock); 1672 1656 inline_version = ci->i_inline_version; ··· 1663 1671 dout("uninline_data %p %llx.%llx inline_version %llu\n", 1664 1672 inode, ceph_vinop(inode), inline_version); 1665 1673 1666 - if (inline_version == 1 || /* initial version, no data */ 1667 - inline_version == 
CEPH_INLINE_NONE) 1668 - goto out_unlock; 1674 + if (inline_version == CEPH_INLINE_NONE) 1675 + return 0; 1676 + 1677 + prealloc_cf = ceph_alloc_cap_flush(); 1678 + if (!prealloc_cf) 1679 + return -ENOMEM; 1680 + 1681 + if (inline_version == 1) /* initial version, no data */ 1682 + goto out_uninline; 1683 + 1684 + folio = read_mapping_folio(inode->i_mapping, 0, file); 1685 + if (IS_ERR(folio)) { 1686 + err = PTR_ERR(folio); 1687 + goto out; 1688 + } 1689 + 1690 + folio_lock(folio); 1669 1691 1670 1692 len = i_size_read(inode); 1671 1693 if (len > folio_size(folio)) ··· 1745 1739 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1746 1740 req->r_end_latency, len, err); 1747 1741 1742 + out_uninline: 1748 1743 if (!err) { 1749 1744 int dirty; 1750 1745 ··· 1764 1757 if (err == -ECANCELED) 1765 1758 err = 0; 1766 1759 out_unlock: 1767 - folio_unlock(folio); 1768 - folio_put(folio); 1760 + if (folio) { 1761 + folio_unlock(folio); 1762 + folio_put(folio); 1763 + } 1769 1764 out: 1770 1765 ceph_free_cap_flush(prealloc_cf); 1771 1766 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", ··· 1786 1777 1787 1778 if (!mapping->a_ops->read_folio) 1788 1779 return -ENOEXEC; 1789 - file_accessed(file); 1790 1780 vma->vm_ops = &ceph_vmops; 1791 1781 return 0; 1792 1782 }
+47 -28
fs/ceph/caps.c
··· 1577 1577 1578 1578 while (first_tid <= last_tid) { 1579 1579 struct ceph_cap *cap = ci->i_auth_cap; 1580 - struct ceph_cap_flush *cf; 1580 + struct ceph_cap_flush *cf = NULL, *iter; 1581 1581 int ret; 1582 1582 1583 1583 if (!(cap && cap->session == session)) { ··· 1587 1587 } 1588 1588 1589 1589 ret = -ENOENT; 1590 - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { 1591 - if (cf->tid >= first_tid) { 1590 + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { 1591 + if (iter->tid >= first_tid) { 1592 + cf = iter; 1592 1593 ret = 0; 1593 1594 break; 1594 1595 } ··· 1911 1910 struct rb_node *p; 1912 1911 bool queue_invalidate = false; 1913 1912 bool tried_invalidate = false; 1913 + bool queue_writeback = false; 1914 1914 1915 1915 if (session) 1916 1916 ceph_get_mds_session(session); ··· 2064 2062 } 2065 2063 2066 2064 /* completed revocation? going down and there are no caps? */ 2067 - if (revoking && (revoking & cap_used) == 0) { 2068 - dout("completed revocation of %s\n", 2069 - ceph_cap_string(cap->implemented & ~cap->issued)); 2070 - goto ack; 2065 + if (revoking) { 2066 + if ((revoking & cap_used) == 0) { 2067 + dout("completed revocation of %s\n", 2068 + ceph_cap_string(cap->implemented & ~cap->issued)); 2069 + goto ack; 2070 + } 2071 + 2072 + /* 2073 + * If the "i_wrbuffer_ref" was increased by mmap or generic 2074 + * cache write just before the ceph_check_caps() is called, 2075 + * the Fb capability revoking will fail this time. Then we 2076 + * must wait for the BDI's delayed work to flush the dirty 2077 + * pages and to release the "i_wrbuffer_ref", which will cost 2078 + * at most 5 seconds. That means the MDS needs to wait at 2079 + * most 5 seconds to finish the Fb capability's revocation. 2080 + * 2081 + * Let's queue a writeback for it. 
2082 + */ 2083 + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && 2084 + (revoking & CEPH_CAP_FILE_BUFFER)) 2085 + queue_writeback = true; 2071 2086 } 2072 2087 2073 2088 /* want more caps from mds? */ ··· 2154 2135 spin_unlock(&ci->i_ceph_lock); 2155 2136 2156 2137 ceph_put_mds_session(session); 2138 + if (queue_writeback) 2139 + ceph_queue_writeback(inode); 2157 2140 if (queue_invalidate) 2158 2141 ceph_queue_invalidate(inode); 2159 2142 } ··· 2239 2218 } 2240 2219 2241 2220 /* 2242 - * wait for any unsafe requests to complete. 2221 + * flush the mdlog and wait for any unsafe requests to complete. 2243 2222 */ 2244 - static int unsafe_request_wait(struct inode *inode) 2223 + static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) 2245 2224 { 2246 2225 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 2247 2226 struct ceph_inode_info *ci = ceph_inode(inode); ··· 2357 2336 kfree(sessions); 2358 2337 } 2359 2338 2360 - dout("unsafe_request_wait %p wait on tid %llu %llu\n", 2339 + dout("%s %p wait on tid %llu %llu\n", __func__, 2361 2340 inode, req1 ? req1->r_tid : 0ULL, req2 ? 
req2->r_tid : 0ULL); 2362 2341 if (req1) { 2363 2342 ret = !wait_for_completion_timeout(&req1->r_safe_completion, ··· 2401 2380 dirty = try_flush_caps(inode, &flush_tid); 2402 2381 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 2403 2382 2404 - err = unsafe_request_wait(inode); 2383 + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); 2405 2384 2406 2385 /* 2407 2386 * only wait on non-file metadata writeback (the mds ··· 3203 3182 struct ceph_snap_context *snapc) 3204 3183 { 3205 3184 struct inode *inode = &ci->vfs_inode; 3206 - struct ceph_cap_snap *capsnap = NULL; 3185 + struct ceph_cap_snap *capsnap = NULL, *iter; 3207 3186 int put = 0; 3208 3187 bool last = false; 3209 - bool found = false; 3210 3188 bool flush_snaps = false; 3211 3189 bool complete_capsnap = false; 3212 3190 ··· 3232 3212 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 3233 3213 last ? " LAST" : ""); 3234 3214 } else { 3235 - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 3236 - if (capsnap->context == snapc) { 3237 - found = true; 3215 + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { 3216 + if (iter->context == snapc) { 3217 + capsnap = iter; 3238 3218 break; 3239 3219 } 3240 3220 } 3241 3221 3242 - if (!found) { 3222 + if (!capsnap) { 3243 3223 /* 3244 3224 * The capsnap should already be removed when removing 3245 3225 * auth cap in the case of a forced unmount. 
··· 3789 3769 struct ceph_inode_info *ci = ceph_inode(inode); 3790 3770 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 3791 3771 u64 follows = le64_to_cpu(m->snap_follows); 3792 - struct ceph_cap_snap *capsnap; 3793 - bool flushed = false; 3772 + struct ceph_cap_snap *capsnap = NULL, *iter; 3794 3773 bool wake_ci = false; 3795 3774 bool wake_mdsc = false; 3796 3775 ··· 3797 3778 inode, ci, session->s_mds, follows); 3798 3779 3799 3780 spin_lock(&ci->i_ceph_lock); 3800 - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 3801 - if (capsnap->follows == follows) { 3802 - if (capsnap->cap_flush.tid != flush_tid) { 3781 + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { 3782 + if (iter->follows == follows) { 3783 + if (iter->cap_flush.tid != flush_tid) { 3803 3784 dout(" cap_snap %p follows %lld tid %lld !=" 3804 - " %lld\n", capsnap, follows, 3805 - flush_tid, capsnap->cap_flush.tid); 3785 + " %lld\n", iter, follows, 3786 + flush_tid, iter->cap_flush.tid); 3806 3787 break; 3807 3788 } 3808 - flushed = true; 3789 + capsnap = iter; 3809 3790 break; 3810 3791 } else { 3811 3792 dout(" skipping cap_snap %p follows %lld\n", 3812 - capsnap, capsnap->follows); 3793 + iter, iter->follows); 3813 3794 } 3814 3795 } 3815 - if (flushed) 3796 + if (capsnap) 3816 3797 ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); 3817 3798 spin_unlock(&ci->i_ceph_lock); 3818 3799 3819 - if (flushed) { 3800 + if (capsnap) { 3820 3801 ceph_put_snap_context(capsnap->context); 3821 3802 ceph_put_cap_snap(capsnap); 3822 3803 if (wake_ci)
+30 -5
fs/ceph/inode.c
··· 578 578 579 579 __ceph_remove_caps(ci); 580 580 581 - if (__ceph_has_any_quota(ci)) 581 + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) 582 582 ceph_adjust_quota_realms_count(inode, false); 583 583 584 584 /* ··· 1466 1466 } else if (have_lease) { 1467 1467 if (d_unhashed(dn)) 1468 1468 d_add(dn, NULL); 1469 + } 1470 + 1471 + if (!d_unhashed(dn) && have_lease) 1469 1472 update_dentry_lease(dir, dn, 1470 1473 rinfo->dlease, session, 1471 1474 req->r_request_started); 1472 - } 1473 1475 goto done; 1474 1476 } 1475 1477 ··· 1886 1884 orig_gen = ci->i_rdcache_gen; 1887 1885 spin_unlock(&ci->i_ceph_lock); 1888 1886 1889 - ceph_fscache_invalidate(inode, false); 1890 1887 if (invalidate_inode_pages2(inode->i_mapping) < 0) { 1891 1888 pr_err("invalidate_inode_pages2 %llx.%llx failed\n", 1892 1889 ceph_vinop(inode)); ··· 2259 2258 return err; 2260 2259 } 2261 2260 2261 + int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) 2262 + { 2263 + int issued = ceph_caps_issued(ceph_inode(inode)); 2264 + 2265 + /* 2266 + * If any 'x' caps is issued we can just choose the auth MDS 2267 + * instead of the random replica MDSes. Because only when the 2268 + * Locker is in LOCK_EXEC state will the loner client could 2269 + * get the 'x' caps. And if we send the getattr requests to 2270 + * any replica MDS it must auth pin and tries to rdlock from 2271 + * the auth MDS, and then the auth MDS need to do the Locker 2272 + * state transition to LOCK_SYNC. And after that the lock state 2273 + * will change back. 2274 + * 2275 + * This cost much when doing the Locker state transition and 2276 + * usually will need to revoke caps from clients. 2277 + */ 2278 + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) 2279 + || (mask & CEPH_STAT_RSTAT)) 2280 + return USE_AUTH_MDS; 2281 + else 2282 + return USE_ANY_MDS; 2283 + } 2284 + 2262 2285 /* 2263 2286 * Verify that we have a lease on the given mask. If not, 2264 2287 * do a getattr against an mds. 
··· 2306 2281 if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) 2307 2282 return 0; 2308 2283 2309 - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; 2284 + mode = ceph_try_to_choose_auth_mds(inode, mask); 2310 2285 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); 2311 2286 if (IS_ERR(req)) 2312 2287 return PTR_ERR(req); ··· 2448 2423 return -ESTALE; 2449 2424 2450 2425 /* Skip the getattr altogether if we're asked not to sync */ 2451 - if (!(flags & AT_STATX_DONT_SYNC)) { 2426 + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { 2452 2427 err = ceph_do_getattr(inode, 2453 2428 statx_to_caps(request_mask, inode->i_mode), 2454 2429 flags & AT_STATX_FORCE_SYNC);
+95 -26
fs/ceph/mds_client.c
··· 437 437 ceph_decode_32_safe(p, end, sets, bad); 438 438 dout("got %u sets of delegated inodes\n", sets); 439 439 while (sets--) { 440 - u64 start, len, ino; 440 + u64 start, len; 441 441 442 442 ceph_decode_64_safe(p, end, start, bad); 443 443 ceph_decode_64_safe(p, end, len, bad); ··· 449 449 continue; 450 450 } 451 451 while (len--) { 452 - int err = xa_insert(&s->s_delegated_inos, ino = start++, 452 + int err = xa_insert(&s->s_delegated_inos, start++, 453 453 DELEGATED_INO_AVAILABLE, 454 454 GFP_KERNEL); 455 455 if (!err) { ··· 2651 2651 struct ceph_mds_client *mdsc = session->s_mdsc; 2652 2652 struct ceph_mds_request_head_old *rhead; 2653 2653 struct ceph_msg *msg; 2654 - int flags = 0; 2654 + int flags = 0, max_retry; 2655 + 2656 + /* 2657 + * The type of 'r_attempts' in kernel 'ceph_mds_request' 2658 + * is 'int', while in 'ceph_mds_request_head' the type of 2659 + * 'num_retry' is '__u8'. So in case the request retries 2660 + * exceeding 256 times, the MDS will receive an incorrect 2661 + * retry seq. 2662 + * 2663 + * In this case it's usually a bug in MDS and continue 2664 + * retrying the request makes no sense. 2665 + * 2666 + * In future this could be fixed in ceph code, so avoid 2667 + * using the hardcode here. 
2668 + */ 2669 + max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); 2670 + max_retry = 1 << (max_retry * BITS_PER_BYTE); 2671 + if (req->r_attempts >= max_retry) { 2672 + pr_warn_ratelimited("%s request tid %llu seq overflow\n", 2673 + __func__, req->r_tid); 2674 + return -EMULTIHOP; 2675 + } 2655 2676 2656 2677 req->r_attempts++; 2657 2678 if (req->r_inode) { ··· 2684 2663 else 2685 2664 req->r_sent_on_mseq = -1; 2686 2665 } 2687 - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 2666 + dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, 2688 2667 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 2689 2668 2690 2669 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { ··· 3286 3265 int err = -EINVAL; 3287 3266 void *p = msg->front.iov_base; 3288 3267 void *end = p + msg->front.iov_len; 3268 + bool aborted = false; 3289 3269 3290 3270 ceph_decode_need(&p, end, 2*sizeof(u32), bad); 3291 3271 next_mds = ceph_decode_32(&p); ··· 3295 3273 mutex_lock(&mdsc->mutex); 3296 3274 req = lookup_get_request(mdsc, tid); 3297 3275 if (!req) { 3276 + mutex_unlock(&mdsc->mutex); 3298 3277 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 3299 - goto out; /* dup reply? */ 3278 + return; /* dup reply? */ 3300 3279 } 3301 3280 3302 3281 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { 3303 3282 dout("forward tid %llu aborted, unregistering\n", tid); 3304 3283 __unregister_request(mdsc, req); 3305 3284 } else if (fwd_seq <= req->r_num_fwd) { 3306 - dout("forward tid %llu to mds%d - old seq %d <= %d\n", 3307 - tid, next_mds, req->r_num_fwd, fwd_seq); 3285 + /* 3286 + * The type of 'num_fwd' in ceph 'MClientRequestForward' 3287 + * is 'int32_t', while in 'ceph_mds_request_head' the 3288 + * type is '__u8'. So in case the request bounces between 3289 + * MDSes exceeding 256 times, the client will get stuck. 3290 + * 3291 + * In this case it's usually a bug in MDS and continue 3292 + * bouncing the request makes no sense. 
3293 + * 3294 + * In future this could be fixed in ceph code, so avoid 3295 + * using the hardcode here. 3296 + */ 3297 + int max = sizeof_field(struct ceph_mds_request_head, num_fwd); 3298 + max = 1 << (max * BITS_PER_BYTE); 3299 + if (req->r_num_fwd >= max) { 3300 + mutex_lock(&req->r_fill_mutex); 3301 + req->r_err = -EMULTIHOP; 3302 + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); 3303 + mutex_unlock(&req->r_fill_mutex); 3304 + aborted = true; 3305 + pr_warn_ratelimited("forward tid %llu seq overflow\n", 3306 + tid); 3307 + } else { 3308 + dout("forward tid %llu to mds%d - old seq %d <= %d\n", 3309 + tid, next_mds, req->r_num_fwd, fwd_seq); 3310 + } 3308 3311 } else { 3309 3312 /* resend. forward race not possible; mds would drop */ 3310 3313 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); ··· 3341 3294 put_request_session(req); 3342 3295 __do_request(mdsc, req); 3343 3296 } 3344 - ceph_mdsc_put_request(req); 3345 - out: 3346 3297 mutex_unlock(&mdsc->mutex); 3298 + 3299 + /* kick calling process */ 3300 + if (aborted) 3301 + complete_request(mdsc, req); 3302 + ceph_mdsc_put_request(req); 3347 3303 return; 3348 3304 3349 3305 bad: ··· 3425 3375 } 3426 3376 3427 3377 if (msg_version >= 5) { 3428 - u32 flags; 3429 - /* version >= 4, struct_v, struct_cv, len, metric_spec */ 3430 - ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); 3378 + u32 flags, len; 3379 + 3380 + /* version >= 4 */ 3381 + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ 3382 + ceph_decode_32_safe(&p, end, len, bad); /* len */ 3383 + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ 3384 + 3431 3385 /* version >= 5, flags */ 3432 - ceph_decode_32_safe(&p, end, flags, bad); 3386 + ceph_decode_32_safe(&p, end, flags, bad); 3433 3387 if (flags & CEPH_SESSION_BLOCKLISTED) { 3434 - pr_warn("mds%d session blocklisted\n", session->s_mds); 3388 + pr_warn("mds%d session blocklisted\n", session->s_mds); 3435 3389 blocklisted = true; 3436 3390 } 3437 3391 } ··· 4450 
4396 memcpy((void *)(lease + 1) + 4, 4451 4397 dentry->d_name.name, dentry->d_name.len); 4452 4398 spin_unlock(&dentry->d_lock); 4453 - /* 4454 - * if this is a preemptive lease RELEASE, no need to 4455 - * flush request stream, since the actual request will 4456 - * soon follow. 4457 - */ 4458 - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); 4459 4399 4460 4400 ceph_con_send(&session->s_con, msg); 4461 4401 } ··· 4744 4696 } 4745 4697 4746 4698 /* 4747 - * wait for all write mds requests to flush. 4699 + * flush the mdlog and wait for all write mds requests to flush. 4748 4700 */ 4749 - static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) 4701 + static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, 4702 + u64 want_tid) 4750 4703 { 4751 4704 struct ceph_mds_request *req = NULL, *nextreq; 4705 + struct ceph_mds_session *last_session = NULL; 4752 4706 struct rb_node *n; 4753 4707 4754 4708 mutex_lock(&mdsc->mutex); 4755 - dout("wait_unsafe_requests want %lld\n", want_tid); 4709 + dout("%s want %lld\n", __func__, want_tid); 4756 4710 restart: 4757 4711 req = __get_oldest_req(mdsc); 4758 4712 while (req && req->r_tid <= want_tid) { ··· 4766 4716 nextreq = NULL; 4767 4717 if (req->r_op != CEPH_MDS_OP_SETFILELOCK && 4768 4718 (req->r_op & CEPH_MDS_OP_WRITE)) { 4719 + struct ceph_mds_session *s = req->r_session; 4720 + 4721 + if (!s) { 4722 + req = nextreq; 4723 + continue; 4724 + } 4725 + 4769 4726 /* write op */ 4770 4727 ceph_mdsc_get_request(req); 4771 4728 if (nextreq) 4772 4729 ceph_mdsc_get_request(nextreq); 4730 + s = ceph_get_mds_session(s); 4773 4731 mutex_unlock(&mdsc->mutex); 4774 - dout("wait_unsafe_requests wait on %llu (want %llu)\n", 4732 + 4733 + /* send flush mdlog request to MDS */ 4734 + if (last_session != s) { 4735 + send_flush_mdlog(s); 4736 + ceph_put_mds_session(last_session); 4737 + last_session = s; 4738 + } else { 4739 + ceph_put_mds_session(s); 4740 + } 4741 + dout("%s wait on 
%llu (want %llu)\n", __func__, 4775 4742 req->r_tid, want_tid); 4776 4743 wait_for_completion(&req->r_safe_completion); 4744 + 4777 4745 mutex_lock(&mdsc->mutex); 4778 4746 ceph_mdsc_put_request(req); 4779 4747 if (!nextreq) ··· 4806 4738 req = nextreq; 4807 4739 } 4808 4740 mutex_unlock(&mdsc->mutex); 4809 - dout("wait_unsafe_requests done\n"); 4741 + ceph_put_mds_session(last_session); 4742 + dout("%s done\n", __func__); 4810 4743 } 4811 4744 4812 4745 void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ··· 4836 4767 dout("sync want tid %lld flush_seq %lld\n", 4837 4768 want_tid, want_flush); 4838 4769 4839 - wait_unsafe_requests(mdsc, want_tid); 4770 + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); 4840 4771 wait_caps_flush(mdsc, want_flush); 4841 4772 } 4842 4773
+1 -1
fs/ceph/mds_client.h
··· 579 579 struct ceph_inode_info *ci = ceph_inode(inode); 580 580 581 581 return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, 582 - TASK_INTERRUPTIBLE); 582 + TASK_KILLABLE); 583 583 } 584 584 585 585 extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+11 -8
fs/ceph/quota.c
··· 195 195 196 196 /* 197 197 * This function walks through the snaprealm for an inode and returns the 198 - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files 199 - * or max_bytes). If the root is reached, return the root ceph_snap_realm 200 - * instead. 198 + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, 199 + * max_bytes, or any, depending on the 'which_quota' argument). If the root is 200 + * reached, return the root ceph_snap_realm instead. 201 201 * 202 202 * Note that the caller is responsible for calling ceph_put_snap_realm() on the 203 203 * returned realm. ··· 209 209 * will be restarted. 210 210 */ 211 211 static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, 212 - struct inode *inode, bool retry) 212 + struct inode *inode, 213 + enum quota_get_realm which_quota, 214 + bool retry) 213 215 { 214 216 struct ceph_inode_info *ci = NULL; 215 217 struct ceph_snap_realm *realm, *next; ··· 250 248 } 251 249 252 250 ci = ceph_inode(in); 253 - has_quota = __ceph_has_any_quota(ci); 251 + has_quota = __ceph_has_quota(ci, which_quota); 254 252 iput(in); 255 253 256 254 next = realm->parent; ··· 281 279 * dropped and we can then restart the whole operation. 282 280 */ 283 281 down_read(&mdsc->snap_rwsem); 284 - old_realm = get_quota_realm(mdsc, old, true); 285 - new_realm = get_quota_realm(mdsc, new, false); 282 + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); 283 + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); 286 284 if (PTR_ERR(new_realm) == -EAGAIN) { 287 285 up_read(&mdsc->snap_rwsem); 288 286 if (old_realm) ··· 485 483 bool is_updated = false; 486 484 487 485 down_read(&mdsc->snap_rwsem); 488 - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); 486 + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), 487 + QUOTA_GET_MAX_BYTES, true); 489 488 up_read(&mdsc->snap_rwsem); 490 489 if (!realm) 491 490 return false;
+1
fs/ceph/super.c
··· 1119 1119 s->s_time_gran = 1; 1120 1120 s->s_time_min = 0; 1121 1121 s->s_time_max = U32_MAX; 1122 + s->s_flags |= SB_NODIRATIME | SB_NOATIME; 1122 1123 1123 1124 ret = set_anon_super_fc(s, fc); 1124 1125 if (ret != 0)
+25 -4
fs/ceph/super.h
··· 1022 1022 ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); 1023 1023 } 1024 1024 1025 + extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); 1025 1026 extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 1026 1027 int mask, bool force); 1027 1028 static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) ··· 1279 1278 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); 1280 1279 1281 1280 /* quota.c */ 1282 - static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) 1281 + 1282 + enum quota_get_realm { 1283 + QUOTA_GET_MAX_FILES, 1284 + QUOTA_GET_MAX_BYTES, 1285 + QUOTA_GET_ANY 1286 + }; 1287 + 1288 + static inline bool __ceph_has_quota(struct ceph_inode_info *ci, 1289 + enum quota_get_realm which) 1283 1290 { 1284 - return ci->i_max_files || ci->i_max_bytes; 1291 + bool has_quota = false; 1292 + 1293 + switch (which) { 1294 + case QUOTA_GET_MAX_BYTES: 1295 + has_quota = !!ci->i_max_bytes; 1296 + break; 1297 + case QUOTA_GET_MAX_FILES: 1298 + has_quota = !!ci->i_max_files; 1299 + break; 1300 + default: 1301 + has_quota = !!(ci->i_max_files || ci->i_max_bytes); 1302 + } 1303 + return has_quota; 1285 1304 } 1286 1305 1287 1306 extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); ··· 1310 1289 u64 max_bytes, u64 max_files) 1311 1290 { 1312 1291 bool had_quota, has_quota; 1313 - had_quota = __ceph_has_any_quota(ci); 1292 + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); 1314 1293 ci->i_max_bytes = max_bytes; 1315 1294 ci->i_max_files = max_files; 1316 - has_quota = __ceph_has_any_quota(ci); 1295 + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); 1317 1296 1318 1297 if (had_quota != has_quota) 1319 1298 ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+9 -1
fs/ceph/xattr.c
··· 366 366 } 367 367 #define XATTR_RSTAT_FIELD(_type, _name) \ 368 368 XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) 369 + #define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ 370 + { \ 371 + .name = CEPH_XATTR_NAME(_type, _name), \ 372 + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ 373 + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 374 + .exists_cb = NULL, \ 375 + .flags = VXATTR_FLAG_RSTAT, \ 376 + } 369 377 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ 370 378 { \ 371 379 .name = CEPH_XATTR_NAME2(_type, _name, _field), \ ··· 412 404 XATTR_RSTAT_FIELD(dir, rsubdirs), 413 405 XATTR_RSTAT_FIELD(dir, rsnaps), 414 406 XATTR_RSTAT_FIELD(dir, rbytes), 415 - XATTR_RSTAT_FIELD(dir, rctime), 407 + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), 416 408 { 417 409 .name = "ceph.dir.pin", 418 410 .name_size = sizeof("ceph.dir.pin"),
+1 -4
net/ceph/crush/mapper.c
··· 906 906 int recurse_to_leaf; 907 907 int wsize = 0; 908 908 int osize; 909 - int *tmp; 910 909 const struct crush_rule *rule; 911 910 __u32 step; 912 911 int i, j; ··· 1072 1073 memcpy(o, c, osize*sizeof(*o)); 1073 1074 1074 1075 /* swap o and w arrays */ 1075 - tmp = o; 1076 - o = w; 1077 - w = tmp; 1076 + swap(o, w); 1078 1077 wsize = osize; 1079 1078 break; 1080 1079