Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-4.18-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"The main piece is a set of libceph changes that revamps how OSD
requests are aborted, improving CephFS ENOSPC handling and making
"umount -f" actually work (Zheng and myself).

The rest is mostly mount option handling cleanups from Chengguang and
assorted fixes from Zheng, Luis and Dongsheng."

* tag 'ceph-for-4.18-rc1' of git://github.com/ceph/ceph-client: (31 commits)
rbd: flush rbd_dev->watch_dwork after watch is unregistered
ceph: update description of some mount options
ceph: show ino32 if the value is different with default
ceph: strengthen rsize/wsize/readdir_max_bytes validation
ceph: fix alignment of rasize
ceph: fix use-after-free in ceph_statfs()
ceph: prevent i_version from going back
ceph: fix wrong check for the case of updating link count
libceph: allocate the locator string with GFP_NOFAIL
libceph: make abort_on_full a per-osdc setting
libceph: don't abort reads in ceph_osdc_abort_on_full()
libceph: avoid a use-after-free during map check
libceph: don't warn if req->r_abort_on_full is set
libceph: use for_each_request() in ceph_osdc_abort_on_full()
libceph: defer __complete_request() to a workqueue
libceph: move more code into __complete_request()
libceph: no need to call flush_workqueue() before destruction
ceph: flush pending works before shutdown super
ceph: abort osd requests on force umount
libceph: introduce ceph_osdc_abort_requests()
...

+372 -256
+3 -5
Documentation/filesystems/ceph.txt
··· 105 105 address its connection to the monitor originates from. 106 106 107 107 wsize=X 108 - Specify the maximum write size in bytes. By default there is no 109 - maximum. Ceph will normally size writes based on the file stripe 110 - size. 108 + Specify the maximum write size in bytes. Default: 16 MB. 111 109 112 110 rsize=X 113 - Specify the maximum read size in bytes. Default: 64 MB. 111 + Specify the maximum read size in bytes. Default: 16 MB. 114 112 115 113 rasize=X 116 - Specify the maximum readahead. Default: 8 MB. 114 + Specify the maximum readahead size in bytes. Default: 8 MB. 117 115 118 116 mount_timeout=X 119 117 Specify the timeout value for mount (in seconds), in the case
+7 -4
drivers/block/rbd.c
··· 2339 2339 static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) 2340 2340 { 2341 2341 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; 2342 + int ret; 2342 2343 2343 2344 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 2344 2345 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); ··· 2354 2353 if (!obj_req->osd_req) 2355 2354 return -ENOMEM; 2356 2355 2356 + ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", 2357 + "copyup"); 2358 + if (ret) 2359 + return ret; 2360 + 2357 2361 /* 2358 2362 * Only send non-zero copyup data to save some I/O and network 2359 2363 * bandwidth -- zero copyup data is equivalent to the object not ··· 2368 2362 dout("%s obj_req %p detected zeroes\n", __func__, obj_req); 2369 2363 bytes = 0; 2370 2364 } 2371 - 2372 - osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", 2373 - "copyup"); 2374 2365 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, 2375 2366 obj_req->copyup_bvecs, 2376 2367 obj_req->copyup_bvec_count, ··· 3400 3397 { 3401 3398 dout("%s rbd_dev %p\n", __func__, rbd_dev); 3402 3399 3403 - cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3404 3400 cancel_work_sync(&rbd_dev->acquired_lock_work); 3405 3401 cancel_work_sync(&rbd_dev->released_lock_work); 3406 3402 cancel_delayed_work_sync(&rbd_dev->lock_dwork); ··· 3417 3415 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 3418 3416 mutex_unlock(&rbd_dev->watch_mutex); 3419 3417 3418 + cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3420 3419 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3421 3420 } 3422 3421
-1
fs/ceph/addr.c
··· 1936 1936 err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); 1937 1937 1938 1938 wr_req->r_mtime = ci->vfs_inode.i_mtime; 1939 - wr_req->r_abort_on_full = true; 1940 1939 err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); 1941 1940 1942 1941 if (!err)
+99 -61
fs/ceph/caps.c
··· 69 69 *s++ = 'w'; 70 70 if (c & CEPH_CAP_GBUFFER) 71 71 *s++ = 'b'; 72 + if (c & CEPH_CAP_GWREXTEND) 73 + *s++ = 'a'; 72 74 if (c & CEPH_CAP_GLAZYIO) 73 75 *s++ = 'l'; 74 76 return s; ··· 3024 3022 dput(prev); 3025 3023 } 3026 3024 3025 + struct cap_extra_info { 3026 + struct ceph_string *pool_ns; 3027 + /* inline data */ 3028 + u64 inline_version; 3029 + void *inline_data; 3030 + u32 inline_len; 3031 + /* dirstat */ 3032 + bool dirstat_valid; 3033 + u64 nfiles; 3034 + u64 nsubdirs; 3035 + /* currently issued */ 3036 + int issued; 3037 + }; 3038 + 3027 3039 /* 3028 3040 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 3029 3041 * actually be a revocation if it specifies a smaller cap set.) 3030 3042 * 3031 3043 * caller holds s_mutex and i_ceph_lock, we drop both. 3032 3044 */ 3033 - static void handle_cap_grant(struct ceph_mds_client *mdsc, 3034 - struct inode *inode, struct ceph_mds_caps *grant, 3035 - struct ceph_string **pns, u64 inline_version, 3036 - void *inline_data, u32 inline_len, 3037 - struct ceph_buffer *xattr_buf, 3045 + static void handle_cap_grant(struct inode *inode, 3038 3046 struct ceph_mds_session *session, 3039 - struct ceph_cap *cap, int issued) 3047 + struct ceph_cap *cap, 3048 + struct ceph_mds_caps *grant, 3049 + struct ceph_buffer *xattr_buf, 3050 + struct cap_extra_info *extra_info) 3040 3051 __releases(ci->i_ceph_lock) 3041 - __releases(mdsc->snap_rwsem) 3052 + __releases(session->s_mdsc->snap_rwsem) 3042 3053 { 3043 3054 struct ceph_inode_info *ci = ceph_inode(inode); 3044 - int mds = session->s_mds; 3045 3055 int seq = le32_to_cpu(grant->seq); 3046 3056 int newcaps = le32_to_cpu(grant->caps); 3047 3057 int used, wanted, dirty; 3048 3058 u64 size = le64_to_cpu(grant->size); 3049 3059 u64 max_size = le64_to_cpu(grant->max_size); 3050 - struct timespec mtime, atime, ctime; 3051 3060 int check_caps = 0; 3052 3061 bool wake = false; 3053 3062 bool writeback = false; ··· 3068 3055 bool fill_inline = false; 3069 3056 3070 
3057 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 3071 - inode, cap, mds, seq, ceph_cap_string(newcaps)); 3058 + inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); 3072 3059 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 3073 3060 inode->i_size); 3074 3061 ··· 3114 3101 __check_cap_issue(ci, cap, newcaps); 3115 3102 3116 3103 if ((newcaps & CEPH_CAP_AUTH_SHARED) && 3117 - (issued & CEPH_CAP_AUTH_EXCL) == 0) { 3104 + (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { 3118 3105 inode->i_mode = le32_to_cpu(grant->mode); 3119 3106 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); 3120 3107 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ··· 3123 3110 from_kgid(&init_user_ns, inode->i_gid)); 3124 3111 } 3125 3112 3126 - if ((newcaps & CEPH_CAP_AUTH_SHARED) && 3127 - (issued & CEPH_CAP_LINK_EXCL) == 0) { 3113 + if ((newcaps & CEPH_CAP_LINK_SHARED) && 3114 + (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { 3128 3115 set_nlink(inode, le32_to_cpu(grant->nlink)); 3129 3116 if (inode->i_nlink == 0 && 3130 3117 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) 3131 3118 deleted_inode = true; 3132 3119 } 3133 3120 3134 - if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 3121 + if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && 3122 + grant->xattr_len) { 3135 3123 int len = le32_to_cpu(grant->xattr_len); 3136 3124 u64 version = le64_to_cpu(grant->xattr_version); 3137 3125 ··· 3148 3134 } 3149 3135 3150 3136 if (newcaps & CEPH_CAP_ANY_RD) { 3137 + struct timespec mtime, atime, ctime; 3151 3138 /* ctime/mtime/atime? 
*/ 3152 3139 ceph_decode_timespec(&mtime, &grant->mtime); 3153 3140 ceph_decode_timespec(&atime, &grant->atime); 3154 3141 ceph_decode_timespec(&ctime, &grant->ctime); 3155 - ceph_fill_file_time(inode, issued, 3142 + ceph_fill_file_time(inode, extra_info->issued, 3156 3143 le32_to_cpu(grant->time_warp_seq), 3157 3144 &ctime, &mtime, &atime); 3145 + } 3146 + 3147 + if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { 3148 + ci->i_files = extra_info->nfiles; 3149 + ci->i_subdirs = extra_info->nsubdirs; 3158 3150 } 3159 3151 3160 3152 if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { ··· 3171 3151 ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); 3172 3152 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, 3173 3153 lockdep_is_held(&ci->i_ceph_lock)); 3174 - rcu_assign_pointer(ci->i_layout.pool_ns, *pns); 3154 + rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); 3175 3155 3176 - if (ci->i_layout.pool_id != old_pool || *pns != old_ns) 3156 + if (ci->i_layout.pool_id != old_pool || 3157 + extra_info->pool_ns != old_ns) 3177 3158 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; 3178 3159 3179 - *pns = old_ns; 3160 + extra_info->pool_ns = old_ns; 3180 3161 3181 3162 /* size/truncate_seq? 
*/ 3182 - queue_trunc = ceph_fill_file_size(inode, issued, 3163 + queue_trunc = ceph_fill_file_size(inode, extra_info->issued, 3183 3164 le32_to_cpu(grant->truncate_seq), 3184 3165 le64_to_cpu(grant->truncate_size), 3185 3166 size); ··· 3259 3238 } 3260 3239 BUG_ON(cap->issued & ~cap->implemented); 3261 3240 3262 - if (inline_version > 0 && inline_version >= ci->i_inline_version) { 3263 - ci->i_inline_version = inline_version; 3241 + if (extra_info->inline_version > 0 && 3242 + extra_info->inline_version >= ci->i_inline_version) { 3243 + ci->i_inline_version = extra_info->inline_version; 3264 3244 if (ci->i_inline_version != CEPH_INLINE_NONE && 3265 3245 (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) 3266 3246 fill_inline = true; 3267 3247 } 3268 3248 3269 3249 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3270 - if (newcaps & ~issued) 3250 + if (newcaps & ~extra_info->issued) 3271 3251 wake = true; 3272 - kick_flushing_inode_caps(mdsc, session, inode); 3273 - up_read(&mdsc->snap_rwsem); 3252 + kick_flushing_inode_caps(session->s_mdsc, session, inode); 3253 + up_read(&session->s_mdsc->snap_rwsem); 3274 3254 } else { 3275 3255 spin_unlock(&ci->i_ceph_lock); 3276 3256 } 3277 3257 3278 3258 if (fill_inline) 3279 - ceph_fill_inline_data(inode, NULL, inline_data, inline_len); 3259 + ceph_fill_inline_data(inode, NULL, extra_info->inline_data, 3260 + extra_info->inline_len); 3280 3261 3281 3262 if (queue_trunc) 3282 3263 ceph_queue_vmtruncate(inode); ··· 3743 3720 struct ceph_msg *msg) 3744 3721 { 3745 3722 struct ceph_mds_client *mdsc = session->s_mdsc; 3746 - struct super_block *sb = mdsc->fsc->sb; 3747 3723 struct inode *inode; 3748 3724 struct ceph_inode_info *ci; 3749 3725 struct ceph_cap *cap; 3750 3726 struct ceph_mds_caps *h; 3751 3727 struct ceph_mds_cap_peer *peer = NULL; 3752 3728 struct ceph_snap_realm *realm = NULL; 3753 - struct ceph_string *pool_ns = NULL; 3754 - int mds = session->s_mds; 3755 - int op, issued; 3729 + int op; 3730 + int 
msg_version = le16_to_cpu(msg->hdr.version); 3756 3731 u32 seq, mseq; 3757 3732 struct ceph_vino vino; 3758 - u64 tid; 3759 - u64 inline_version = 0; 3760 - void *inline_data = NULL; 3761 - u32 inline_len = 0; 3762 3733 void *snaptrace; 3763 3734 size_t snaptrace_len; 3764 3735 void *p, *end; 3736 + struct cap_extra_info extra_info = {}; 3765 3737 3766 - dout("handle_caps from mds%d\n", mds); 3738 + dout("handle_caps from mds%d\n", session->s_mds); 3767 3739 3768 3740 /* decode */ 3769 3741 end = msg->front.iov_base + msg->front.iov_len; 3770 - tid = le64_to_cpu(msg->hdr.tid); 3771 3742 if (msg->front.iov_len < sizeof(*h)) 3772 3743 goto bad; 3773 3744 h = msg->front.iov_base; ··· 3775 3758 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3776 3759 p = snaptrace + snaptrace_len; 3777 3760 3778 - if (le16_to_cpu(msg->hdr.version) >= 2) { 3761 + if (msg_version >= 2) { 3779 3762 u32 flock_len; 3780 3763 ceph_decode_32_safe(&p, end, flock_len, bad); 3781 3764 if (p + flock_len > end) ··· 3783 3766 p += flock_len; 3784 3767 } 3785 3768 3786 - if (le16_to_cpu(msg->hdr.version) >= 3) { 3769 + if (msg_version >= 3) { 3787 3770 if (op == CEPH_CAP_OP_IMPORT) { 3788 3771 if (p + sizeof(*peer) > end) 3789 3772 goto bad; ··· 3795 3778 } 3796 3779 } 3797 3780 3798 - if (le16_to_cpu(msg->hdr.version) >= 4) { 3799 - ceph_decode_64_safe(&p, end, inline_version, bad); 3800 - ceph_decode_32_safe(&p, end, inline_len, bad); 3801 - if (p + inline_len > end) 3781 + if (msg_version >= 4) { 3782 + ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); 3783 + ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); 3784 + if (p + extra_info.inline_len > end) 3802 3785 goto bad; 3803 - inline_data = p; 3804 - p += inline_len; 3786 + extra_info.inline_data = p; 3787 + p += extra_info.inline_len; 3805 3788 } 3806 3789 3807 - if (le16_to_cpu(msg->hdr.version) >= 5) { 3790 + if (msg_version >= 5) { 3808 3791 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; 3809 3792 u32 
epoch_barrier; 3810 3793 ··· 3812 3795 ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); 3813 3796 } 3814 3797 3815 - if (le16_to_cpu(msg->hdr.version) >= 8) { 3798 + if (msg_version >= 8) { 3816 3799 u64 flush_tid; 3817 3800 u32 caller_uid, caller_gid; 3818 3801 u32 pool_ns_len; ··· 3826 3809 ceph_decode_32_safe(&p, end, pool_ns_len, bad); 3827 3810 if (pool_ns_len > 0) { 3828 3811 ceph_decode_need(&p, end, pool_ns_len, bad); 3829 - pool_ns = ceph_find_or_create_string(p, pool_ns_len); 3812 + extra_info.pool_ns = 3813 + ceph_find_or_create_string(p, pool_ns_len); 3830 3814 p += pool_ns_len; 3831 3815 } 3832 3816 } 3833 3817 3818 + if (msg_version >= 11) { 3819 + struct ceph_timespec *btime; 3820 + u64 change_attr; 3821 + u32 flags; 3822 + 3823 + /* version >= 9 */ 3824 + if (p + sizeof(*btime) > end) 3825 + goto bad; 3826 + btime = p; 3827 + p += sizeof(*btime); 3828 + ceph_decode_64_safe(&p, end, change_attr, bad); 3829 + /* version >= 10 */ 3830 + ceph_decode_32_safe(&p, end, flags, bad); 3831 + /* version >= 11 */ 3832 + extra_info.dirstat_valid = true; 3833 + ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); 3834 + ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); 3835 + } 3836 + 3834 3837 /* lookup ino */ 3835 - inode = ceph_find_inode(sb, vino); 3838 + inode = ceph_find_inode(mdsc->fsc->sb, vino); 3836 3839 ci = ceph_inode(inode); 3837 3840 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 3838 3841 vino.snap, inode); ··· 3885 3848 /* these will work even if we don't have a cap yet */ 3886 3849 switch (op) { 3887 3850 case CEPH_CAP_OP_FLUSHSNAP_ACK: 3888 - handle_cap_flushsnap_ack(inode, tid, h, session); 3851 + handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), 3852 + h, session); 3889 3853 goto done; 3890 3854 3891 3855 case CEPH_CAP_OP_EXPORT: ··· 3905 3867 down_read(&mdsc->snap_rwsem); 3906 3868 } 3907 3869 handle_cap_import(mdsc, inode, h, peer, session, 3908 - &cap, &issued); 3909 - handle_cap_grant(mdsc, 
inode, h, &pool_ns, 3910 - inline_version, inline_data, inline_len, 3911 - msg->middle, session, cap, issued); 3870 + &cap, &extra_info.issued); 3871 + handle_cap_grant(inode, session, cap, 3872 + h, msg->middle, &extra_info); 3912 3873 if (realm) 3913 3874 ceph_put_snap_realm(mdsc, realm); 3914 3875 goto done_unlocked; ··· 3915 3878 3916 3879 /* the rest require a cap */ 3917 3880 spin_lock(&ci->i_ceph_lock); 3918 - cap = __get_cap_for_mds(ceph_inode(inode), mds); 3881 + cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); 3919 3882 if (!cap) { 3920 3883 dout(" no cap on %p ino %llx.%llx from mds%d\n", 3921 - inode, ceph_ino(inode), ceph_snap(inode), mds); 3884 + inode, ceph_ino(inode), ceph_snap(inode), 3885 + session->s_mds); 3922 3886 spin_unlock(&ci->i_ceph_lock); 3923 3887 goto flush_cap_releases; 3924 3888 } ··· 3928 3890 switch (op) { 3929 3891 case CEPH_CAP_OP_REVOKE: 3930 3892 case CEPH_CAP_OP_GRANT: 3931 - __ceph_caps_issued(ci, &issued); 3932 - issued |= __ceph_caps_dirty(ci); 3933 - handle_cap_grant(mdsc, inode, h, &pool_ns, 3934 - inline_version, inline_data, inline_len, 3935 - msg->middle, session, cap, issued); 3893 + __ceph_caps_issued(ci, &extra_info.issued); 3894 + extra_info.issued |= __ceph_caps_dirty(ci); 3895 + handle_cap_grant(inode, session, cap, 3896 + h, msg->middle, &extra_info); 3936 3897 goto done_unlocked; 3937 3898 3938 3899 case CEPH_CAP_OP_FLUSH_ACK: 3939 - handle_cap_flush_ack(inode, tid, h, session, cap); 3900 + handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), 3901 + h, session, cap); 3940 3902 break; 3941 3903 3942 3904 case CEPH_CAP_OP_TRUNC: ··· 3963 3925 mutex_unlock(&session->s_mutex); 3964 3926 done_unlocked: 3965 3927 iput(inode); 3966 - ceph_put_string(pool_ns); 3928 + ceph_put_string(extra_info.pool_ns); 3967 3929 return; 3968 3930 3969 3931 bad:
+2
fs/ceph/dir.c
··· 1486 1486 .release = ceph_release, 1487 1487 .unlocked_ioctl = ceph_ioctl, 1488 1488 .fsync = ceph_fsync, 1489 + .lock = ceph_lock, 1490 + .flock = ceph_flock, 1489 1491 }; 1490 1492 1491 1493 const struct file_operations ceph_snapdir_fops = {
-1
fs/ceph/file.c
··· 895 895 req->r_callback = ceph_aio_complete_req; 896 896 req->r_inode = inode; 897 897 req->r_priv = aio_req; 898 - req->r_abort_on_full = true; 899 898 900 899 ret = ceph_osdc_start_request(req->r_osdc, req, false); 901 900 out:
+43 -24
fs/ceph/inode.c
··· 739 739 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 740 740 struct ceph_mds_reply_inode *info = iinfo->in; 741 741 struct ceph_inode_info *ci = ceph_inode(inode); 742 - int issued = 0, implemented, new_issued; 742 + int issued, new_issued, info_caps; 743 743 struct timespec mtime, atime, ctime; 744 744 struct ceph_buffer *xattr_blob = NULL; 745 745 struct ceph_string *pool_ns = NULL; ··· 754 754 inode, ceph_vinop(inode), le64_to_cpu(info->version), 755 755 ci->i_version); 756 756 757 + info_caps = le32_to_cpu(info->cap.caps); 758 + 757 759 /* prealloc new cap struct */ 758 - if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) 760 + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) 759 761 new_cap = ceph_get_cap(mdsc, caps_reservation); 760 762 761 763 /* ··· 794 792 le64_to_cpu(info->version) > (ci->i_version & ~1))) 795 793 new_version = true; 796 794 797 - issued = __ceph_caps_issued(ci, &implemented); 798 - issued |= implemented | __ceph_caps_dirty(ci); 799 - new_issued = ~issued & le32_to_cpu(info->cap.caps); 795 + __ceph_caps_issued(ci, &issued); 796 + issued |= __ceph_caps_dirty(ci); 797 + new_issued = ~issued & info_caps; 800 798 801 799 /* update inode */ 802 800 inode->i_rdev = le32_to_cpu(info->rdev); ··· 828 826 &ctime, &mtime, &atime); 829 827 } 830 828 829 + if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) { 830 + ci->i_files = le64_to_cpu(info->files); 831 + ci->i_subdirs = le64_to_cpu(info->subdirs); 832 + } 833 + 831 834 if (new_version || 832 835 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { 833 836 s64 old_pool = ci->i_layout.pool_id; ··· 861 854 } 862 855 } 863 856 857 + /* layout and rstat are not tracked by capability, update them if 858 + * the inode info is from auth mds */ 859 + if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) { 860 + if (S_ISDIR(inode->i_mode)) { 861 + ci->i_dir_layout = iinfo->dir_layout; 862 + ci->i_rbytes = le64_to_cpu(info->rbytes); 863 + ci->i_rfiles = 
le64_to_cpu(info->rfiles); 864 + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); 865 + ceph_decode_timespec(&ci->i_rctime, &info->rctime); 866 + } 867 + } 868 + 864 869 /* xattrs */ 865 870 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ 866 871 if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && ··· 889 870 } 890 871 891 872 /* finally update i_version */ 892 - ci->i_version = le64_to_cpu(info->version); 873 + if (le64_to_cpu(info->version) > ci->i_version) 874 + ci->i_version = le64_to_cpu(info->version); 893 875 894 876 inode->i_mapping->a_ops = &ceph_aops; 895 877 ··· 938 918 case S_IFDIR: 939 919 inode->i_op = &ceph_dir_iops; 940 920 inode->i_fop = &ceph_dir_fops; 941 - 942 - ci->i_dir_layout = iinfo->dir_layout; 943 - 944 - ci->i_files = le64_to_cpu(info->files); 945 - ci->i_subdirs = le64_to_cpu(info->subdirs); 946 - ci->i_rbytes = le64_to_cpu(info->rbytes); 947 - ci->i_rfiles = le64_to_cpu(info->rfiles); 948 - ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); 949 - ceph_decode_timespec(&ci->i_rctime, &info->rctime); 950 921 break; 951 922 default: 952 923 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", ··· 945 934 } 946 935 947 936 /* were we issued a capability? */ 948 - if (info->cap.caps) { 937 + if (info_caps) { 949 938 if (ceph_snap(inode) == CEPH_NOSNAP) { 950 - unsigned caps = le32_to_cpu(info->cap.caps); 951 939 ceph_add_cap(inode, session, 952 940 le64_to_cpu(info->cap.cap_id), 953 - cap_fmode, caps, 941 + cap_fmode, info_caps, 954 942 le32_to_cpu(info->cap.wanted), 955 943 le32_to_cpu(info->cap.seq), 956 944 le32_to_cpu(info->cap.mseq), ··· 959 949 /* set dir completion flag? 
*/ 960 950 if (S_ISDIR(inode->i_mode) && 961 951 ci->i_files == 0 && ci->i_subdirs == 0 && 962 - (caps & CEPH_CAP_FILE_SHARED) && 952 + (info_caps & CEPH_CAP_FILE_SHARED) && 963 953 (issued & CEPH_CAP_FILE_EXCL) == 0 && 964 954 !__ceph_dir_is_complete(ci)) { 965 955 dout(" marking %p complete (empty)\n", inode); ··· 972 962 wake = true; 973 963 } else { 974 964 dout(" %p got snap_caps %s\n", inode, 975 - ceph_cap_string(le32_to_cpu(info->cap.caps))); 976 - ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 965 + ceph_cap_string(info_caps)); 966 + ci->i_snap_caps |= info_caps; 977 967 if (cap_fmode >= 0) 978 968 __ceph_get_fmode(ci, cap_fmode); 979 969 } ··· 988 978 int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 989 979 ci->i_inline_version = iinfo->inline_version; 990 980 if (ci->i_inline_version != CEPH_INLINE_NONE && 991 - (locked_page || 992 - (le32_to_cpu(info->cap.caps) & cache_caps))) 981 + (locked_page || (info_caps & cache_caps))) 993 982 fill_inline = true; 994 983 } 995 984 ··· 2187 2178 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 2188 2179 struct ceph_mds_client *mdsc = fsc->mdsc; 2189 2180 struct ceph_mds_request *req; 2181 + int mode; 2190 2182 int err; 2191 2183 2192 2184 if (ceph_snap(inode) == CEPH_SNAPDIR) { ··· 2200 2190 if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 2201 2191 return 0; 2202 2192 2203 - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 2193 + mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; 2194 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); 2204 2195 if (IS_ERR(req)) 2205 2196 return PTR_ERR(req); 2206 2197 req->r_inode = inode; ··· 2272 2261 stat->size = ci->i_files + ci->i_subdirs; 2273 2262 stat->blocks = 0; 2274 2263 stat->blksize = 65536; 2264 + /* 2265 + * Some applications rely on the number of st_nlink 2266 + * value on directories to be either 0 (if unlinked) 2267 + * or 2 + number of subdirectories. 
2268 + */ 2269 + if (stat->nlink == 1) 2270 + /* '.' + '..' + subdirs */ 2271 + stat->nlink = 1 + 1 + ci->i_subdirs; 2275 2272 } 2276 2273 } 2277 2274 return err;
+26 -9
fs/ceph/super.c
··· 45 45 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 46 46 { 47 47 struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); 48 - struct ceph_monmap *monmap = fsc->client->monc.monmap; 48 + struct ceph_mon_client *monc = &fsc->client->monc; 49 49 struct ceph_statfs st; 50 50 u64 fsid; 51 51 int err; ··· 58 58 } 59 59 60 60 dout("statfs\n"); 61 - err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); 61 + err = ceph_monc_do_statfs(monc, data_pool, &st); 62 62 if (err < 0) 63 63 return err; 64 64 ··· 94 94 buf->f_namelen = NAME_MAX; 95 95 96 96 /* Must convert the fsid, for consistent values across arches */ 97 - fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ 98 - le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); 97 + mutex_lock(&monc->mutex); 98 + fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^ 99 + le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1)); 100 + mutex_unlock(&monc->mutex); 101 + 99 102 buf->f_fsid.val[0] = fsid & 0xffffffff; 100 103 buf->f_fsid.val[1] = fsid >> 32; 101 104 ··· 259 256 break; 260 257 /* misc */ 261 258 case Opt_wsize: 262 - if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) 259 + if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) 263 260 return -EINVAL; 264 261 fsopt->wsize = ALIGN(intval, PAGE_SIZE); 265 262 break; 266 263 case Opt_rsize: 267 - if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) 264 + if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) 268 265 return -EINVAL; 269 266 fsopt->rsize = ALIGN(intval, PAGE_SIZE); 270 267 break; 271 268 case Opt_rasize: 272 269 if (intval < 0) 273 270 return -EINVAL; 274 - fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); 271 + fsopt->rasize = ALIGN(intval, PAGE_SIZE); 275 272 break; 276 273 case Opt_caps_wanted_delay_min: 277 274 if (intval < 1) ··· 289 286 fsopt->max_readdir = intval; 290 287 break; 291 288 case Opt_readdir_max_bytes: 292 - if (intval < PAGE_SIZE && intval != 0) 289 + if (intval < (int)PAGE_SIZE && 
intval != 0) 293 290 return -EINVAL; 294 291 fsopt->max_readdir_bytes = intval; 295 292 break; ··· 537 534 seq_puts(m, ",noasyncreaddir"); 538 535 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 539 536 seq_puts(m, ",nodcache"); 537 + if (fsopt->flags & CEPH_MOUNT_OPT_INO32) 538 + seq_puts(m, ",ino32"); 540 539 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { 541 540 seq_show_option(m, "fsc", fsopt->fscache_uniq); 542 541 } ··· 556 551 557 552 if (fsopt->mds_namespace) 558 553 seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 559 - if (fsopt->wsize) 554 + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) 560 555 seq_printf(m, ",wsize=%d", fsopt->wsize); 561 556 if (fsopt->rsize != CEPH_MAX_READ_SIZE) 562 557 seq_printf(m, ",rsize=%d", fsopt->rsize); ··· 621 616 err = PTR_ERR(fsc->client); 622 617 goto fail; 623 618 } 619 + 624 620 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 621 + fsc->client->osdc.abort_on_full = true; 625 622 626 623 if (!fsopt->mds_namespace) { 627 624 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, ··· 679 672 fail: 680 673 kfree(fsc); 681 674 return ERR_PTR(err); 675 + } 676 + 677 + static void flush_fs_workqueues(struct ceph_fs_client *fsc) 678 + { 679 + flush_workqueue(fsc->wb_wq); 680 + flush_workqueue(fsc->pg_inv_wq); 681 + flush_workqueue(fsc->trunc_wq); 682 682 } 683 683 684 684 static void destroy_fs_client(struct ceph_fs_client *fsc) ··· 807 793 if (!fsc) 808 794 return; 809 795 fsc->mount_state = CEPH_MOUNT_SHUTDOWN; 796 + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); 810 797 ceph_mdsc_force_umount(fsc->mdsc); 811 798 return; 812 799 } ··· 1103 1088 dout("kill_sb %p\n", s); 1104 1089 1105 1090 ceph_mdsc_pre_umount(fsc->mdsc); 1091 + flush_fs_workqueues(fsc); 1092 + 1106 1093 generic_shutdown_super(s); 1107 1094 1108 1095 fsc->client->extra_mon_dispatch = NULL;
+32 -28
fs/ceph/xattr.c
··· 50 50 size_t name_size; /* strlen(name) + 1 (for '\0') */ 51 51 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 52 52 size_t size); 53 - bool readonly, hidden; 54 53 bool (*exists_cb)(struct ceph_inode_info *ci); 54 + unsigned int flags; 55 55 }; 56 + 57 + #define VXATTR_FLAG_READONLY (1<<0) 58 + #define VXATTR_FLAG_HIDDEN (1<<1) 59 + #define VXATTR_FLAG_RSTAT (1<<2) 56 60 57 61 /* layouts */ 58 62 ··· 266 262 #define CEPH_XATTR_NAME2(_type, _name, _name2) \ 267 263 XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 268 264 269 - #define XATTR_NAME_CEPH(_type, _name) \ 265 + #define XATTR_NAME_CEPH(_type, _name, _flags) \ 270 266 { \ 271 267 .name = CEPH_XATTR_NAME(_type, _name), \ 272 268 .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ 273 269 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 274 - .readonly = true, \ 275 - .hidden = false, \ 276 - .exists_cb = NULL, \ 270 + .exists_cb = NULL, \ 271 + .flags = (VXATTR_FLAG_READONLY | _flags), \ 277 272 } 273 + #define XATTR_RSTAT_FIELD(_type, _name) \ 274 + XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) 278 275 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ 279 276 { \ 280 277 .name = CEPH_XATTR_NAME2(_type, _name, _field), \ 281 278 .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ 282 279 .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ 283 - .readonly = false, \ 284 - .hidden = true, \ 285 280 .exists_cb = ceph_vxattrcb_layout_exists, \ 281 + .flags = VXATTR_FLAG_HIDDEN, \ 286 282 } 287 283 #define XATTR_QUOTA_FIELD(_type, _name) \ 288 284 { \ 289 285 .name = CEPH_XATTR_NAME(_type, _name), \ 290 286 .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ 291 287 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 292 - .readonly = false, \ 293 - .hidden = true, \ 294 288 .exists_cb = ceph_vxattrcb_quota_exists, \ 289 + .flags = VXATTR_FLAG_HIDDEN, \ 295 290 } 296 291 297 292 static struct ceph_vxattr ceph_dir_vxattrs[] = { ··· 298 295 .name = 
"ceph.dir.layout", 299 296 .name_size = sizeof("ceph.dir.layout"), 300 297 .getxattr_cb = ceph_vxattrcb_layout, 301 - .readonly = false, 302 - .hidden = true, 303 298 .exists_cb = ceph_vxattrcb_layout_exists, 299 + .flags = VXATTR_FLAG_HIDDEN, 304 300 }, 305 301 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), 306 302 XATTR_LAYOUT_FIELD(dir, layout, stripe_count), 307 303 XATTR_LAYOUT_FIELD(dir, layout, object_size), 308 304 XATTR_LAYOUT_FIELD(dir, layout, pool), 309 305 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), 310 - XATTR_NAME_CEPH(dir, entries), 311 - XATTR_NAME_CEPH(dir, files), 312 - XATTR_NAME_CEPH(dir, subdirs), 313 - XATTR_NAME_CEPH(dir, rentries), 314 - XATTR_NAME_CEPH(dir, rfiles), 315 - XATTR_NAME_CEPH(dir, rsubdirs), 316 - XATTR_NAME_CEPH(dir, rbytes), 317 - XATTR_NAME_CEPH(dir, rctime), 306 + XATTR_NAME_CEPH(dir, entries, 0), 307 + XATTR_NAME_CEPH(dir, files, 0), 308 + XATTR_NAME_CEPH(dir, subdirs, 0), 309 + XATTR_RSTAT_FIELD(dir, rentries), 310 + XATTR_RSTAT_FIELD(dir, rfiles), 311 + XATTR_RSTAT_FIELD(dir, rsubdirs), 312 + XATTR_RSTAT_FIELD(dir, rbytes), 313 + XATTR_RSTAT_FIELD(dir, rctime), 318 314 { 319 315 .name = "ceph.quota", 320 316 .name_size = sizeof("ceph.quota"), 321 317 .getxattr_cb = ceph_vxattrcb_quota, 322 - .readonly = false, 323 - .hidden = true, 324 318 .exists_cb = ceph_vxattrcb_quota_exists, 319 + .flags = VXATTR_FLAG_HIDDEN, 325 320 }, 326 321 XATTR_QUOTA_FIELD(quota, max_bytes), 327 322 XATTR_QUOTA_FIELD(quota, max_files), ··· 334 333 .name = "ceph.file.layout", 335 334 .name_size = sizeof("ceph.file.layout"), 336 335 .getxattr_cb = ceph_vxattrcb_layout, 337 - .readonly = false, 338 - .hidden = true, 339 336 .exists_cb = ceph_vxattrcb_layout_exists, 337 + .flags = VXATTR_FLAG_HIDDEN, 340 338 }, 341 339 XATTR_LAYOUT_FIELD(file, layout, stripe_unit), 342 340 XATTR_LAYOUT_FIELD(file, layout, stripe_count), ··· 374 374 struct ceph_vxattr *vxattr; 375 375 size_t size = 0; 376 376 377 - for (vxattr = vxattrs; vxattr->name; vxattr++) 
378 - if (!vxattr->hidden) 377 + for (vxattr = vxattrs; vxattr->name; vxattr++) { 378 + if (!(vxattr->flags & VXATTR_FLAG_HIDDEN)) 379 379 size += vxattr->name_size; 380 + } 380 381 381 382 return size; 382 383 } ··· 810 809 /* let's see if a virtual xattr was requested */ 811 810 vxattr = ceph_match_vxattr(inode, name); 812 811 if (vxattr) { 813 - err = ceph_do_getattr(inode, 0, true); 812 + int mask = 0; 813 + if (vxattr->flags & VXATTR_FLAG_RSTAT) 814 + mask |= CEPH_STAT_RSTAT; 815 + err = ceph_do_getattr(inode, mask, true); 814 816 if (err) 815 817 return err; 816 818 err = -ENODATA; ··· 923 919 err = namelen; 924 920 if (vxattrs) { 925 921 for (i = 0; vxattrs[i].name; i++) { 926 - if (!vxattrs[i].hidden && 922 + if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) && 927 923 !(vxattrs[i].exists_cb && 928 924 !vxattrs[i].exists_cb(ci))) { 929 925 len = sprintf(names, "%s", vxattrs[i].name); ··· 1028 1024 1029 1025 vxattr = ceph_match_vxattr(inode, name); 1030 1026 if (vxattr) { 1031 - if (vxattr->readonly) 1027 + if (vxattr->flags & VXATTR_FLAG_READONLY) 1032 1028 return -EOPNOTSUPP; 1033 1029 if (value && !strncmp(vxattr->name, "ceph.quota", 10)) 1034 1030 check_realm = true;
+1
include/linux/ceph/ceph_fs.h
··· 628 628 CEPH_CAP_XATTR_SHARED) 629 629 #define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ 630 630 CEPH_CAP_FILE_RD) 631 + #define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND 631 632 632 633 #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ 633 634 CEPH_CAP_LINK_SHARED | \
+6 -2
include/linux/ceph/osd_client.h
··· 170 170 u64 r_tid; /* unique for this client */ 171 171 struct rb_node r_node; 172 172 struct rb_node r_mc_node; /* map check */ 173 + struct work_struct r_complete_work; 173 174 struct ceph_osd *r_osd; 174 175 175 176 struct ceph_osd_request_target r_t; ··· 202 201 struct timespec r_mtime; /* ditto */ 203 202 u64 r_data_offset; /* ditto */ 204 203 bool r_linger; /* don't resend on failure */ 205 - bool r_abort_on_full; /* return ENOSPC when full */ 206 204 207 205 /* internal */ 208 206 unsigned long r_stamp; /* jiffies, send or check time */ ··· 347 347 struct rb_root linger_map_checks; 348 348 atomic_t num_requests; 349 349 atomic_t num_homeless; 350 + bool abort_on_full; /* abort w/ ENOSPC when full */ 351 + int abort_err; 350 352 struct delayed_work timeout_work; 351 353 struct delayed_work osds_timeout_work; 352 354 #ifdef CONFIG_DEBUG_FS ··· 361 359 struct ceph_msgpool msgpool_op_reply; 362 360 363 361 struct workqueue_struct *notify_wq; 362 + struct workqueue_struct *completion_wq; 364 363 }; 365 364 366 365 static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) ··· 381 378 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 382 379 struct ceph_msg *msg); 383 380 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); 381 + void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); 384 382 385 383 extern void osd_req_op_init(struct ceph_osd_request *osd_req, 386 384 unsigned int which, u16 opcode, u32 flags); ··· 444 440 struct page **pages, u64 length, 445 441 u32 alignment, bool pages_from_pool, 446 442 bool own_pages); 447 - extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, 443 + extern int osd_req_op_cls_init(struct ceph_osd_request *osd_req, 448 444 unsigned int which, u16 opcode, 449 445 const char *class, const char *method); 450 446 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+4 -4
include/linux/ceph/osdmap.h
··· 279 279 const struct ceph_osds *new_acting, 280 280 bool any_change); 281 281 282 - int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 283 - const struct ceph_object_id *oid, 284 - const struct ceph_object_locator *oloc, 285 - struct ceph_pg *raw_pgid); 282 + void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 283 + const struct ceph_object_id *oid, 284 + const struct ceph_object_locator *oloc, 285 + struct ceph_pg *raw_pgid); 286 286 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, 287 287 const struct ceph_object_id *oid, 288 288 const struct ceph_object_locator *oloc,
+11 -20
net/ceph/messenger.c
··· 168 168 static struct lock_class_key socket_class; 169 169 #endif 170 170 171 - /* 172 - * When skipping (ignoring) a block of input we read it into a "skip 173 - * buffer," which is this many bytes in size. 174 - */ 175 - #define SKIP_BUF_SIZE 1024 176 - 177 171 static void queue_con(struct ceph_connection *con); 178 172 static void cancel_con(struct ceph_connection *con); 179 173 static void ceph_con_workfn(struct work_struct *); ··· 514 520 return 0; 515 521 } 516 522 523 + /* 524 + * If @buf is NULL, discard up to @len bytes. 525 + */ 517 526 static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) 518 527 { 519 528 struct kvec iov = {buf, len}; 520 529 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 521 530 int r; 531 + 532 + if (!buf) 533 + msg.msg_flags |= MSG_TRUNC; 522 534 523 535 iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); 524 536 r = sock_recvmsg(sock, &msg, msg.msg_flags); ··· 2575 2575 con->state != CON_STATE_OPEN) 2576 2576 return 0; 2577 2577 2578 - more: 2579 - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 2580 - 2581 2578 /* open the socket first? */ 2582 2579 if (con->state == CON_STATE_PREOPEN) { 2583 2580 BUG_ON(con->sock); ··· 2595 2598 } 2596 2599 } 2597 2600 2598 - more_kvec: 2601 + more: 2602 + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 2599 2603 BUG_ON(!con->sock); 2600 2604 2601 2605 /* kvec data queued? */ ··· 2621 2623 2622 2624 ret = write_partial_message_data(con); 2623 2625 if (ret == 1) 2624 - goto more_kvec; /* we need to send the footer, too! */ 2626 + goto more; /* we need to send the footer, too! */ 2625 2627 if (ret == 0) 2626 2628 goto out; 2627 2629 if (ret < 0) { ··· 2656 2658 dout("try_write done on %p ret %d\n", con, ret); 2657 2659 return ret; 2658 2660 } 2659 - 2660 - 2661 2661 2662 2662 /* 2663 2663 * Read what we can from the socket. ··· 2717 2721 if (con->in_base_pos < 0) { 2718 2722 /* 2719 2723 * skipping + discarding content. 
2720 - * 2721 - * FIXME: there must be a better way to do this! 2722 2724 */ 2723 - static char buf[SKIP_BUF_SIZE]; 2724 - int skip = min((int) sizeof (buf), -con->in_base_pos); 2725 - 2726 - dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); 2727 - ret = ceph_tcp_recvmsg(con->sock, buf, skip); 2725 + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); 2728 2726 if (ret <= 0) 2729 2727 goto out; 2728 + dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); 2730 2729 con->in_base_pos += ret; 2731 2730 if (con->in_base_pos) 2732 2731 goto more;
+130 -86
net/ceph/osd_client.c
··· 766 766 } 767 767 EXPORT_SYMBOL(osd_req_op_extent_dup_last); 768 768 769 - void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, 769 + int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, 770 770 u16 opcode, const char *class, const char *method) 771 771 { 772 772 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, ··· 778 778 BUG_ON(opcode != CEPH_OSD_OP_CALL); 779 779 780 780 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 781 - BUG_ON(!pagelist); 781 + if (!pagelist) 782 + return -ENOMEM; 783 + 782 784 ceph_pagelist_init(pagelist); 783 785 784 786 op->cls.class_name = class; ··· 800 798 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 801 799 802 800 op->indata_len = payload_len; 801 + return 0; 803 802 } 804 803 EXPORT_SYMBOL(osd_req_op_cls_init); 805 804 ··· 1029 1026 truncate_size, truncate_seq); 1030 1027 } 1031 1028 1032 - req->r_abort_on_full = true; 1033 1029 req->r_flags = flags; 1034 1030 req->r_base_oloc.pool = layout->pool_id; 1035 1031 req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ··· 1055 1053 */ 1056 1054 DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) 1057 1055 DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) 1056 + 1057 + /* 1058 + * Call @fn on each OSD request as long as @fn returns 0. 
1059 + */ 1060 + static void for_each_request(struct ceph_osd_client *osdc, 1061 + int (*fn)(struct ceph_osd_request *req, void *arg), 1062 + void *arg) 1063 + { 1064 + struct rb_node *n, *p; 1065 + 1066 + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { 1067 + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 1068 + 1069 + for (p = rb_first(&osd->o_requests); p; ) { 1070 + struct ceph_osd_request *req = 1071 + rb_entry(p, struct ceph_osd_request, r_node); 1072 + 1073 + p = rb_next(p); 1074 + if (fn(req, arg)) 1075 + return; 1076 + } 1077 + } 1078 + 1079 + for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { 1080 + struct ceph_osd_request *req = 1081 + rb_entry(p, struct ceph_osd_request, r_node); 1082 + 1083 + p = rb_next(p); 1084 + if (fn(req, arg)) 1085 + return; 1086 + } 1087 + } 1058 1088 1059 1089 static bool osd_homeless(struct ceph_osd *osd) 1060 1090 { ··· 1429 1395 bool recovery_deletes = ceph_osdmap_flag(osdc, 1430 1396 CEPH_OSDMAP_RECOVERY_DELETES); 1431 1397 enum calc_target_result ct_res; 1432 - int ret; 1433 1398 1434 1399 t->epoch = osdc->osdmap->epoch; 1435 1400 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); ··· 1464 1431 } 1465 1432 } 1466 1433 1467 - ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, 1468 - &pgid); 1469 - if (ret) { 1470 - WARN_ON(ret != -ENOENT); 1471 - t->osd = CEPH_HOMELESS_OSD; 1472 - ct_res = CALC_TARGET_POOL_DNE; 1473 - goto out; 1474 - } 1434 + __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid); 1475 1435 last_pgid.pool = pgid.pool; 1476 1436 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); 1477 1437 ··· 2187 2161 struct ceph_osd_client *osdc = req->r_osdc; 2188 2162 struct ceph_osd *osd; 2189 2163 enum calc_target_result ct_res; 2164 + int err = 0; 2190 2165 bool need_send = false; 2191 2166 bool promoted = false; 2192 - bool need_abort = false; 2193 2167 2194 2168 WARN_ON(req->r_tid); 2195 2169 dout("%s req %p wrlocked %d\n", 
__func__, req, wrlocked); ··· 2205 2179 goto promote; 2206 2180 } 2207 2181 2208 - if (osdc->osdmap->epoch < osdc->epoch_barrier) { 2182 + if (osdc->abort_err) { 2183 + dout("req %p abort_err %d\n", req, osdc->abort_err); 2184 + err = osdc->abort_err; 2185 + } else if (osdc->osdmap->epoch < osdc->epoch_barrier) { 2209 2186 dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, 2210 2187 osdc->epoch_barrier); 2211 2188 req->r_t.paused = true; ··· 2229 2200 (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || 2230 2201 pool_full(osdc, req->r_t.base_oloc.pool))) { 2231 2202 dout("req %p full/pool_full\n", req); 2232 - pr_warn_ratelimited("FULL or reached pool quota\n"); 2233 - req->r_t.paused = true; 2234 - maybe_request_map(osdc); 2235 - if (req->r_abort_on_full) 2236 - need_abort = true; 2203 + if (osdc->abort_on_full) { 2204 + err = -ENOSPC; 2205 + } else { 2206 + pr_warn_ratelimited("FULL or reached pool quota\n"); 2207 + req->r_t.paused = true; 2208 + maybe_request_map(osdc); 2209 + } 2237 2210 } else if (!osd_homeless(osd)) { 2238 2211 need_send = true; 2239 2212 } else { ··· 2252 2221 link_request(osd, req); 2253 2222 if (need_send) 2254 2223 send_request(req); 2255 - else if (need_abort) 2256 - complete_request(req, -ENOSPC); 2224 + else if (err) 2225 + complete_request(req, err); 2257 2226 mutex_unlock(&osd->lock); 2258 2227 2259 - if (ct_res == CALC_TARGET_POOL_DNE) 2228 + if (!err && ct_res == CALC_TARGET_POOL_DNE) 2260 2229 send_map_check(req); 2261 2230 2262 2231 if (promoted) ··· 2312 2281 2313 2282 static void __complete_request(struct ceph_osd_request *req) 2314 2283 { 2315 - if (req->r_callback) { 2316 - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, 2317 - req->r_tid, req->r_callback, req->r_result); 2284 + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, 2285 + req->r_tid, req->r_callback, req->r_result); 2286 + 2287 + if (req->r_callback) 2318 2288 req->r_callback(req); 2319 - } 2289 + complete_all(&req->r_completion); 
2290 + ceph_osdc_put_request(req); 2291 + } 2292 + 2293 + static void complete_request_workfn(struct work_struct *work) 2294 + { 2295 + struct ceph_osd_request *req = 2296 + container_of(work, struct ceph_osd_request, r_complete_work); 2297 + 2298 + __complete_request(req); 2320 2299 } 2321 2300 2322 2301 /* ··· 2338 2297 2339 2298 req->r_result = err; 2340 2299 finish_request(req); 2341 - __complete_request(req); 2342 - complete_all(&req->r_completion); 2343 - ceph_osdc_put_request(req); 2300 + 2301 + INIT_WORK(&req->r_complete_work, complete_request_workfn); 2302 + queue_work(req->r_osdc->completion_wq, &req->r_complete_work); 2344 2303 } 2345 2304 2346 2305 static void cancel_map_check(struct ceph_osd_request *req) ··· 2377 2336 complete_request(req, err); 2378 2337 } 2379 2338 2339 + static int abort_fn(struct ceph_osd_request *req, void *arg) 2340 + { 2341 + int err = *(int *)arg; 2342 + 2343 + abort_request(req, err); 2344 + return 0; /* continue iteration */ 2345 + } 2346 + 2347 + /* 2348 + * Abort all in-flight requests with @err and arrange for all future 2349 + * requests to be failed immediately. 2350 + */ 2351 + void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) 2352 + { 2353 + dout("%s osdc %p err %d\n", __func__, osdc, err); 2354 + down_write(&osdc->lock); 2355 + for_each_request(osdc, abort_fn, &err); 2356 + osdc->abort_err = err; 2357 + up_write(&osdc->lock); 2358 + } 2359 + EXPORT_SYMBOL(ceph_osdc_abort_requests); 2360 + 2380 2361 static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) 2381 2362 { 2382 2363 if (likely(eb > osdc->epoch_barrier)) { ··· 2426 2363 EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); 2427 2364 2428 2365 /* 2366 + * We can end up releasing caps as a result of abort_request(). 2367 + * In that case, we probably want to ensure that the cap release message 2368 + * has an updated epoch barrier in it, so set the epoch barrier prior to 2369 + * aborting the first request. 
2370 + */ 2371 + static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) 2372 + { 2373 + struct ceph_osd_client *osdc = req->r_osdc; 2374 + bool *victims = arg; 2375 + 2376 + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 2377 + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || 2378 + pool_full(osdc, req->r_t.base_oloc.pool))) { 2379 + if (!*victims) { 2380 + update_epoch_barrier(osdc, osdc->osdmap->epoch); 2381 + *victims = true; 2382 + } 2383 + abort_request(req, -ENOSPC); 2384 + } 2385 + 2386 + return 0; /* continue iteration */ 2387 + } 2388 + 2389 + /* 2429 2390 * Drop all pending requests that are stalled waiting on a full condition to 2430 2391 * clear, and complete them with ENOSPC as the return code. Set the 2431 2392 * osdc->epoch_barrier to the latest map epoch that we've seen if any were ··· 2457 2370 */ 2458 2371 static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) 2459 2372 { 2460 - struct rb_node *n; 2461 2373 bool victims = false; 2462 2374 2463 - dout("enter abort_on_full\n"); 2464 - 2465 - if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) 2466 - goto out; 2467 - 2468 - /* Scan list and see if there is anything to abort */ 2469 - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { 2470 - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 2471 - struct rb_node *m; 2472 - 2473 - m = rb_first(&osd->o_requests); 2474 - while (m) { 2475 - struct ceph_osd_request *req = rb_entry(m, 2476 - struct ceph_osd_request, r_node); 2477 - m = rb_next(m); 2478 - 2479 - if (req->r_abort_on_full) { 2480 - victims = true; 2481 - break; 2482 - } 2483 - } 2484 - if (victims) 2485 - break; 2486 - } 2487 - 2488 - if (!victims) 2489 - goto out; 2490 - 2491 - /* 2492 - * Update the barrier to current epoch if it's behind that point, 2493 - * since we know we have some calls to be aborted in the tree. 
2494 - */ 2495 - update_epoch_barrier(osdc, osdc->osdmap->epoch); 2496 - 2497 - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { 2498 - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 2499 - struct rb_node *m; 2500 - 2501 - m = rb_first(&osd->o_requests); 2502 - while (m) { 2503 - struct ceph_osd_request *req = rb_entry(m, 2504 - struct ceph_osd_request, r_node); 2505 - m = rb_next(m); 2506 - 2507 - if (req->r_abort_on_full && 2508 - (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || 2509 - pool_full(osdc, req->r_t.target_oloc.pool))) 2510 - abort_request(req, -ENOSPC); 2511 - } 2512 - } 2513 - out: 2514 - dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); 2375 + if (osdc->abort_on_full && 2376 + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) 2377 + for_each_request(osdc, abort_on_full_fn, &victims); 2515 2378 } 2516 2379 2517 2380 static void check_pool_dne(struct ceph_osd_request *req) ··· 3578 3541 up_read(&osdc->lock); 3579 3542 3580 3543 __complete_request(req); 3581 - complete_all(&req->r_completion); 3582 - ceph_osdc_put_request(req); 3583 3544 return; 3584 3545 3585 3546 fail_request: ··· 4962 4927 if (ret) 4963 4928 goto out_put_req; 4964 4929 4965 - osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); 4930 + ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); 4931 + if (ret) 4932 + goto out_put_req; 4933 + 4966 4934 if (req_page) 4967 4935 osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len, 4968 4936 0, false, false); ··· 5034 4996 if (!osdc->notify_wq) 5035 4997 goto out_msgpool_reply; 5036 4998 4999 + osdc->completion_wq = create_singlethread_workqueue("ceph-completion"); 5000 + if (!osdc->completion_wq) 5001 + goto out_notify_wq; 5002 + 5037 5003 schedule_delayed_work(&osdc->timeout_work, 5038 5004 osdc->client->options->osd_keepalive_timeout); 5039 5005 schedule_delayed_work(&osdc->osds_timeout_work, ··· 5045 5003 5046 5004 return 0; 5047 5005 5006 + out_notify_wq: 5007 + 
destroy_workqueue(osdc->notify_wq); 5048 5008 out_msgpool_reply: 5049 5009 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 5050 5010 out_msgpool: ··· 5061 5017 5062 5018 void ceph_osdc_stop(struct ceph_osd_client *osdc) 5063 5019 { 5064 - flush_workqueue(osdc->notify_wq); 5020 + destroy_workqueue(osdc->completion_wq); 5065 5021 destroy_workqueue(osdc->notify_wq); 5066 5022 cancel_delayed_work_sync(&osdc->timeout_work); 5067 5023 cancel_delayed_work_sync(&osdc->osds_timeout_work);
+8 -11
net/ceph/osdmap.c
··· 2146 2146 * Should only be called with target_oid and target_oloc (as opposed to 2147 2147 * base_oid and base_oloc), since tiering isn't taken into account. 2148 2148 */ 2149 - int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 2150 - const struct ceph_object_id *oid, 2151 - const struct ceph_object_locator *oloc, 2152 - struct ceph_pg *raw_pgid) 2149 + void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 2150 + const struct ceph_object_id *oid, 2151 + const struct ceph_object_locator *oloc, 2152 + struct ceph_pg *raw_pgid) 2153 2153 { 2154 2154 WARN_ON(pi->id != oloc->pool); 2155 2155 ··· 2165 2165 int nsl = oloc->pool_ns->len; 2166 2166 size_t total = nsl + 1 + oid->name_len; 2167 2167 2168 - if (total > sizeof(stack_buf)) { 2169 - buf = kmalloc(total, GFP_NOIO); 2170 - if (!buf) 2171 - return -ENOMEM; 2172 - } 2168 + if (total > sizeof(stack_buf)) 2169 + buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); 2173 2170 memcpy(buf, oloc->pool_ns->str, nsl); 2174 2171 buf[nsl] = '\037'; 2175 2172 memcpy(buf + nsl + 1, oid->name, oid->name_len); ··· 2178 2181 oid->name, nsl, oloc->pool_ns->str, 2179 2182 raw_pgid->pool, raw_pgid->seed); 2180 2183 } 2181 - return 0; 2182 2184 } 2183 2185 2184 2186 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, ··· 2191 2195 if (!pi) 2192 2196 return -ENOENT; 2193 2197 2194 - return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); 2198 + __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); 2199 + return 0; 2195 2200 } 2196 2201 EXPORT_SYMBOL(ceph_object_locator_to_pg); 2197 2202