Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph changes from Sage Weil:
"On the RBD side, there is a conversion to blk-mq from Christoph,
several long-standing bug fixes from Ilya, and some cleanup from
Rickard Strandqvist.

On the CephFS side there is a long list of fixes from Zheng, including
improved session handling, a few IO path fixes, some dcache management
correctness fixes, and several blocking while !TASK_RUNNING fixes.

The core code gets a few cleanups and Chaitanya has added support for
TCP_NODELAY (which has been used on the server side for ages but we
somehow missed on the kernel client).

There is also an update to MAINTAINERS to fix up some email addresses
and reflect that Ilya and Zheng are doing most of the maintenance for
RBD and CephFS these days. Do not be surprised to see a pull request
come from one of them in the future if I am unavailable for some
reason"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (27 commits)
MAINTAINERS: update Ceph and RBD maintainers
libceph: kfree() in put_osd() shouldn't depend on authorizer
libceph: fix double __remove_osd() problem
rbd: convert to blk-mq
ceph: return error for traceless reply race
ceph: fix dentry leaks
ceph: re-send requests when MDS enters reconnecting stage
ceph: show nocephx_require_signatures and notcp_nodelay options
libceph: tcp_nodelay support
rbd: do not treat standalone as flatten
ceph: fix atomic_open snapdir
ceph: properly mark empty directory as complete
client: include kernel version in client metadata
ceph: provide separate {inode,file}_operations for snapdir
ceph: fix request time stamp encoding
ceph: fix reading inline data when i_size > PAGE_SIZE
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_close_sessions)
ceph: avoid block operation when !TASK_RUNNING (ceph_get_caps)
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_sync)
rbd: fix error paths in rbd_dev_refresh()
...

+444 -488
+4 -3
MAINTAINERS
··· 2433 2433 F: arch/powerpc/platforms/cell/ 2434 2434 2435 2435 CEPH DISTRIBUTED FILE SYSTEM CLIENT 2436 - M: Sage Weil <sage@inktank.com> 2436 + M: Yan, Zheng <zyan@redhat.com> 2437 + M: Sage Weil <sage@redhat.com> 2437 2438 L: ceph-devel@vger.kernel.org 2438 2439 W: http://ceph.com/ 2439 2440 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git ··· 7999 7998 F: drivers/net/wireless/ath/wcn36xx/ 8000 7999 8001 8000 RADOS BLOCK DEVICE (RBD) 8002 - M: Yehuda Sadeh <yehuda@inktank.com> 8003 - M: Sage Weil <sage@inktank.com> 8001 + M: Ilya Dryomov <idryomov@gmail.com> 8002 + M: Sage Weil <sage@redhat.com> 8004 8003 M: Alex Elder <elder@kernel.org> 8005 8004 M: ceph-devel@vger.kernel.org 8006 8005 W: http://ceph.com/
+83 -110
drivers/block/rbd.c
··· 38 38 #include <linux/kernel.h> 39 39 #include <linux/device.h> 40 40 #include <linux/module.h> 41 + #include <linux/blk-mq.h> 41 42 #include <linux/fs.h> 42 43 #include <linux/blkdev.h> 43 44 #include <linux/slab.h> ··· 341 340 342 341 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 343 342 344 - struct list_head rq_queue; /* incoming rq queue */ 345 343 spinlock_t lock; /* queue, flags, open_count */ 346 - struct work_struct rq_work; 347 344 348 345 struct rbd_image_header header; 349 346 unsigned long flags; /* possibly lock protected */ ··· 358 359 u64 parent_overlap; 359 360 atomic_t parent_ref; 360 361 struct rbd_device *parent; 362 + 363 + /* Block layer tags. */ 364 + struct blk_mq_tag_set tag_set; 361 365 362 366 /* protects updating the header */ 363 367 struct rw_semaphore header_rwsem; ··· 1819 1817 1820 1818 /* 1821 1819 * We support a 64-bit length, but ultimately it has to be 1822 - * passed to blk_end_request(), which takes an unsigned int. 1820 + * passed to the block layer, which just supports a 32-bit 1821 + * length field. 
1823 1822 */ 1824 1823 obj_request->xferred = osd_req->r_reply_op_len[0]; 1825 1824 rbd_assert(obj_request->xferred < (u64)UINT_MAX); ··· 2278 2275 more = obj_request->which < img_request->obj_request_count - 1; 2279 2276 } else { 2280 2277 rbd_assert(img_request->rq != NULL); 2281 - more = blk_end_request(img_request->rq, result, xferred); 2278 + 2279 + more = blk_update_request(img_request->rq, result, xferred); 2280 + if (!more) 2281 + __blk_mq_end_request(img_request->rq, result); 2282 2282 } 2283 2283 2284 2284 return more; ··· 3310 3304 return ret; 3311 3305 } 3312 3306 3313 - static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) 3307 + static void rbd_queue_workfn(struct work_struct *work) 3314 3308 { 3309 + struct request *rq = blk_mq_rq_from_pdu(work); 3310 + struct rbd_device *rbd_dev = rq->q->queuedata; 3315 3311 struct rbd_img_request *img_request; 3316 3312 struct ceph_snap_context *snapc = NULL; 3317 3313 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; ··· 3321 3313 enum obj_operation_type op_type; 3322 3314 u64 mapping_size; 3323 3315 int result; 3316 + 3317 + if (rq->cmd_type != REQ_TYPE_FS) { 3318 + dout("%s: non-fs request type %d\n", __func__, 3319 + (int) rq->cmd_type); 3320 + result = -EIO; 3321 + goto err; 3322 + } 3324 3323 3325 3324 if (rq->cmd_flags & REQ_DISCARD) 3326 3325 op_type = OBJ_OP_DISCARD; ··· 3374 3359 goto err_rq; /* Shouldn't happen */ 3375 3360 } 3376 3361 3362 + blk_mq_start_request(rq); 3363 + 3377 3364 down_read(&rbd_dev->header_rwsem); 3378 3365 mapping_size = rbd_dev->mapping.size; 3379 3366 if (op_type != OBJ_OP_READ) { ··· 3421 3404 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3422 3405 obj_op_name(op_type), length, offset, result); 3423 3406 ceph_put_snap_context(snapc); 3424 - blk_end_request_all(rq, result); 3407 + err: 3408 + blk_mq_end_request(rq, result); 3425 3409 } 3426 3410 3427 - static void rbd_request_workfn(struct work_struct *work) 3411 + static int rbd_queue_rq(struct 
blk_mq_hw_ctx *hctx, 3412 + const struct blk_mq_queue_data *bd) 3428 3413 { 3429 - struct rbd_device *rbd_dev = 3430 - container_of(work, struct rbd_device, rq_work); 3431 - struct request *rq, *next; 3432 - LIST_HEAD(requests); 3414 + struct request *rq = bd->rq; 3415 + struct work_struct *work = blk_mq_rq_to_pdu(rq); 3433 3416 3434 - spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ 3435 - list_splice_init(&rbd_dev->rq_queue, &requests); 3436 - spin_unlock_irq(&rbd_dev->lock); 3437 - 3438 - list_for_each_entry_safe(rq, next, &requests, queuelist) { 3439 - list_del_init(&rq->queuelist); 3440 - rbd_handle_request(rbd_dev, rq); 3441 - } 3442 - } 3443 - 3444 - /* 3445 - * Called with q->queue_lock held and interrupts disabled, possibly on 3446 - * the way to schedule(). Do not sleep here! 3447 - */ 3448 - static void rbd_request_fn(struct request_queue *q) 3449 - { 3450 - struct rbd_device *rbd_dev = q->queuedata; 3451 - struct request *rq; 3452 - int queued = 0; 3453 - 3454 - rbd_assert(rbd_dev); 3455 - 3456 - while ((rq = blk_fetch_request(q))) { 3457 - /* Ignore any non-FS requests that filter through. 
*/ 3458 - if (rq->cmd_type != REQ_TYPE_FS) { 3459 - dout("%s: non-fs request type %d\n", __func__, 3460 - (int) rq->cmd_type); 3461 - __blk_end_request_all(rq, 0); 3462 - continue; 3463 - } 3464 - 3465 - list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); 3466 - queued++; 3467 - } 3468 - 3469 - if (queued) 3470 - queue_work(rbd_wq, &rbd_dev->rq_work); 3417 + queue_work(rbd_wq, work); 3418 + return BLK_MQ_RQ_QUEUE_OK; 3471 3419 } 3472 3420 3473 3421 /* ··· 3493 3511 del_gendisk(disk); 3494 3512 if (disk->queue) 3495 3513 blk_cleanup_queue(disk->queue); 3514 + blk_mq_free_tag_set(&rbd_dev->tag_set); 3496 3515 } 3497 3516 put_disk(disk); 3498 3517 } ··· 3677 3694 3678 3695 ret = rbd_dev_header_info(rbd_dev); 3679 3696 if (ret) 3680 - return ret; 3697 + goto out; 3681 3698 3682 3699 /* 3683 3700 * If there is a parent, see if it has disappeared due to the ··· 3686 3703 if (rbd_dev->parent) { 3687 3704 ret = rbd_dev_v2_parent_info(rbd_dev); 3688 3705 if (ret) 3689 - return ret; 3706 + goto out; 3690 3707 } 3691 3708 3692 3709 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 3693 - if (rbd_dev->mapping.size != rbd_dev->header.image_size) 3694 - rbd_dev->mapping.size = rbd_dev->header.image_size; 3710 + rbd_dev->mapping.size = rbd_dev->header.image_size; 3695 3711 } else { 3696 3712 /* validate mapped snapshot's EXISTS flag */ 3697 3713 rbd_exists_validate(rbd_dev); 3698 3714 } 3699 3715 3716 + out: 3700 3717 up_write(&rbd_dev->header_rwsem); 3701 - 3702 - if (mapping_size != rbd_dev->mapping.size) 3718 + if (!ret && mapping_size != rbd_dev->mapping.size) 3703 3719 rbd_dev_update_size(rbd_dev); 3704 3720 3721 + return ret; 3722 + } 3723 + 3724 + static int rbd_init_request(void *data, struct request *rq, 3725 + unsigned int hctx_idx, unsigned int request_idx, 3726 + unsigned int numa_node) 3727 + { 3728 + struct work_struct *work = blk_mq_rq_to_pdu(rq); 3729 + 3730 + INIT_WORK(work, rbd_queue_workfn); 3705 3731 return 0; 3706 3732 } 3733 + 3734 + static struct blk_mq_ops 
rbd_mq_ops = { 3735 + .queue_rq = rbd_queue_rq, 3736 + .map_queue = blk_mq_map_queue, 3737 + .init_request = rbd_init_request, 3738 + }; 3707 3739 3708 3740 static int rbd_init_disk(struct rbd_device *rbd_dev) 3709 3741 { 3710 3742 struct gendisk *disk; 3711 3743 struct request_queue *q; 3712 3744 u64 segment_size; 3745 + int err; 3713 3746 3714 3747 /* create gendisk info */ 3715 3748 disk = alloc_disk(single_major ? ··· 3743 3744 disk->fops = &rbd_bd_ops; 3744 3745 disk->private_data = rbd_dev; 3745 3746 3746 - q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3747 - if (!q) 3747 + memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 3748 + rbd_dev->tag_set.ops = &rbd_mq_ops; 3749 + rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; 3750 + rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 3751 + rbd_dev->tag_set.flags = 3752 + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 3753 + rbd_dev->tag_set.nr_hw_queues = 1; 3754 + rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 3755 + 3756 + err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 3757 + if (err) 3748 3758 goto out_disk; 3759 + 3760 + q = blk_mq_init_queue(&rbd_dev->tag_set); 3761 + if (IS_ERR(q)) { 3762 + err = PTR_ERR(q); 3763 + goto out_tag_set; 3764 + } 3749 3765 3750 3766 /* We use the default size, but let's be explicit about it. 
*/ 3751 3767 blk_queue_physical_block_size(q, SECTOR_SIZE); ··· 3787 3773 rbd_dev->disk = disk; 3788 3774 3789 3775 return 0; 3776 + out_tag_set: 3777 + blk_mq_free_tag_set(&rbd_dev->tag_set); 3790 3778 out_disk: 3791 3779 put_disk(disk); 3792 - 3793 - return -ENOMEM; 3780 + return err; 3794 3781 } 3795 3782 3796 3783 /* ··· 4048 4033 return NULL; 4049 4034 4050 4035 spin_lock_init(&rbd_dev->lock); 4051 - INIT_LIST_HEAD(&rbd_dev->rq_queue); 4052 - INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); 4053 4036 rbd_dev->flags = 0; 4054 4037 atomic_set(&rbd_dev->parent_ref, 0); 4055 4038 INIT_LIST_HEAD(&rbd_dev->node); ··· 4287 4274 } 4288 4275 4289 4276 /* 4290 - * We always update the parent overlap. If it's zero we 4291 - * treat it specially. 4277 + * We always update the parent overlap. If it's zero we issue 4278 + * a warning, as we will proceed as if there was no parent. 4292 4279 */ 4293 - rbd_dev->parent_overlap = overlap; 4294 4280 if (!overlap) { 4295 - 4296 - /* A null parent_spec indicates it's the initial probe */ 4297 - 4298 4281 if (parent_spec) { 4299 - /* 4300 - * The overlap has become zero, so the clone 4301 - * must have been resized down to 0 at some 4302 - * point. Treat this the same as a flatten. 4303 - */ 4304 - rbd_dev_parent_put(rbd_dev); 4305 - pr_info("%s: clone image now standalone\n", 4306 - rbd_dev->disk->disk_name); 4282 + /* refresh, careful to warn just once */ 4283 + if (rbd_dev->parent_overlap) 4284 + rbd_warn(rbd_dev, 4285 + "clone now standalone (overlap became 0)"); 4307 4286 } else { 4308 - /* 4309 - * For the initial probe, if we find the 4310 - * overlap is zero we just pretend there was 4311 - * no parent image. 
4312 - */ 4313 - rbd_warn(rbd_dev, "ignoring parent with overlap 0"); 4287 + /* initial probe */ 4288 + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 4314 4289 } 4315 4290 } 4291 + rbd_dev->parent_overlap = overlap; 4292 + 4316 4293 out: 4317 4294 ret = 0; 4318 4295 out_err: ··· 4771 4768 *buf += strspn(*buf, spaces); /* Find start of token */ 4772 4769 4773 4770 return strcspn(*buf, spaces); /* Return token length */ 4774 - } 4775 - 4776 - /* 4777 - * Finds the next token in *buf, and if the provided token buffer is 4778 - * big enough, copies the found token into it. The result, if 4779 - * copied, is guaranteed to be terminated with '\0'. Note that *buf 4780 - * must be terminated with '\0' on entry. 4781 - * 4782 - * Returns the length of the token found (not including the '\0'). 4783 - * Return value will be 0 if no token is found, and it will be >= 4784 - * token_size if the token would not fit. 4785 - * 4786 - * The *buf pointer will be updated to point beyond the end of the 4787 - * found token. Note that this occurs even if the token buffer is 4788 - * too small to hold it. 4789 - */ 4790 - static inline size_t copy_token(const char **buf, 4791 - char *token, 4792 - size_t token_size) 4793 - { 4794 - size_t len; 4795 - 4796 - len = next_token(buf); 4797 - if (len < token_size) { 4798 - memcpy(token, *buf, len); 4799 - *(token + len) = '\0'; 4800 - } 4801 - *buf += len; 4802 - 4803 - return len; 4804 4771 } 4805 4772 4806 4773 /*
-14
fs/ceph/acl.c
··· 40 40 spin_unlock(&ci->i_ceph_lock); 41 41 } 42 42 43 - static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, 44 - int type) 45 - { 46 - struct ceph_inode_info *ci = ceph_inode(inode); 47 - struct posix_acl *acl = ACL_NOT_CACHED; 48 - 49 - spin_lock(&ci->i_ceph_lock); 50 - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 51 - acl = get_cached_acl(inode, type); 52 - spin_unlock(&ci->i_ceph_lock); 53 - 54 - return acl; 55 - } 56 - 57 43 struct posix_acl *ceph_get_acl(struct inode *inode, int type) 58 44 { 59 45 int size;
+12 -7
fs/ceph/addr.c
··· 196 196 u64 len = PAGE_CACHE_SIZE; 197 197 198 198 if (off >= i_size_read(inode)) { 199 - zero_user_segment(page, err, PAGE_CACHE_SIZE); 199 + zero_user_segment(page, 0, PAGE_CACHE_SIZE); 200 200 SetPageUptodate(page); 201 201 return 0; 202 202 } 203 203 204 - /* 205 - * Uptodate inline data should have been added into page cache 206 - * while getting Fcr caps. 207 - */ 208 - if (ci->i_inline_version != CEPH_INLINE_NONE) 209 - return -EINVAL; 204 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 205 + /* 206 + * Uptodate inline data should have been added 207 + * into page cache while getting Fcr caps. 208 + */ 209 + if (off == 0) 210 + return -EINVAL; 211 + zero_user_segment(page, 0, PAGE_CACHE_SIZE); 212 + SetPageUptodate(page); 213 + return 0; 214 + } 210 215 211 216 err = ceph_readpage_from_fscache(inode, page); 212 217 if (err == 0)
+74 -53
fs/ceph/caps.c
··· 577 577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 578 578 realmino); 579 579 if (realm) { 580 - ceph_get_snap_realm(mdsc, realm); 581 580 spin_lock(&realm->inodes_with_caps_lock); 582 581 ci->i_snap_realm = realm; 583 582 list_add(&ci->i_snap_realm_item, ··· 1450 1451 spin_lock(&mdsc->cap_dirty_lock); 1451 1452 list_del_init(&ci->i_dirty_item); 1452 1453 1453 - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1454 1454 if (list_empty(&ci->i_flushing_item)) { 1455 + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1455 1456 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1456 1457 mdsc->num_cap_flushing++; 1457 1458 dout(" inode %p now flushing seq %lld\n", inode, ··· 2072 2073 * requested from the MDS. 2073 2074 */ 2074 2075 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2075 - loff_t endoff, int *got, struct page **pinned_page, 2076 - int *check_max, int *err) 2076 + loff_t endoff, int *got, int *check_max, int *err) 2077 2077 { 2078 2078 struct inode *inode = &ci->vfs_inode; 2079 2079 int ret = 0; 2080 - int have, implemented, _got = 0; 2080 + int have, implemented; 2081 2081 int file_wanted; 2082 2082 2083 2083 dout("get_cap_refs %p need %s want %s\n", inode, 2084 2084 ceph_cap_string(need), ceph_cap_string(want)); 2085 - again: 2085 + 2086 2086 spin_lock(&ci->i_ceph_lock); 2087 2087 2088 2088 /* make sure file is actually open */ ··· 2136 2138 inode, ceph_cap_string(have), ceph_cap_string(not), 2137 2139 ceph_cap_string(revoking)); 2138 2140 if ((revoking & not) == 0) { 2139 - _got = need | (have & want); 2140 - __take_cap_refs(ci, _got); 2141 + *got = need | (have & want); 2142 + __take_cap_refs(ci, *got); 2141 2143 ret = 1; 2142 2144 } 2143 2145 } else { 2146 + int session_readonly = false; 2147 + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { 2148 + struct ceph_mds_session *s = ci->i_auth_cap->session; 2149 + spin_lock(&s->s_cap_lock); 2150 + session_readonly = s->s_readonly; 2151 + 
spin_unlock(&s->s_cap_lock); 2152 + } 2153 + if (session_readonly) { 2154 + dout("get_cap_refs %p needed %s but mds%d readonly\n", 2155 + inode, ceph_cap_string(need), ci->i_auth_cap->mds); 2156 + *err = -EROFS; 2157 + ret = 1; 2158 + goto out_unlock; 2159 + } 2160 + 2144 2161 dout("get_cap_refs %p have %s needed %s\n", inode, 2145 2162 ceph_cap_string(have), ceph_cap_string(need)); 2146 2163 } 2147 2164 out_unlock: 2148 2165 spin_unlock(&ci->i_ceph_lock); 2149 2166 2150 - if (ci->i_inline_version != CEPH_INLINE_NONE && 2151 - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2152 - i_size_read(inode) > 0) { 2153 - int ret1; 2154 - struct page *page = find_get_page(inode->i_mapping, 0); 2155 - if (page) { 2156 - if (PageUptodate(page)) { 2157 - *pinned_page = page; 2158 - goto out; 2159 - } 2160 - page_cache_release(page); 2161 - } 2162 - /* 2163 - * drop cap refs first because getattr while holding 2164 - * caps refs can cause deadlock. 2165 - */ 2166 - ceph_put_cap_refs(ci, _got); 2167 - _got = 0; 2168 - 2169 - /* getattr request will bring inline data into page cache */ 2170 - ret1 = __ceph_do_getattr(inode, NULL, 2171 - CEPH_STAT_CAP_INLINE_DATA, true); 2172 - if (ret1 >= 0) { 2173 - ret = 0; 2174 - goto again; 2175 - } 2176 - *err = ret1; 2177 - ret = 1; 2178 - } 2179 - out: 2180 2167 dout("get_cap_refs %p ret %d got %s\n", inode, 2181 - ret, ceph_cap_string(_got)); 2182 - *got = _got; 2168 + ret, ceph_cap_string(*got)); 2183 2169 return ret; 2184 2170 } 2185 2171 ··· 2203 2221 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2204 2222 loff_t endoff, int *got, struct page **pinned_page) 2205 2223 { 2206 - int check_max, ret, err; 2224 + int _got, check_max, ret, err = 0; 2207 2225 2208 2226 retry: 2209 2227 if (endoff > 0) 2210 2228 check_max_size(&ci->vfs_inode, endoff); 2229 + _got = 0; 2211 2230 check_max = 0; 2212 - err = 0; 2213 2231 ret = wait_event_interruptible(ci->i_cap_wq, 2214 - try_get_cap_refs(ci, need, want, endoff, 2215 - 
got, pinned_page, 2216 - &check_max, &err)); 2232 + try_get_cap_refs(ci, need, want, endoff, 2233 + &_got, &check_max, &err)); 2217 2234 if (err) 2218 2235 ret = err; 2236 + if (ret < 0) 2237 + return ret; 2238 + 2219 2239 if (check_max) 2220 2240 goto retry; 2221 - return ret; 2241 + 2242 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2243 + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2244 + i_size_read(&ci->vfs_inode) > 0) { 2245 + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); 2246 + if (page) { 2247 + if (PageUptodate(page)) { 2248 + *pinned_page = page; 2249 + goto out; 2250 + } 2251 + page_cache_release(page); 2252 + } 2253 + /* 2254 + * drop cap refs first because getattr while holding 2255 + * caps refs can cause deadlock. 2256 + */ 2257 + ceph_put_cap_refs(ci, _got); 2258 + _got = 0; 2259 + 2260 + /* getattr request will bring inline data into page cache */ 2261 + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, 2262 + CEPH_STAT_CAP_INLINE_DATA, true); 2263 + if (ret < 0) 2264 + return ret; 2265 + goto retry; 2266 + } 2267 + out: 2268 + *got = _got; 2269 + return 0; 2222 2270 } 2223 2271 2224 2272 /* ··· 2444 2432 */ 2445 2433 static void handle_cap_grant(struct ceph_mds_client *mdsc, 2446 2434 struct inode *inode, struct ceph_mds_caps *grant, 2447 - void *snaptrace, int snaptrace_len, 2448 2435 u64 inline_version, 2449 2436 void *inline_data, int inline_len, 2450 2437 struct ceph_buffer *xattr_buf, 2451 2438 struct ceph_mds_session *session, 2452 2439 struct ceph_cap *cap, int issued) 2453 2440 __releases(ci->i_ceph_lock) 2441 + __releases(mdsc->snap_rwsem) 2454 2442 { 2455 2443 struct ceph_inode_info *ci = ceph_inode(inode); 2456 2444 int mds = session->s_mds; ··· 2651 2639 spin_unlock(&ci->i_ceph_lock); 2652 2640 2653 2641 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2654 - down_write(&mdsc->snap_rwsem); 2655 - ceph_update_snap_trace(mdsc, snaptrace, 2656 - snaptrace + snaptrace_len, false); 2657 - 
downgrade_write(&mdsc->snap_rwsem); 2658 2642 kick_flushing_inode_caps(mdsc, session, inode); 2659 2643 up_read(&mdsc->snap_rwsem); 2660 2644 if (newcaps & ~issued) ··· 3060 3052 struct ceph_cap *cap; 3061 3053 struct ceph_mds_caps *h; 3062 3054 struct ceph_mds_cap_peer *peer = NULL; 3055 + struct ceph_snap_realm *realm; 3063 3056 int mds = session->s_mds; 3064 3057 int op, issued; 3065 3058 u32 seq, mseq; ··· 3162 3153 goto done_unlocked; 3163 3154 3164 3155 case CEPH_CAP_OP_IMPORT: 3156 + realm = NULL; 3157 + if (snaptrace_len) { 3158 + down_write(&mdsc->snap_rwsem); 3159 + ceph_update_snap_trace(mdsc, snaptrace, 3160 + snaptrace + snaptrace_len, 3161 + false, &realm); 3162 + downgrade_write(&mdsc->snap_rwsem); 3163 + } else { 3164 + down_read(&mdsc->snap_rwsem); 3165 + } 3165 3166 handle_cap_import(mdsc, inode, h, peer, session, 3166 3167 &cap, &issued); 3167 - handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3168 + handle_cap_grant(mdsc, inode, h, 3168 3169 inline_version, inline_data, inline_len, 3169 3170 msg->middle, session, cap, issued); 3171 + if (realm) 3172 + ceph_put_snap_realm(mdsc, realm); 3170 3173 goto done_unlocked; 3171 3174 } 3172 3175 ··· 3198 3177 case CEPH_CAP_OP_GRANT: 3199 3178 __ceph_caps_issued(ci, &issued); 3200 3179 issued |= __ceph_caps_dirty(ci); 3201 - handle_cap_grant(mdsc, inode, h, NULL, 0, 3180 + handle_cap_grant(mdsc, inode, h, 3202 3181 inline_version, inline_data, inline_len, 3203 3182 msg->middle, session, cap, issued); 3204 3183 goto done_unlocked;
+25 -8
fs/ceph/dir.c
··· 26 26 * point by name. 27 27 */ 28 28 29 - const struct inode_operations ceph_dir_iops; 30 - const struct file_operations ceph_dir_fops; 31 29 const struct dentry_operations ceph_dentry_ops; 32 30 33 31 /* ··· 670 672 /* 671 673 * We created the item, then did a lookup, and found 672 674 * it was already linked to another inode we already 673 - * had in our cache (and thus got spliced). Link our 674 - * dentry to that inode, but don't hash it, just in 675 - * case the VFS wants to dereference it. 675 + * had in our cache (and thus got spliced). To not 676 + * confuse VFS (especially when inode is a directory), 677 + * we don't link our dentry to that inode, return an 678 + * error instead. 679 + * 680 + * This event should be rare and it happens only when 681 + * we talk to old MDS. Recent MDS does not send traceless 682 + * reply for request that creates new inode. 676 683 */ 677 - BUG_ON(!result->d_inode); 678 - d_instantiate(dentry, result->d_inode); 679 - return 0; 684 + d_drop(result); 685 + return -ESTALE; 680 686 } 681 687 return PTR_ERR(result); 682 688 } ··· 1337 1335 .fsync = ceph_dir_fsync, 1338 1336 }; 1339 1337 1338 + const struct file_operations ceph_snapdir_fops = { 1339 + .iterate = ceph_readdir, 1340 + .llseek = ceph_dir_llseek, 1341 + .open = ceph_open, 1342 + .release = ceph_release, 1343 + }; 1344 + 1340 1345 const struct inode_operations ceph_dir_iops = { 1341 1346 .lookup = ceph_lookup, 1342 1347 .permission = ceph_permission, ··· 1364 1355 .rename = ceph_rename, 1365 1356 .create = ceph_create, 1366 1357 .atomic_open = ceph_atomic_open, 1358 + }; 1359 + 1360 + const struct inode_operations ceph_snapdir_iops = { 1361 + .lookup = ceph_lookup, 1362 + .permission = ceph_permission, 1363 + .getattr = ceph_getattr, 1364 + .mkdir = ceph_mkdir, 1365 + .rmdir = ceph_unlink, 1367 1366 }; 1368 1367 1369 1368 const struct dentry_operations ceph_dentry_ops = {
+22 -15
fs/ceph/file.c
··· 275 275 err = ceph_mdsc_do_request(mdsc, 276 276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 277 277 req); 278 + err = ceph_handle_snapdir(req, dentry, err); 278 279 if (err) 279 280 goto out_req; 280 281 281 - err = ceph_handle_snapdir(req, dentry, err); 282 282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 283 283 err = ceph_handle_notrace_create(dir, dentry); 284 284 ··· 392 392 if (ret >= 0) { 393 393 int didpages; 394 394 if (was_short && (pos + ret < inode->i_size)) { 395 - u64 tmp = min(this_len - ret, 396 - inode->i_size - pos - ret); 395 + int zlen = min(this_len - ret, 396 + inode->i_size - pos - ret); 397 + int zoff = (o_direct ? buf_align : io_align) + 398 + read + ret; 397 399 dout(" zero gap %llu to %llu\n", 398 - pos + ret, pos + ret + tmp); 399 - ceph_zero_page_vector_range(page_align + read + ret, 400 - tmp, pages); 401 - ret += tmp; 400 + pos + ret, pos + ret + zlen); 401 + ceph_zero_page_vector_range(zoff, zlen, pages); 402 + ret += zlen; 402 403 } 403 404 404 405 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; ··· 879 878 880 879 i_size = i_size_read(inode); 881 880 if (retry_op == READ_INLINE) { 882 - /* does not support inline data > PAGE_SIZE */ 883 - if (i_size > PAGE_CACHE_SIZE) { 884 - ret = -EIO; 885 - } else if (iocb->ki_pos < i_size) { 881 + BUG_ON(ret > 0 || read > 0); 882 + if (iocb->ki_pos < i_size && 883 + iocb->ki_pos < PAGE_CACHE_SIZE) { 886 884 loff_t end = min_t(loff_t, i_size, 887 885 iocb->ki_pos + len); 886 + end = min_t(loff_t, end, PAGE_CACHE_SIZE); 888 887 if (statret < end) 889 888 zero_user_segment(page, statret, end); 890 889 ret = copy_page_to_iter(page, 891 890 iocb->ki_pos & ~PAGE_MASK, 892 891 end - iocb->ki_pos, to); 893 892 iocb->ki_pos += ret; 894 - } else { 895 - ret = 0; 893 + read += ret; 894 + } 895 + if (iocb->ki_pos < i_size && read < len) { 896 + size_t zlen = min_t(size_t, len - read, 897 + i_size - iocb->ki_pos); 898 + ret = iov_iter_zero(zlen, to); 899 + iocb->ki_pos += 
ret; 900 + read += ret; 896 901 } 897 902 __free_pages(page, 0); 898 - return ret; 903 + return read; 899 904 } 900 905 901 906 /* hit EOF or hole? */ 902 907 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 903 - ret < len) { 908 + ret < len) { 904 909 dout("sync_read hit hole, ppos %lld < size %lld" 905 910 ", reading more\n", iocb->ki_pos, 906 911 inode->i_size);
+22 -19
fs/ceph/inode.c
··· 82 82 inode->i_mode = parent->i_mode; 83 83 inode->i_uid = parent->i_uid; 84 84 inode->i_gid = parent->i_gid; 85 - inode->i_op = &ceph_dir_iops; 86 - inode->i_fop = &ceph_dir_fops; 85 + inode->i_op = &ceph_snapdir_iops; 86 + inode->i_fop = &ceph_snapdir_fops; 87 87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 88 88 ci->i_rbytes = 0; 89 89 return inode; ··· 838 838 ceph_vinop(inode), inode->i_mode); 839 839 } 840 840 841 - /* set dir completion flag? */ 842 - if (S_ISDIR(inode->i_mode) && 843 - ci->i_files == 0 && ci->i_subdirs == 0 && 844 - ceph_snap(inode) == CEPH_NOSNAP && 845 - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 846 - (issued & CEPH_CAP_FILE_EXCL) == 0 && 847 - !__ceph_dir_is_complete(ci)) { 848 - dout(" marking %p complete (empty)\n", inode); 849 - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), 850 - ci->i_ordered_count); 851 - } 852 - 853 841 /* were we issued a capability? */ 854 842 if (info->cap.caps) { 855 843 if (ceph_snap(inode) == CEPH_NOSNAP) { 844 + unsigned caps = le32_to_cpu(info->cap.caps); 856 845 ceph_add_cap(inode, session, 857 846 le64_to_cpu(info->cap.cap_id), 858 - cap_fmode, 859 - le32_to_cpu(info->cap.caps), 847 + cap_fmode, caps, 860 848 le32_to_cpu(info->cap.wanted), 861 849 le32_to_cpu(info->cap.seq), 862 850 le32_to_cpu(info->cap.mseq), 863 851 le64_to_cpu(info->cap.realm), 864 852 info->cap.flags, &new_cap); 853 + 854 + /* set dir completion flag? 
*/ 855 + if (S_ISDIR(inode->i_mode) && 856 + ci->i_files == 0 && ci->i_subdirs == 0 && 857 + (caps & CEPH_CAP_FILE_SHARED) && 858 + (issued & CEPH_CAP_FILE_EXCL) == 0 && 859 + !__ceph_dir_is_complete(ci)) { 860 + dout(" marking %p complete (empty)\n", inode); 861 + __ceph_dir_set_complete(ci, 862 + atomic_read(&ci->i_release_count), 863 + ci->i_ordered_count); 864 + } 865 + 865 866 wake = true; 866 867 } else { 867 868 dout(" %p got snap_caps %s\n", inode, ··· 1447 1446 } 1448 1447 1449 1448 if (!dn->d_inode) { 1450 - dn = splice_dentry(dn, in, NULL); 1451 - if (IS_ERR(dn)) { 1452 - err = PTR_ERR(dn); 1449 + struct dentry *realdn = splice_dentry(dn, in, NULL); 1450 + if (IS_ERR(realdn)) { 1451 + err = PTR_ERR(realdn); 1452 + d_drop(dn); 1453 1453 dn = NULL; 1454 1454 goto next_item; 1455 1455 } 1456 + dn = realdn; 1456 1457 } 1457 1458 1458 1459 di = dn->d_fsdata;
+93 -34
fs/ceph/mds_client.c
··· 480 480 mdsc->max_sessions = newmax; 481 481 } 482 482 mdsc->sessions[mds] = s; 483 + atomic_inc(&mdsc->num_sessions); 483 484 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 484 485 485 486 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ··· 504 503 mdsc->sessions[s->s_mds] = NULL; 505 504 ceph_con_close(&s->s_con); 506 505 ceph_put_mds_session(s); 506 + atomic_dec(&mdsc->num_sessions); 507 507 } 508 508 509 509 /* ··· 844 842 struct ceph_options *opt = mdsc->fsc->client->options; 845 843 void *p; 846 844 847 - const char* metadata[3][2] = { 845 + const char* metadata[][2] = { 848 846 {"hostname", utsname()->nodename}, 847 + {"kernel_version", utsname()->release}, 849 848 {"entity_id", opt->name ? opt->name : ""}, 850 849 {NULL, NULL} 851 850 }; ··· 1467 1464 return err; 1468 1465 } 1469 1466 1467 + static int check_cap_flush(struct inode *inode, u64 want_flush_seq) 1468 + { 1469 + struct ceph_inode_info *ci = ceph_inode(inode); 1470 + int ret; 1471 + spin_lock(&ci->i_ceph_lock); 1472 + if (ci->i_flushing_caps) 1473 + ret = ci->i_cap_flush_seq >= want_flush_seq; 1474 + else 1475 + ret = 1; 1476 + spin_unlock(&ci->i_ceph_lock); 1477 + return ret; 1478 + } 1479 + 1470 1480 /* 1471 1481 * flush all dirty inode data to disk. 
1472 1482 * 1473 1483 * returns true if we've flushed through want_flush_seq 1474 1484 */ 1475 - static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1485 + static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1476 1486 { 1477 - int mds, ret = 1; 1487 + int mds; 1478 1488 1479 1489 dout("check_cap_flush want %lld\n", want_flush_seq); 1480 1490 mutex_lock(&mdsc->mutex); 1481 - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { 1491 + for (mds = 0; mds < mdsc->max_sessions; mds++) { 1482 1492 struct ceph_mds_session *session = mdsc->sessions[mds]; 1493 + struct inode *inode = NULL; 1483 1494 1484 1495 if (!session) 1485 1496 continue; ··· 1506 1489 list_entry(session->s_cap_flushing.next, 1507 1490 struct ceph_inode_info, 1508 1491 i_flushing_item); 1509 - struct inode *inode = &ci->vfs_inode; 1510 1492 1511 - spin_lock(&ci->i_ceph_lock); 1512 - if (ci->i_cap_flush_seq <= want_flush_seq) { 1493 + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { 1513 1494 dout("check_cap_flush still flushing %p " 1514 - "seq %lld <= %lld to mds%d\n", inode, 1515 - ci->i_cap_flush_seq, want_flush_seq, 1516 - session->s_mds); 1517 - ret = 0; 1495 + "seq %lld <= %lld to mds%d\n", 1496 + &ci->vfs_inode, ci->i_cap_flush_seq, 1497 + want_flush_seq, session->s_mds); 1498 + inode = igrab(&ci->vfs_inode); 1518 1499 } 1519 - spin_unlock(&ci->i_ceph_lock); 1520 1500 } 1521 1501 mutex_unlock(&session->s_mutex); 1522 1502 ceph_put_mds_session(session); 1523 1503 1524 - if (!ret) 1525 - return ret; 1504 + if (inode) { 1505 + wait_event(mdsc->cap_flushing_wq, 1506 + check_cap_flush(inode, want_flush_seq)); 1507 + iput(inode); 1508 + } 1509 + 1526 1510 mutex_lock(&mdsc->mutex); 1527 1511 } 1528 1512 1529 1513 mutex_unlock(&mdsc->mutex); 1530 1514 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1531 - return ret; 1532 1515 } 1533 1516 1534 1517 /* ··· 1940 1923 head->num_releases = cpu_to_le16(releases); 1941 1924 1942 1925 
/* time stamp */ 1943 - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1926 + { 1927 + struct ceph_timespec ts; 1928 + ceph_encode_timespec(&ts, &req->r_stamp); 1929 + ceph_encode_copy(&p, &ts, sizeof(ts)); 1930 + } 1944 1931 1945 1932 BUG_ON(p > end); 1946 1933 msg->front.iov_len = p - msg->front.iov_base; ··· 2033 2012 2034 2013 /* time stamp */ 2035 2014 p = msg->front.iov_base + req->r_request_release_offset; 2036 - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 2015 + { 2016 + struct ceph_timespec ts; 2017 + ceph_encode_timespec(&ts, &req->r_stamp); 2018 + ceph_encode_copy(&p, &ts, sizeof(ts)); 2019 + } 2037 2020 2038 2021 msg->front.iov_len = p - msg->front.iov_base; 2039 2022 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ··· 2184 2159 p = rb_next(p); 2185 2160 if (req->r_got_unsafe) 2186 2161 continue; 2162 + if (req->r_attempts > 0) 2163 + continue; /* only new requests */ 2187 2164 if (req->r_session && 2188 2165 req->r_session->s_mds == mds) { 2189 2166 dout(" kicking tid %llu\n", req->r_tid); ··· 2313 2286 struct ceph_mds_request *req; 2314 2287 struct ceph_mds_reply_head *head = msg->front.iov_base; 2315 2288 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2289 + struct ceph_snap_realm *realm; 2316 2290 u64 tid; 2317 2291 int err, result; 2318 2292 int mds = session->s_mds; ··· 2429 2401 } 2430 2402 2431 2403 /* snap trace */ 2404 + realm = NULL; 2432 2405 if (rinfo->snapblob_len) { 2433 2406 down_write(&mdsc->snap_rwsem); 2434 2407 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2435 - rinfo->snapblob + rinfo->snapblob_len, 2436 - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2408 + rinfo->snapblob + rinfo->snapblob_len, 2409 + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, 2410 + &realm); 2437 2411 downgrade_write(&mdsc->snap_rwsem); 2438 2412 } else { 2439 2413 down_read(&mdsc->snap_rwsem); ··· 2453 2423 mutex_unlock(&req->r_fill_mutex); 2454 2424 2455 2425 up_read(&mdsc->snap_rwsem); 2426 + if (realm) 
2427 + ceph_put_snap_realm(mdsc, realm); 2456 2428 out_err: 2457 2429 mutex_lock(&mdsc->mutex); 2458 2430 if (!req->r_aborted) { ··· 2519 2487 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2520 2488 BUG_ON(req->r_err); 2521 2489 BUG_ON(req->r_got_result); 2490 + req->r_attempts = 0; 2522 2491 req->r_num_fwd = fwd_seq; 2523 2492 req->r_resend_mds = next_mds; 2524 2493 put_request_session(req); ··· 2613 2580 send_flushmsg_ack(mdsc, session, seq); 2614 2581 break; 2615 2582 2583 + case CEPH_SESSION_FORCE_RO: 2584 + dout("force_session_readonly %p\n", session); 2585 + spin_lock(&session->s_cap_lock); 2586 + session->s_readonly = true; 2587 + spin_unlock(&session->s_cap_lock); 2588 + wake_up_session_caps(session, 0); 2589 + break; 2590 + 2616 2591 default: 2617 2592 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2618 2593 WARN_ON(1); ··· 2651 2610 struct ceph_mds_session *session) 2652 2611 { 2653 2612 struct ceph_mds_request *req, *nreq; 2613 + struct rb_node *p; 2654 2614 int err; 2655 2615 2656 2616 dout("replay_unsafe_requests mds%d\n", session->s_mds); ··· 2662 2620 if (!err) { 2663 2621 ceph_msg_get(req->r_request); 2664 2622 ceph_con_send(&session->s_con, req->r_request); 2623 + } 2624 + } 2625 + 2626 + /* 2627 + * also re-send old requests when MDS enters reconnect stage. So that MDS 2628 + * can process completed request in clientreplay stage. 
2629 + */ 2630 + p = rb_first(&mdsc->request_tree); 2631 + while (p) { 2632 + req = rb_entry(p, struct ceph_mds_request, r_node); 2633 + p = rb_next(p); 2634 + if (req->r_got_unsafe) 2635 + continue; 2636 + if (req->r_attempts == 0) 2637 + continue; /* only old requests */ 2638 + if (req->r_session && 2639 + req->r_session->s_mds == session->s_mds) { 2640 + err = __prepare_send_request(mdsc, req, session->s_mds); 2641 + if (!err) { 2642 + ceph_msg_get(req->r_request); 2643 + ceph_con_send(&session->s_con, req->r_request); 2644 + } 2665 2645 } 2666 2646 } 2667 2647 mutex_unlock(&mdsc->mutex); ··· 2851 2787 spin_unlock(&session->s_gen_ttl_lock); 2852 2788 2853 2789 spin_lock(&session->s_cap_lock); 2790 + /* don't know if session is readonly */ 2791 + session->s_readonly = 0; 2854 2792 /* 2855 2793 * notify __ceph_remove_cap() that we are composing cap reconnect. 2856 2794 * If a cap get released before being added to the cap reconnect, ··· 2999 2933 mutex_unlock(&s->s_mutex); 3000 2934 s->s_state = CEPH_MDS_SESSION_RESTARTING; 3001 2935 } 3002 - 3003 - /* kick any requests waiting on the recovering mds */ 3004 - kick_requests(mdsc, i); 3005 2936 } else if (oldstate == newstate) { 3006 2937 continue; /* nothing new with this mds */ 3007 2938 } ··· 3358 3295 init_waitqueue_head(&mdsc->session_close_wq); 3359 3296 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3360 3297 mdsc->sessions = NULL; 3298 + atomic_set(&mdsc->num_sessions, 0); 3361 3299 mdsc->max_sessions = 0; 3362 3300 mdsc->stopping = 0; 3363 3301 init_rwsem(&mdsc->snap_rwsem); ··· 3492 3428 dout("sync\n"); 3493 3429 mutex_lock(&mdsc->mutex); 3494 3430 want_tid = mdsc->last_tid; 3495 - want_flush = mdsc->cap_flush_seq; 3496 3431 mutex_unlock(&mdsc->mutex); 3497 - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3498 3432 3499 3433 ceph_flush_dirty_caps(mdsc); 3434 + spin_lock(&mdsc->cap_dirty_lock); 3435 + want_flush = mdsc->cap_flush_seq; 3436 + spin_unlock(&mdsc->cap_dirty_lock); 3437 + 3438 + 
dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3500 3439 3501 3440 wait_unsafe_requests(mdsc, want_tid); 3502 - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3441 + wait_caps_flush(mdsc, want_flush); 3503 3442 } 3504 3443 3505 3444 /* ··· 3510 3443 */ 3511 3444 static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3512 3445 { 3513 - int i, n = 0; 3514 - 3515 3446 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3516 3447 return true; 3517 - 3518 - mutex_lock(&mdsc->mutex); 3519 - for (i = 0; i < mdsc->max_sessions; i++) 3520 - if (mdsc->sessions[i]) 3521 - n++; 3522 - mutex_unlock(&mdsc->mutex); 3523 - return n == 0; 3448 + return atomic_read(&mdsc->num_sessions) == 0; 3524 3449 } 3525 3450 3526 3451 /*
+2
fs/ceph/mds_client.h
··· 137 137 int s_nr_caps, s_trim_caps; 138 138 int s_num_cap_releases; 139 139 int s_cap_reconnect; 140 + int s_readonly; 140 141 struct list_head s_cap_releases; /* waiting cap_release messages */ 141 142 struct list_head s_cap_releases_done; /* ready to send */ 142 143 struct ceph_cap *s_cap_iterator; ··· 273 272 struct list_head waiting_for_map; 274 273 275 274 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 275 + atomic_t num_sessions; 276 276 int max_sessions; /* len of s_mds_sessions */ 277 277 int stopping; /* true if shutting down */ 278 278
+38 -16
fs/ceph/snap.c
··· 70 70 * safe. we do need to protect against concurrent empty list 71 71 * additions, however. 72 72 */ 73 - if (atomic_read(&realm->nref) == 0) { 73 + if (atomic_inc_return(&realm->nref) == 1) { 74 74 spin_lock(&mdsc->snap_empty_lock); 75 75 list_del_init(&realm->empty_item); 76 76 spin_unlock(&mdsc->snap_empty_lock); 77 77 } 78 - 79 - atomic_inc(&realm->nref); 80 78 } 81 79 82 80 static void __insert_snap_realm(struct rb_root *root, ··· 114 116 if (!realm) 115 117 return ERR_PTR(-ENOMEM); 116 118 117 - atomic_set(&realm->nref, 0); /* tree does not take a ref */ 119 + atomic_set(&realm->nref, 1); /* for caller */ 118 120 realm->ino = ino; 119 121 INIT_LIST_HEAD(&realm->children); 120 122 INIT_LIST_HEAD(&realm->child_item); ··· 132 134 * 133 135 * caller must hold snap_rwsem for write. 134 136 */ 135 - struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 136 - u64 ino) 137 + static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, 138 + u64 ino) 137 139 { 138 140 struct rb_node *n = mdsc->snap_realms.rb_node; 139 141 struct ceph_snap_realm *r; ··· 150 152 } 151 153 } 152 154 return NULL; 155 + } 156 + 157 + struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 158 + u64 ino) 159 + { 160 + struct ceph_snap_realm *r; 161 + r = __lookup_snap_realm(mdsc, ino); 162 + if (r) 163 + ceph_get_snap_realm(mdsc, r); 164 + return r; 153 165 } 154 166 155 167 static void __put_snap_realm(struct ceph_mds_client *mdsc, ··· 281 273 } 282 274 realm->parent_ino = parentino; 283 275 realm->parent = parent; 284 - ceph_get_snap_realm(mdsc, parent); 285 276 list_add(&realm->child_item, &parent->children); 286 277 return 1; 287 278 } ··· 638 631 * Caller must hold snap_rwsem for write. 
639 632 */ 640 633 int ceph_update_snap_trace(struct ceph_mds_client *mdsc, 641 - void *p, void *e, bool deletion) 634 + void *p, void *e, bool deletion, 635 + struct ceph_snap_realm **realm_ret) 642 636 { 643 637 struct ceph_mds_snap_realm *ri; /* encoded */ 644 638 __le64 *snaps; /* encoded */ 645 639 __le64 *prior_parent_snaps; /* encoded */ 646 - struct ceph_snap_realm *realm; 640 + struct ceph_snap_realm *realm = NULL; 641 + struct ceph_snap_realm *first_realm = NULL; 647 642 int invalidate = 0; 648 643 int err = -ENOMEM; 649 644 LIST_HEAD(dirty_realms); ··· 713 704 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 714 705 realm, invalidate, p, e); 715 706 707 + /* invalidate when we reach the _end_ (root) of the trace */ 708 + if (invalidate && p >= e) 709 + rebuild_snap_realms(realm); 710 + 711 + if (!first_realm) 712 + first_realm = realm; 713 + else 714 + ceph_put_snap_realm(mdsc, realm); 715 + 716 716 if (p < e) 717 717 goto more; 718 - 719 - /* invalidate when we reach the _end_ (root) of the trace */ 720 - if (invalidate) 721 - rebuild_snap_realms(realm); 722 718 723 719 /* 724 720 * queue cap snaps _after_ we've built the new snap contexts, ··· 735 721 queue_realm_cap_snaps(realm); 736 722 } 737 723 724 + if (realm_ret) 725 + *realm_ret = first_realm; 726 + else 727 + ceph_put_snap_realm(mdsc, first_realm); 728 + 738 729 __cleanup_empty_realms(mdsc); 739 730 return 0; 740 731 741 732 bad: 742 733 err = -EINVAL; 743 734 fail: 735 + if (realm && !IS_ERR(realm)) 736 + ceph_put_snap_realm(mdsc, realm); 737 + if (first_realm) 738 + ceph_put_snap_realm(mdsc, first_realm); 744 739 pr_err("update_snap_trace error %d\n", err); 745 740 return err; 746 741 } ··· 867 844 if (IS_ERR(realm)) 868 845 goto out; 869 846 } 870 - ceph_get_snap_realm(mdsc, realm); 871 847 872 848 dout("splitting snap_realm %llx %p\n", realm->ino, realm); 873 849 for (i = 0; i < num_split_inos; i++) { ··· 927 905 /* we may have taken some of the old realm's children. 
*/ 928 906 for (i = 0; i < num_split_realms; i++) { 929 907 struct ceph_snap_realm *child = 930 - ceph_lookup_snap_realm(mdsc, 908 + __lookup_snap_realm(mdsc, 931 909 le64_to_cpu(split_realms[i])); 932 910 if (!child) 933 911 continue; ··· 940 918 * snap, we can avoid queueing cap_snaps. 941 919 */ 942 920 ceph_update_snap_trace(mdsc, p, e, 943 - op == CEPH_SNAP_OP_DESTROY); 921 + op == CEPH_SNAP_OP_DESTROY, NULL); 944 922 945 923 if (op == CEPH_SNAP_OP_SPLIT) 946 924 /* we took a reference when we created the realm, above */
+4
fs/ceph/super.c
··· 414 414 seq_puts(m, ",noshare"); 415 415 if (opt->flags & CEPH_OPT_NOCRC) 416 416 seq_puts(m, ",nocrc"); 417 + if (opt->flags & CEPH_OPT_NOMSGAUTH) 418 + seq_puts(m, ",nocephx_require_signatures"); 419 + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) 420 + seq_puts(m, ",notcp_nodelay"); 417 421 418 422 if (opt->name) 419 423 seq_printf(m, ",name=%s", opt->name);
+4 -1
fs/ceph/super.h
··· 693 693 extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, 694 694 struct ceph_snap_realm *realm); 695 695 extern int ceph_update_snap_trace(struct ceph_mds_client *m, 696 - void *p, void *e, bool deletion); 696 + void *p, void *e, bool deletion, 697 + struct ceph_snap_realm **realm_ret); 697 698 extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 698 699 struct ceph_mds_session *session, 699 700 struct ceph_msg *msg); ··· 893 892 int ceph_uninline_data(struct file *filp, struct page *locked_page); 894 893 /* dir.c */ 895 894 extern const struct file_operations ceph_dir_fops; 895 + extern const struct file_operations ceph_snapdir_fops; 896 896 extern const struct inode_operations ceph_dir_iops; 897 + extern const struct inode_operations ceph_snapdir_iops; 897 898 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 898 899 ceph_snapdir_dentry_ops; 899 900
+1 -36
include/linux/ceph/ceph_fs.h
··· 158 158 }; 159 159 160 160 161 - /* pool operations */ 162 - enum { 163 - POOL_OP_CREATE = 0x01, 164 - POOL_OP_DELETE = 0x02, 165 - POOL_OP_AUID_CHANGE = 0x03, 166 - POOL_OP_CREATE_SNAP = 0x11, 167 - POOL_OP_DELETE_SNAP = 0x12, 168 - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, 169 - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, 170 - }; 171 - 172 161 struct ceph_mon_request_header { 173 162 __le64 have_version; 174 163 __le16 session_mon; ··· 178 189 struct ceph_fsid fsid; 179 190 __le64 version; 180 191 struct ceph_statfs st; 181 - } __attribute__ ((packed)); 182 - 183 - const char *ceph_pool_op_name(int op); 184 - 185 - struct ceph_mon_poolop { 186 - struct ceph_mon_request_header monhdr; 187 - struct ceph_fsid fsid; 188 - __le32 pool; 189 - __le32 op; 190 - __le64 auid; 191 - __le64 snapid; 192 - __le32 name_len; 193 - } __attribute__ ((packed)); 194 - 195 - struct ceph_mon_poolop_reply { 196 - struct ceph_mon_request_header monhdr; 197 - struct ceph_fsid fsid; 198 - __le32 reply_code; 199 - __le32 epoch; 200 - char has_data; 201 - char data[0]; 202 - } __attribute__ ((packed)); 203 - 204 - struct ceph_mon_unmanaged_snap { 205 - __le64 snapid; 206 192 } __attribute__ ((packed)); 207 193 208 194 struct ceph_osd_getmap { ··· 271 307 CEPH_SESSION_RECALL_STATE, 272 308 CEPH_SESSION_FLUSHMSG, 273 309 CEPH_SESSION_FLUSHMSG_ACK, 310 + CEPH_SESSION_FORCE_RO, 274 311 }; 275 312 276 313 extern const char *ceph_session_op_name(int op);
+2 -1
include/linux/ceph/libceph.h
··· 30 30 #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ 31 31 #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ 32 32 #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ 33 + #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ 33 34 34 - #define CEPH_OPT_DEFAULT (0) 35 + #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) 35 36 36 37 #define ceph_set_opt(client, opt) \ 37 38 (client)->options->flags |= CEPH_OPT_##opt;
+3 -1
include/linux/ceph/messenger.h
··· 57 57 58 58 atomic_t stopping; 59 59 bool nocrc; 60 + bool tcp_nodelay; 60 61 61 62 /* 62 63 * the global_seq counts connections i (attempt to) initiate ··· 265 264 struct ceph_entity_addr *myaddr, 266 265 u64 supported_features, 267 266 u64 required_features, 268 - bool nocrc); 267 + bool nocrc, 268 + bool tcp_nodelay); 269 269 270 270 extern void ceph_con_init(struct ceph_connection *con, void *private, 271 271 const struct ceph_connection_operations *ops,
+1 -8
include/linux/ceph/mon_client.h
··· 40 40 }; 41 41 42 42 /* 43 - * ceph_mon_generic_request is being used for the statfs, poolop and 43 + * ceph_mon_generic_request is being used for the statfs and 44 44 * mon_get_version requests which are being done a bit differently 45 45 * because we need to get data back to the caller 46 46 */ ··· 50 50 struct rb_node node; 51 51 int result; 52 52 void *buf; 53 - int buf_len; 54 53 struct completion completion; 55 54 struct ceph_msg *request; /* original request */ 56 55 struct ceph_msg *reply; /* and reply */ ··· 115 116 extern int ceph_monc_open_session(struct ceph_mon_client *monc); 116 117 117 118 extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 118 - 119 - extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, 120 - u32 pool, u64 *snapid); 121 - 122 - extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 123 - u32 pool, u64 snapid); 124 119 125 120 #endif
+15 -1
net/ceph/ceph_common.c
··· 239 239 Opt_nocrc, 240 240 Opt_cephx_require_signatures, 241 241 Opt_nocephx_require_signatures, 242 + Opt_tcp_nodelay, 243 + Opt_notcp_nodelay, 242 244 }; 243 245 244 246 static match_table_t opt_tokens = { ··· 261 259 {Opt_nocrc, "nocrc"}, 262 260 {Opt_cephx_require_signatures, "cephx_require_signatures"}, 263 261 {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, 262 + {Opt_tcp_nodelay, "tcp_nodelay"}, 263 + {Opt_notcp_nodelay, "notcp_nodelay"}, 264 264 {-1, NULL} 265 265 }; 266 266 ··· 461 457 case Opt_nocrc: 462 458 opt->flags |= CEPH_OPT_NOCRC; 463 459 break; 460 + 464 461 case Opt_cephx_require_signatures: 465 462 opt->flags &= ~CEPH_OPT_NOMSGAUTH; 466 463 break; 467 464 case Opt_nocephx_require_signatures: 468 465 opt->flags |= CEPH_OPT_NOMSGAUTH; 466 + break; 467 + 468 + case Opt_tcp_nodelay: 469 + opt->flags |= CEPH_OPT_TCP_NODELAY; 470 + break; 471 + case Opt_notcp_nodelay: 472 + opt->flags &= ~CEPH_OPT_TCP_NODELAY; 469 473 break; 470 474 471 475 default: ··· 530 518 /* msgr */ 531 519 if (ceph_test_opt(client, MYIP)) 532 520 myaddr = &client->options->my_addr; 521 + 533 522 ceph_messenger_init(&client->msgr, myaddr, 534 523 client->supported_features, 535 524 client->required_features, 536 - ceph_test_opt(client, NOCRC)); 525 + ceph_test_opt(client, NOCRC), 526 + ceph_test_opt(client, TCP_NODELAY)); 537 527 538 528 /* subsystems */ 539 529 err = ceph_monc_init(&client->monc, client);
-14
net/ceph/ceph_strings.c
··· 42 42 return "???"; 43 43 } 44 44 } 45 - 46 - const char *ceph_pool_op_name(int op) 47 - { 48 - switch (op) { 49 - case POOL_OP_CREATE: return "create"; 50 - case POOL_OP_DELETE: return "delete"; 51 - case POOL_OP_AUID_CHANGE: return "auid change"; 52 - case POOL_OP_CREATE_SNAP: return "create snap"; 53 - case POOL_OP_DELETE_SNAP: return "delete snap"; 54 - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; 55 - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; 56 - } 57 - return "???"; 58 - }
-2
net/ceph/debugfs.c
··· 127 127 op = le16_to_cpu(req->request->hdr.type); 128 128 if (op == CEPH_MSG_STATFS) 129 129 seq_printf(s, "%llu statfs\n", req->tid); 130 - else if (op == CEPH_MSG_POOLOP) 131 - seq_printf(s, "%llu poolop\n", req->tid); 132 130 else if (op == CEPH_MSG_MON_GET_VERSION) 133 131 seq_printf(s, "%llu mon_get_version", req->tid); 134 132 else
+13 -1
net/ceph/messenger.c
··· 510 510 return ret; 511 511 } 512 512 513 + if (con->msgr->tcp_nodelay) { 514 + int optval = 1; 515 + 516 + ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, 517 + (char *)&optval, sizeof(optval)); 518 + if (ret) 519 + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", 520 + ret); 521 + } 522 + 513 523 sk_set_memalloc(sock->sk); 514 524 515 525 con->sock = sock; ··· 2932 2922 struct ceph_entity_addr *myaddr, 2933 2923 u64 supported_features, 2934 2924 u64 required_features, 2935 - bool nocrc) 2925 + bool nocrc, 2926 + bool tcp_nodelay) 2936 2927 { 2937 2928 msgr->supported_features = supported_features; 2938 2929 msgr->required_features = required_features; ··· 2948 2937 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); 2949 2938 encode_my_addr(msgr); 2950 2939 msgr->nocrc = nocrc; 2940 + msgr->tcp_nodelay = tcp_nodelay; 2951 2941 2952 2942 atomic_set(&msgr->stopping, 0); 2953 2943
+5 -134
net/ceph/mon_client.c
··· 410 410 } 411 411 412 412 /* 413 - * generic requests (e.g., statfs, poolop) 413 + * generic requests (currently statfs, mon_get_version) 414 414 */ 415 415 static struct ceph_mon_generic_request *__lookup_generic_req( 416 416 struct ceph_mon_client *monc, u64 tid) ··· 569 569 return; 570 570 571 571 bad: 572 - pr_err("corrupt generic reply, tid %llu\n", tid); 572 + pr_err("corrupt statfs reply, tid %llu\n", tid); 573 573 ceph_msg_dump(msg); 574 574 } 575 575 ··· 588 588 589 589 kref_init(&req->kref); 590 590 req->buf = buf; 591 - req->buf_len = sizeof(*buf); 592 591 init_completion(&req->completion); 593 592 594 593 err = -ENOMEM; ··· 610 611 err = do_generic_request(monc, req); 611 612 612 613 out: 613 - kref_put(&req->kref, release_generic_request); 614 + put_generic_request(req); 614 615 return err; 615 616 } 616 617 EXPORT_SYMBOL(ceph_monc_do_statfs); ··· 646 647 647 648 return; 648 649 bad: 649 - pr_err("corrupt mon_get_version reply\n"); 650 + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 650 651 ceph_msg_dump(msg); 651 652 } 652 653 ··· 669 670 670 671 kref_init(&req->kref); 671 672 req->buf = newest; 672 - req->buf_len = sizeof(*newest); 673 673 init_completion(&req->completion); 674 674 675 675 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, ··· 699 701 700 702 mutex_unlock(&monc->mutex); 701 703 out: 702 - kref_put(&req->kref, release_generic_request); 704 + put_generic_request(req); 703 705 return err; 704 706 } 705 707 EXPORT_SYMBOL(ceph_monc_do_get_version); 706 - 707 - /* 708 - * pool ops 709 - */ 710 - static int get_poolop_reply_buf(const char *src, size_t src_len, 711 - char *dst, size_t dst_len) 712 - { 713 - u32 buf_len; 714 - 715 - if (src_len != sizeof(u32) + dst_len) 716 - return -EINVAL; 717 - 718 - buf_len = le32_to_cpu(*(__le32 *)src); 719 - if (buf_len != dst_len) 720 - return -EINVAL; 721 - 722 - memcpy(dst, src + sizeof(u32), dst_len); 723 - return 0; 724 - } 725 - 726 - static void handle_poolop_reply(struct 
ceph_mon_client *monc, 727 - struct ceph_msg *msg) 728 - { 729 - struct ceph_mon_generic_request *req; 730 - struct ceph_mon_poolop_reply *reply = msg->front.iov_base; 731 - u64 tid = le64_to_cpu(msg->hdr.tid); 732 - 733 - if (msg->front.iov_len < sizeof(*reply)) 734 - goto bad; 735 - dout("handle_poolop_reply %p tid %llu\n", msg, tid); 736 - 737 - mutex_lock(&monc->mutex); 738 - req = __lookup_generic_req(monc, tid); 739 - if (req) { 740 - if (req->buf_len && 741 - get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), 742 - msg->front.iov_len - sizeof(*reply), 743 - req->buf, req->buf_len) < 0) { 744 - mutex_unlock(&monc->mutex); 745 - goto bad; 746 - } 747 - req->result = le32_to_cpu(reply->reply_code); 748 - get_generic_request(req); 749 - } 750 - mutex_unlock(&monc->mutex); 751 - if (req) { 752 - complete(&req->completion); 753 - put_generic_request(req); 754 - } 755 - return; 756 - 757 - bad: 758 - pr_err("corrupt generic reply, tid %llu\n", tid); 759 - ceph_msg_dump(msg); 760 - } 761 - 762 - /* 763 - * Do a synchronous pool op. 
764 - */ 765 - static int do_poolop(struct ceph_mon_client *monc, u32 op, 766 - u32 pool, u64 snapid, 767 - char *buf, int len) 768 - { 769 - struct ceph_mon_generic_request *req; 770 - struct ceph_mon_poolop *h; 771 - int err; 772 - 773 - req = kzalloc(sizeof(*req), GFP_NOFS); 774 - if (!req) 775 - return -ENOMEM; 776 - 777 - kref_init(&req->kref); 778 - req->buf = buf; 779 - req->buf_len = len; 780 - init_completion(&req->completion); 781 - 782 - err = -ENOMEM; 783 - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 784 - true); 785 - if (!req->request) 786 - goto out; 787 - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 788 - true); 789 - if (!req->reply) 790 - goto out; 791 - 792 - /* fill out request */ 793 - req->request->hdr.version = cpu_to_le16(2); 794 - h = req->request->front.iov_base; 795 - h->monhdr.have_version = 0; 796 - h->monhdr.session_mon = cpu_to_le16(-1); 797 - h->monhdr.session_mon_tid = 0; 798 - h->fsid = monc->monmap->fsid; 799 - h->pool = cpu_to_le32(pool); 800 - h->op = cpu_to_le32(op); 801 - h->auid = 0; 802 - h->snapid = cpu_to_le64(snapid); 803 - h->name_len = 0; 804 - 805 - err = do_generic_request(monc, req); 806 - 807 - out: 808 - kref_put(&req->kref, release_generic_request); 809 - return err; 810 - } 811 - 812 - int ceph_monc_create_snapid(struct ceph_mon_client *monc, 813 - u32 pool, u64 *snapid) 814 - { 815 - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 816 - pool, 0, (char *)snapid, sizeof(*snapid)); 817 - 818 - } 819 - EXPORT_SYMBOL(ceph_monc_create_snapid); 820 - 821 - int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 822 - u32 pool, u64 snapid) 823 - { 824 - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 825 - pool, snapid, NULL, 0); 826 - 827 - } 828 708 829 709 /* 830 710 * Resend pending generic requests. 
··· 988 1112 handle_get_version_reply(monc, msg); 989 1113 break; 990 1114 991 - case CEPH_MSG_POOLOP_REPLY: 992 - handle_poolop_reply(monc, msg); 993 - break; 994 - 995 1115 case CEPH_MSG_MON_MAP: 996 1116 ceph_monc_handle_map(monc, msg); 997 1117 break; ··· 1026 1154 case CEPH_MSG_MON_SUBSCRIBE_ACK: 1027 1155 m = ceph_msg_get(monc->m_subscribe_ack); 1028 1156 break; 1029 - case CEPH_MSG_POOLOP_REPLY: 1030 1157 case CEPH_MSG_STATFS_REPLY: 1031 1158 return get_generic_reply(con, hdr, skip); 1032 1159 case CEPH_MSG_AUTH_REPLY:
+21 -10
net/ceph/osd_client.c
··· 1035 1035 { 1036 1036 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1037 1037 atomic_read(&osd->o_ref) - 1); 1038 - if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 1038 + if (atomic_dec_and_test(&osd->o_ref)) { 1039 1039 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 1040 1040 1041 - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); 1041 + if (osd->o_auth.authorizer) 1042 + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); 1042 1043 kfree(osd); 1043 1044 } 1044 1045 } ··· 1049 1048 */ 1050 1049 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1051 1050 { 1052 - dout("__remove_osd %p\n", osd); 1051 + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1053 1052 WARN_ON(!list_empty(&osd->o_requests)); 1054 1053 WARN_ON(!list_empty(&osd->o_linger_requests)); 1055 1054 1056 - rb_erase(&osd->o_node, &osdc->osds); 1057 1055 list_del_init(&osd->o_osd_lru); 1058 - ceph_con_close(&osd->o_con); 1059 - put_osd(osd); 1056 + rb_erase(&osd->o_node, &osdc->osds); 1057 + RB_CLEAR_NODE(&osd->o_node); 1058 + } 1059 + 1060 + static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1061 + { 1062 + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1063 + 1064 + if (!RB_EMPTY_NODE(&osd->o_node)) { 1065 + ceph_con_close(&osd->o_con); 1066 + __remove_osd(osdc, osd); 1067 + put_osd(osd); 1068 + } 1060 1069 } 1061 1070 1062 1071 static void remove_all_osds(struct ceph_osd_client *osdc) ··· 1076 1065 while (!RB_EMPTY_ROOT(&osdc->osds)) { 1077 1066 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), 1078 1067 struct ceph_osd, o_node); 1079 - __remove_osd(osdc, osd); 1068 + remove_osd(osdc, osd); 1080 1069 } 1081 1070 mutex_unlock(&osdc->request_mutex); 1082 1071 } ··· 1117 1106 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1118 1107 if (time_before(jiffies, osd->lru_ttl)) 1119 1108 break; 1120 - __remove_osd(osdc, osd); 1109 + remove_osd(osdc, osd); 1121 1110 } 
1122 1111 mutex_unlock(&osdc->request_mutex); 1123 1112 } ··· 1132 1121 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1133 1122 if (list_empty(&osd->o_requests) && 1134 1123 list_empty(&osd->o_linger_requests)) { 1135 - __remove_osd(osdc, osd); 1136 - 1124 + remove_osd(osdc, osd); 1137 1125 return -ENODEV; 1138 1126 } 1139 1127 ··· 1936 1926 { 1937 1927 struct rb_node *p, *n; 1938 1928 1929 + dout("%s %p\n", __func__, osdc); 1939 1930 for (p = rb_first(&osdc->osds); p; p = n) { 1940 1931 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 1941 1932