Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph changes from Sage Weil:
"On the RBD side, there is a conversion to blk-mq from Christoph,
several long-standing bug fixes from Ilya, and some cleanup from
Rickard Strandqvist.

On the CephFS side there is a long list of fixes from Zheng, including
improved session handling, a few IO path fixes, some dcache management
correctness fixes, and several blocking while !TASK_RUNNING fixes.

The core code gets a few cleanups and Chaitanya has added support for
TCP_NODELAY (which has been used on the server side for ages but we
somehow missed on the kernel client).

There is also an update to MAINTAINERS to fix up some email addresses
and reflect that Ilya and Zheng are doing most of the maintenance for
RBD and CephFS these days. Do not be surprised to see a pull request
come from one of them in the future if I am unavailable for some
reason"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (27 commits)
MAINTAINERS: update Ceph and RBD maintainers
libceph: kfree() in put_osd() shouldn't depend on authorizer
libceph: fix double __remove_osd() problem
rbd: convert to blk-mq
ceph: return error for traceless reply race
ceph: fix dentry leaks
ceph: re-send requests when MDS enters reconnecting stage
ceph: show nocephx_require_signatures and notcp_nodelay options
libceph: tcp_nodelay support
rbd: do not treat standalone as flatten
ceph: fix atomic_open snapdir
ceph: properly mark empty directory as complete
client: include kernel version in client metadata
ceph: provide separate {inode,file}_operations for snapdir
ceph: fix request time stamp encoding
ceph: fix reading inline data when i_size > PAGE_SIZE
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_close_sessions)
ceph: avoid block operation when !TASK_RUNNING (ceph_get_caps)
ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_sync)
rbd: fix error paths in rbd_dev_refresh()
...

+444 -488
+4 -3
MAINTAINERS
··· 2433 2433 F: arch/powerpc/platforms/cell/ 2434 2434 2435 2435 CEPH DISTRIBUTED FILE SYSTEM CLIENT 2436 - M: Sage Weil <sage@inktank.com> 2436 + M: Yan, Zheng <zyan@redhat.com> 2437 + M: Sage Weil <sage@redhat.com> 2437 2438 L: ceph-devel@vger.kernel.org 2438 2439 W: http://ceph.com/ 2439 2440 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git ··· 7999 7998 F: drivers/net/wireless/ath/wcn36xx/ 8000 7999 8001 8000 RADOS BLOCK DEVICE (RBD) 8002 - M: Yehuda Sadeh <yehuda@inktank.com> 8003 - M: Sage Weil <sage@inktank.com> 8001 + M: Ilya Dryomov <idryomov@gmail.com> 8002 + M: Sage Weil <sage@redhat.com> 8004 8003 M: Alex Elder <elder@kernel.org> 8005 8004 M: ceph-devel@vger.kernel.org 8006 8005 W: http://ceph.com/
+83 -110
drivers/block/rbd.c
··· 38 38 #include <linux/kernel.h> 39 39 #include <linux/device.h> 40 40 #include <linux/module.h> 41 + #include <linux/blk-mq.h> 41 42 #include <linux/fs.h> 42 43 #include <linux/blkdev.h> 43 44 #include <linux/slab.h> ··· 341 340 342 341 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 343 342 344 - struct list_head rq_queue; /* incoming rq queue */ 345 343 spinlock_t lock; /* queue, flags, open_count */ 346 - struct work_struct rq_work; 347 344 348 345 struct rbd_image_header header; 349 346 unsigned long flags; /* possibly lock protected */ ··· 358 359 u64 parent_overlap; 359 360 atomic_t parent_ref; 360 361 struct rbd_device *parent; 362 + 363 + /* Block layer tags. */ 364 + struct blk_mq_tag_set tag_set; 361 365 362 366 /* protects updating the header */ 363 367 struct rw_semaphore header_rwsem; ··· 1819 1817 1820 1818 /* 1821 1819 * We support a 64-bit length, but ultimately it has to be 1822 - * passed to blk_end_request(), which takes an unsigned int. 1820 + * passed to the block layer, which just supports a 32-bit 1821 + * length field. 
1823 1822 */ 1824 1823 obj_request->xferred = osd_req->r_reply_op_len[0]; 1825 1824 rbd_assert(obj_request->xferred < (u64)UINT_MAX); ··· 2278 2275 more = obj_request->which < img_request->obj_request_count - 1; 2279 2276 } else { 2280 2277 rbd_assert(img_request->rq != NULL); 2281 - more = blk_end_request(img_request->rq, result, xferred); 2278 + 2279 + more = blk_update_request(img_request->rq, result, xferred); 2280 + if (!more) 2281 + __blk_mq_end_request(img_request->rq, result); 2282 2282 } 2283 2283 2284 2284 return more; ··· 3310 3304 return ret; 3311 3305 } 3312 3306 3313 - static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) 3307 + static void rbd_queue_workfn(struct work_struct *work) 3314 3308 { 3309 + struct request *rq = blk_mq_rq_from_pdu(work); 3310 + struct rbd_device *rbd_dev = rq->q->queuedata; 3315 3311 struct rbd_img_request *img_request; 3316 3312 struct ceph_snap_context *snapc = NULL; 3317 3313 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; ··· 3321 3313 enum obj_operation_type op_type; 3322 3314 u64 mapping_size; 3323 3315 int result; 3316 + 3317 + if (rq->cmd_type != REQ_TYPE_FS) { 3318 + dout("%s: non-fs request type %d\n", __func__, 3319 + (int) rq->cmd_type); 3320 + result = -EIO; 3321 + goto err; 3322 + } 3324 3323 3325 3324 if (rq->cmd_flags & REQ_DISCARD) 3326 3325 op_type = OBJ_OP_DISCARD; ··· 3374 3359 goto err_rq; /* Shouldn't happen */ 3375 3360 } 3376 3361 3362 + blk_mq_start_request(rq); 3363 + 3377 3364 down_read(&rbd_dev->header_rwsem); 3378 3365 mapping_size = rbd_dev->mapping.size; 3379 3366 if (op_type != OBJ_OP_READ) { ··· 3421 3404 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3422 3405 obj_op_name(op_type), length, offset, result); 3423 3406 ceph_put_snap_context(snapc); 3424 - blk_end_request_all(rq, result); 3407 + err: 3408 + blk_mq_end_request(rq, result); 3425 3409 } 3426 3410 3427 - static void rbd_request_workfn(struct work_struct *work) 3411 + static int rbd_queue_rq(struct 
blk_mq_hw_ctx *hctx, 3412 + const struct blk_mq_queue_data *bd) 3428 3413 { 3429 - struct rbd_device *rbd_dev = 3430 - container_of(work, struct rbd_device, rq_work); 3431 - struct request *rq, *next; 3432 - LIST_HEAD(requests); 3414 + struct request *rq = bd->rq; 3415 + struct work_struct *work = blk_mq_rq_to_pdu(rq); 3433 3416 3434 - spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ 3435 - list_splice_init(&rbd_dev->rq_queue, &requests); 3436 - spin_unlock_irq(&rbd_dev->lock); 3437 - 3438 - list_for_each_entry_safe(rq, next, &requests, queuelist) { 3439 - list_del_init(&rq->queuelist); 3440 - rbd_handle_request(rbd_dev, rq); 3441 - } 3442 - } 3443 - 3444 - /* 3445 - * Called with q->queue_lock held and interrupts disabled, possibly on 3446 - * the way to schedule(). Do not sleep here! 3447 - */ 3448 - static void rbd_request_fn(struct request_queue *q) 3449 - { 3450 - struct rbd_device *rbd_dev = q->queuedata; 3451 - struct request *rq; 3452 - int queued = 0; 3453 - 3454 - rbd_assert(rbd_dev); 3455 - 3456 - while ((rq = blk_fetch_request(q))) { 3457 - /* Ignore any non-FS requests that filter through. 
*/ 3458 - if (rq->cmd_type != REQ_TYPE_FS) { 3459 - dout("%s: non-fs request type %d\n", __func__, 3460 - (int) rq->cmd_type); 3461 - __blk_end_request_all(rq, 0); 3462 - continue; 3463 - } 3464 - 3465 - list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); 3466 - queued++; 3467 - } 3468 - 3469 - if (queued) 3470 - queue_work(rbd_wq, &rbd_dev->rq_work); 3417 + queue_work(rbd_wq, work); 3418 + return BLK_MQ_RQ_QUEUE_OK; 3471 3419 } 3472 3420 3473 3421 /* ··· 3493 3511 del_gendisk(disk); 3494 3512 if (disk->queue) 3495 3513 blk_cleanup_queue(disk->queue); 3514 + blk_mq_free_tag_set(&rbd_dev->tag_set); 3496 3515 } 3497 3516 put_disk(disk); 3498 3517 } ··· 3677 3694 3678 3695 ret = rbd_dev_header_info(rbd_dev); 3679 3696 if (ret) 3680 - return ret; 3697 + goto out; 3681 3698 3682 3699 /* 3683 3700 * If there is a parent, see if it has disappeared due to the ··· 3686 3703 if (rbd_dev->parent) { 3687 3704 ret = rbd_dev_v2_parent_info(rbd_dev); 3688 3705 if (ret) 3689 - return ret; 3706 + goto out; 3690 3707 } 3691 3708 3692 3709 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 3693 - if (rbd_dev->mapping.size != rbd_dev->header.image_size) 3694 - rbd_dev->mapping.size = rbd_dev->header.image_size; 3710 + rbd_dev->mapping.size = rbd_dev->header.image_size; 3695 3711 } else { 3696 3712 /* validate mapped snapshot's EXISTS flag */ 3697 3713 rbd_exists_validate(rbd_dev); 3698 3714 } 3699 3715 3716 + out: 3700 3717 up_write(&rbd_dev->header_rwsem); 3701 - 3702 - if (mapping_size != rbd_dev->mapping.size) 3718 + if (!ret && mapping_size != rbd_dev->mapping.size) 3703 3719 rbd_dev_update_size(rbd_dev); 3704 3720 3721 + return ret; 3722 + } 3723 + 3724 + static int rbd_init_request(void *data, struct request *rq, 3725 + unsigned int hctx_idx, unsigned int request_idx, 3726 + unsigned int numa_node) 3727 + { 3728 + struct work_struct *work = blk_mq_rq_to_pdu(rq); 3729 + 3730 + INIT_WORK(work, rbd_queue_workfn); 3705 3731 return 0; 3706 3732 } 3733 + 3734 + static struct blk_mq_ops 
rbd_mq_ops = { 3735 + .queue_rq = rbd_queue_rq, 3736 + .map_queue = blk_mq_map_queue, 3737 + .init_request = rbd_init_request, 3738 + }; 3707 3739 3708 3740 static int rbd_init_disk(struct rbd_device *rbd_dev) 3709 3741 { 3710 3742 struct gendisk *disk; 3711 3743 struct request_queue *q; 3712 3744 u64 segment_size; 3745 + int err; 3713 3746 3714 3747 /* create gendisk info */ 3715 3748 disk = alloc_disk(single_major ? ··· 3743 3744 disk->fops = &rbd_bd_ops; 3744 3745 disk->private_data = rbd_dev; 3745 3746 3746 - q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3747 - if (!q) 3747 + memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 3748 + rbd_dev->tag_set.ops = &rbd_mq_ops; 3749 + rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; 3750 + rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 3751 + rbd_dev->tag_set.flags = 3752 + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 3753 + rbd_dev->tag_set.nr_hw_queues = 1; 3754 + rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 3755 + 3756 + err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 3757 + if (err) 3748 3758 goto out_disk; 3759 + 3760 + q = blk_mq_init_queue(&rbd_dev->tag_set); 3761 + if (IS_ERR(q)) { 3762 + err = PTR_ERR(q); 3763 + goto out_tag_set; 3764 + } 3749 3765 3750 3766 /* We use the default size, but let's be explicit about it. 
*/ 3751 3767 blk_queue_physical_block_size(q, SECTOR_SIZE); ··· 3787 3773 rbd_dev->disk = disk; 3788 3774 3789 3775 return 0; 3776 + out_tag_set: 3777 + blk_mq_free_tag_set(&rbd_dev->tag_set); 3790 3778 out_disk: 3791 3779 put_disk(disk); 3792 - 3793 - return -ENOMEM; 3780 + return err; 3794 3781 } 3795 3782 3796 3783 /* ··· 4048 4033 return NULL; 4049 4034 4050 4035 spin_lock_init(&rbd_dev->lock); 4051 - INIT_LIST_HEAD(&rbd_dev->rq_queue); 4052 - INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); 4053 4036 rbd_dev->flags = 0; 4054 4037 atomic_set(&rbd_dev->parent_ref, 0); 4055 4038 INIT_LIST_HEAD(&rbd_dev->node); ··· 4287 4274 } 4288 4275 4289 4276 /* 4290 - * We always update the parent overlap. If it's zero we 4291 - * treat it specially. 4277 + * We always update the parent overlap. If it's zero we issue 4278 + * a warning, as we will proceed as if there was no parent. 4292 4279 */ 4293 - rbd_dev->parent_overlap = overlap; 4294 4280 if (!overlap) { 4295 - 4296 - /* A null parent_spec indicates it's the initial probe */ 4297 - 4298 4281 if (parent_spec) { 4299 - /* 4300 - * The overlap has become zero, so the clone 4301 - * must have been resized down to 0 at some 4302 - * point. Treat this the same as a flatten. 4303 - */ 4304 - rbd_dev_parent_put(rbd_dev); 4305 - pr_info("%s: clone image now standalone\n", 4306 - rbd_dev->disk->disk_name); 4282 + /* refresh, careful to warn just once */ 4283 + if (rbd_dev->parent_overlap) 4284 + rbd_warn(rbd_dev, 4285 + "clone now standalone (overlap became 0)"); 4307 4286 } else { 4308 - /* 4309 - * For the initial probe, if we find the 4310 - * overlap is zero we just pretend there was 4311 - * no parent image. 
4312 - */ 4313 - rbd_warn(rbd_dev, "ignoring parent with overlap 0"); 4287 + /* initial probe */ 4288 + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 4314 4289 } 4315 4290 } 4291 + rbd_dev->parent_overlap = overlap; 4292 + 4316 4293 out: 4317 4294 ret = 0; 4318 4295 out_err: ··· 4771 4768 *buf += strspn(*buf, spaces); /* Find start of token */ 4772 4769 4773 4770 return strcspn(*buf, spaces); /* Return token length */ 4774 - } 4775 - 4776 - /* 4777 - * Finds the next token in *buf, and if the provided token buffer is 4778 - * big enough, copies the found token into it. The result, if 4779 - * copied, is guaranteed to be terminated with '\0'. Note that *buf 4780 - * must be terminated with '\0' on entry. 4781 - * 4782 - * Returns the length of the token found (not including the '\0'). 4783 - * Return value will be 0 if no token is found, and it will be >= 4784 - * token_size if the token would not fit. 4785 - * 4786 - * The *buf pointer will be updated to point beyond the end of the 4787 - * found token. Note that this occurs even if the token buffer is 4788 - * too small to hold it. 4789 - */ 4790 - static inline size_t copy_token(const char **buf, 4791 - char *token, 4792 - size_t token_size) 4793 - { 4794 - size_t len; 4795 - 4796 - len = next_token(buf); 4797 - if (len < token_size) { 4798 - memcpy(token, *buf, len); 4799 - *(token + len) = '\0'; 4800 - } 4801 - *buf += len; 4802 - 4803 - return len; 4804 4771 } 4805 4772 4806 4773 /*
-14
fs/ceph/acl.c
··· 40 40 spin_unlock(&ci->i_ceph_lock); 41 41 } 42 42 43 - static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, 44 - int type) 45 - { 46 - struct ceph_inode_info *ci = ceph_inode(inode); 47 - struct posix_acl *acl = ACL_NOT_CACHED; 48 - 49 - spin_lock(&ci->i_ceph_lock); 50 - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 51 - acl = get_cached_acl(inode, type); 52 - spin_unlock(&ci->i_ceph_lock); 53 - 54 - return acl; 55 - } 56 - 57 43 struct posix_acl *ceph_get_acl(struct inode *inode, int type) 58 44 { 59 45 int size;
+12 -7
fs/ceph/addr.c
··· 196 196 u64 len = PAGE_CACHE_SIZE; 197 197 198 198 if (off >= i_size_read(inode)) { 199 - zero_user_segment(page, err, PAGE_CACHE_SIZE); 199 + zero_user_segment(page, 0, PAGE_CACHE_SIZE); 200 200 SetPageUptodate(page); 201 201 return 0; 202 202 } 203 203 204 - /* 205 - * Uptodate inline data should have been added into page cache 206 - * while getting Fcr caps. 207 - */ 208 - if (ci->i_inline_version != CEPH_INLINE_NONE) 209 - return -EINVAL; 204 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 205 + /* 206 + * Uptodate inline data should have been added 207 + * into page cache while getting Fcr caps. 208 + */ 209 + if (off == 0) 210 + return -EINVAL; 211 + zero_user_segment(page, 0, PAGE_CACHE_SIZE); 212 + SetPageUptodate(page); 213 + return 0; 214 + } 210 215 211 216 err = ceph_readpage_from_fscache(inode, page); 212 217 if (err == 0)
+74 -53
fs/ceph/caps.c
··· 577 577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 578 578 realmino); 579 579 if (realm) { 580 - ceph_get_snap_realm(mdsc, realm); 581 580 spin_lock(&realm->inodes_with_caps_lock); 582 581 ci->i_snap_realm = realm; 583 582 list_add(&ci->i_snap_realm_item, ··· 1450 1451 spin_lock(&mdsc->cap_dirty_lock); 1451 1452 list_del_init(&ci->i_dirty_item); 1452 1453 1453 - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1454 1454 if (list_empty(&ci->i_flushing_item)) { 1455 + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1455 1456 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1456 1457 mdsc->num_cap_flushing++; 1457 1458 dout(" inode %p now flushing seq %lld\n", inode, ··· 2072 2073 * requested from the MDS. 2073 2074 */ 2074 2075 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2075 - loff_t endoff, int *got, struct page **pinned_page, 2076 - int *check_max, int *err) 2076 + loff_t endoff, int *got, int *check_max, int *err) 2077 2077 { 2078 2078 struct inode *inode = &ci->vfs_inode; 2079 2079 int ret = 0; 2080 - int have, implemented, _got = 0; 2080 + int have, implemented; 2081 2081 int file_wanted; 2082 2082 2083 2083 dout("get_cap_refs %p need %s want %s\n", inode, 2084 2084 ceph_cap_string(need), ceph_cap_string(want)); 2085 - again: 2085 + 2086 2086 spin_lock(&ci->i_ceph_lock); 2087 2087 2088 2088 /* make sure file is actually open */ ··· 2136 2138 inode, ceph_cap_string(have), ceph_cap_string(not), 2137 2139 ceph_cap_string(revoking)); 2138 2140 if ((revoking & not) == 0) { 2139 - _got = need | (have & want); 2140 - __take_cap_refs(ci, _got); 2141 + *got = need | (have & want); 2142 + __take_cap_refs(ci, *got); 2141 2143 ret = 1; 2142 2144 } 2143 2145 } else { 2146 + int session_readonly = false; 2147 + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { 2148 + struct ceph_mds_session *s = ci->i_auth_cap->session; 2149 + spin_lock(&s->s_cap_lock); 2150 + session_readonly = s->s_readonly; 2151 + 
spin_unlock(&s->s_cap_lock); 2152 + } 2153 + if (session_readonly) { 2154 + dout("get_cap_refs %p needed %s but mds%d readonly\n", 2155 + inode, ceph_cap_string(need), ci->i_auth_cap->mds); 2156 + *err = -EROFS; 2157 + ret = 1; 2158 + goto out_unlock; 2159 + } 2160 + 2144 2161 dout("get_cap_refs %p have %s needed %s\n", inode, 2145 2162 ceph_cap_string(have), ceph_cap_string(need)); 2146 2163 } 2147 2164 out_unlock: 2148 2165 spin_unlock(&ci->i_ceph_lock); 2149 2166 2150 - if (ci->i_inline_version != CEPH_INLINE_NONE && 2151 - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2152 - i_size_read(inode) > 0) { 2153 - int ret1; 2154 - struct page *page = find_get_page(inode->i_mapping, 0); 2155 - if (page) { 2156 - if (PageUptodate(page)) { 2157 - *pinned_page = page; 2158 - goto out; 2159 - } 2160 - page_cache_release(page); 2161 - } 2162 - /* 2163 - * drop cap refs first because getattr while holding 2164 - * caps refs can cause deadlock. 2165 - */ 2166 - ceph_put_cap_refs(ci, _got); 2167 - _got = 0; 2168 - 2169 - /* getattr request will bring inline data into page cache */ 2170 - ret1 = __ceph_do_getattr(inode, NULL, 2171 - CEPH_STAT_CAP_INLINE_DATA, true); 2172 - if (ret1 >= 0) { 2173 - ret = 0; 2174 - goto again; 2175 - } 2176 - *err = ret1; 2177 - ret = 1; 2178 - } 2179 - out: 2180 2167 dout("get_cap_refs %p ret %d got %s\n", inode, 2181 - ret, ceph_cap_string(_got)); 2182 - *got = _got; 2168 + ret, ceph_cap_string(*got)); 2183 2169 return ret; 2184 2170 } 2185 2171 ··· 2203 2221 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2204 2222 loff_t endoff, int *got, struct page **pinned_page) 2205 2223 { 2206 - int check_max, ret, err; 2224 + int _got, check_max, ret, err = 0; 2207 2225 2208 2226 retry: 2209 2227 if (endoff > 0) 2210 2228 check_max_size(&ci->vfs_inode, endoff); 2229 + _got = 0; 2211 2230 check_max = 0; 2212 - err = 0; 2213 2231 ret = wait_event_interruptible(ci->i_cap_wq, 2214 - try_get_cap_refs(ci, need, want, endoff, 2215 - 
got, pinned_page, 2216 - &check_max, &err)); 2232 + try_get_cap_refs(ci, need, want, endoff, 2233 + &_got, &check_max, &err)); 2217 2234 if (err) 2218 2235 ret = err; 2236 + if (ret < 0) 2237 + return ret; 2238 + 2219 2239 if (check_max) 2220 2240 goto retry; 2221 - return ret; 2241 + 2242 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2243 + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2244 + i_size_read(&ci->vfs_inode) > 0) { 2245 + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); 2246 + if (page) { 2247 + if (PageUptodate(page)) { 2248 + *pinned_page = page; 2249 + goto out; 2250 + } 2251 + page_cache_release(page); 2252 + } 2253 + /* 2254 + * drop cap refs first because getattr while holding 2255 + * caps refs can cause deadlock. 2256 + */ 2257 + ceph_put_cap_refs(ci, _got); 2258 + _got = 0; 2259 + 2260 + /* getattr request will bring inline data into page cache */ 2261 + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, 2262 + CEPH_STAT_CAP_INLINE_DATA, true); 2263 + if (ret < 0) 2264 + return ret; 2265 + goto retry; 2266 + } 2267 + out: 2268 + *got = _got; 2269 + return 0; 2222 2270 } 2223 2271 2224 2272 /* ··· 2444 2432 */ 2445 2433 static void handle_cap_grant(struct ceph_mds_client *mdsc, 2446 2434 struct inode *inode, struct ceph_mds_caps *grant, 2447 - void *snaptrace, int snaptrace_len, 2448 2435 u64 inline_version, 2449 2436 void *inline_data, int inline_len, 2450 2437 struct ceph_buffer *xattr_buf, 2451 2438 struct ceph_mds_session *session, 2452 2439 struct ceph_cap *cap, int issued) 2453 2440 __releases(ci->i_ceph_lock) 2441 + __releases(mdsc->snap_rwsem) 2454 2442 { 2455 2443 struct ceph_inode_info *ci = ceph_inode(inode); 2456 2444 int mds = session->s_mds; ··· 2651 2639 spin_unlock(&ci->i_ceph_lock); 2652 2640 2653 2641 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2654 - down_write(&mdsc->snap_rwsem); 2655 - ceph_update_snap_trace(mdsc, snaptrace, 2656 - snaptrace + snaptrace_len, false); 2657 - 
downgrade_write(&mdsc->snap_rwsem); 2658 2642 kick_flushing_inode_caps(mdsc, session, inode); 2659 2643 up_read(&mdsc->snap_rwsem); 2660 2644 if (newcaps & ~issued) ··· 3060 3052 struct ceph_cap *cap; 3061 3053 struct ceph_mds_caps *h; 3062 3054 struct ceph_mds_cap_peer *peer = NULL; 3055 + struct ceph_snap_realm *realm; 3063 3056 int mds = session->s_mds; 3064 3057 int op, issued; 3065 3058 u32 seq, mseq; ··· 3162 3153 goto done_unlocked; 3163 3154 3164 3155 case CEPH_CAP_OP_IMPORT: 3156 + realm = NULL; 3157 + if (snaptrace_len) { 3158 + down_write(&mdsc->snap_rwsem); 3159 + ceph_update_snap_trace(mdsc, snaptrace, 3160 + snaptrace + snaptrace_len, 3161 + false, &realm); 3162 + downgrade_write(&mdsc->snap_rwsem); 3163 + } else { 3164 + down_read(&mdsc->snap_rwsem); 3165 + } 3165 3166 handle_cap_import(mdsc, inode, h, peer, session, 3166 3167 &cap, &issued); 3167 - handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3168 + handle_cap_grant(mdsc, inode, h, 3168 3169 inline_version, inline_data, inline_len, 3169 3170 msg->middle, session, cap, issued); 3171 + if (realm) 3172 + ceph_put_snap_realm(mdsc, realm); 3170 3173 goto done_unlocked; 3171 3174 } 3172 3175 ··· 3198 3177 case CEPH_CAP_OP_GRANT: 3199 3178 __ceph_caps_issued(ci, &issued); 3200 3179 issued |= __ceph_caps_dirty(ci); 3201 - handle_cap_grant(mdsc, inode, h, NULL, 0, 3180 + handle_cap_grant(mdsc, inode, h, 3202 3181 inline_version, inline_data, inline_len, 3203 3182 msg->middle, session, cap, issued); 3204 3183 goto done_unlocked;
+25 -8
fs/ceph/dir.c
··· 26 26 * point by name. 27 27 */ 28 28 29 - const struct inode_operations ceph_dir_iops; 30 - const struct file_operations ceph_dir_fops; 31 29 const struct dentry_operations ceph_dentry_ops; 32 30 33 31 /* ··· 670 672 /* 671 673 * We created the item, then did a lookup, and found 672 674 * it was already linked to another inode we already 673 - * had in our cache (and thus got spliced). Link our 674 - * dentry to that inode, but don't hash it, just in 675 - * case the VFS wants to dereference it. 675 + * had in our cache (and thus got spliced). To not 676 + * confuse VFS (especially when inode is a directory), 677 + * we don't link our dentry to that inode, return an 678 + * error instead. 679 + * 680 + * This event should be rare and it happens only when 681 + * we talk to old MDS. Recent MDS does not send traceless 682 + * reply for request that creates new inode. 676 683 */ 677 - BUG_ON(!result->d_inode); 678 - d_instantiate(dentry, result->d_inode); 679 - return 0; 684 + d_drop(result); 685 + return -ESTALE; 680 686 } 681 687 return PTR_ERR(result); 682 688 } ··· 1337 1335 .fsync = ceph_dir_fsync, 1338 1336 }; 1339 1337 1338 + const struct file_operations ceph_snapdir_fops = { 1339 + .iterate = ceph_readdir, 1340 + .llseek = ceph_dir_llseek, 1341 + .open = ceph_open, 1342 + .release = ceph_release, 1343 + }; 1344 + 1340 1345 const struct inode_operations ceph_dir_iops = { 1341 1346 .lookup = ceph_lookup, 1342 1347 .permission = ceph_permission, ··· 1364 1355 .rename = ceph_rename, 1365 1356 .create = ceph_create, 1366 1357 .atomic_open = ceph_atomic_open, 1358 + }; 1359 + 1360 + const struct inode_operations ceph_snapdir_iops = { 1361 + .lookup = ceph_lookup, 1362 + .permission = ceph_permission, 1363 + .getattr = ceph_getattr, 1364 + .mkdir = ceph_mkdir, 1365 + .rmdir = ceph_unlink, 1367 1366 }; 1368 1367 1369 1368 const struct dentry_operations ceph_dentry_ops = {
+22 -15
fs/ceph/file.c
··· 275 275 err = ceph_mdsc_do_request(mdsc, 276 276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 277 277 req); 278 + err = ceph_handle_snapdir(req, dentry, err); 278 279 if (err) 279 280 goto out_req; 280 281 281 - err = ceph_handle_snapdir(req, dentry, err); 282 282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 283 283 err = ceph_handle_notrace_create(dir, dentry); 284 284 ··· 392 392 if (ret >= 0) { 393 393 int didpages; 394 394 if (was_short && (pos + ret < inode->i_size)) { 395 - u64 tmp = min(this_len - ret, 396 - inode->i_size - pos - ret); 395 + int zlen = min(this_len - ret, 396 + inode->i_size - pos - ret); 397 + int zoff = (o_direct ? buf_align : io_align) + 398 + read + ret; 397 399 dout(" zero gap %llu to %llu\n", 398 - pos + ret, pos + ret + tmp); 399 - ceph_zero_page_vector_range(page_align + read + ret, 400 - tmp, pages); 401 - ret += tmp; 400 + pos + ret, pos + ret + zlen); 401 + ceph_zero_page_vector_range(zoff, zlen, pages); 402 + ret += zlen; 402 403 } 403 404 404 405 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; ··· 879 878 880 879 i_size = i_size_read(inode); 881 880 if (retry_op == READ_INLINE) { 882 - /* does not support inline data > PAGE_SIZE */ 883 - if (i_size > PAGE_CACHE_SIZE) { 884 - ret = -EIO; 885 - } else if (iocb->ki_pos < i_size) { 881 + BUG_ON(ret > 0 || read > 0); 882 + if (iocb->ki_pos < i_size && 883 + iocb->ki_pos < PAGE_CACHE_SIZE) { 886 884 loff_t end = min_t(loff_t, i_size, 887 885 iocb->ki_pos + len); 886 + end = min_t(loff_t, end, PAGE_CACHE_SIZE); 888 887 if (statret < end) 889 888 zero_user_segment(page, statret, end); 890 889 ret = copy_page_to_iter(page, 891 890 iocb->ki_pos & ~PAGE_MASK, 892 891 end - iocb->ki_pos, to); 893 892 iocb->ki_pos += ret; 894 - } else { 895 - ret = 0; 893 + read += ret; 894 + } 895 + if (iocb->ki_pos < i_size && read < len) { 896 + size_t zlen = min_t(size_t, len - read, 897 + i_size - iocb->ki_pos); 898 + ret = iov_iter_zero(zlen, to); 899 + iocb->ki_pos += 
ret; 900 + read += ret; 896 901 } 897 902 __free_pages(page, 0); 898 - return ret; 903 + return read; 899 904 } 900 905 901 906 /* hit EOF or hole? */ 902 907 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 903 - ret < len) { 908 + ret < len) { 904 909 dout("sync_read hit hole, ppos %lld < size %lld" 905 910 ", reading more\n", iocb->ki_pos, 906 911 inode->i_size);
+22 -19
fs/ceph/inode.c
··· 82 82 inode->i_mode = parent->i_mode; 83 83 inode->i_uid = parent->i_uid; 84 84 inode->i_gid = parent->i_gid; 85 - inode->i_op = &ceph_dir_iops; 86 - inode->i_fop = &ceph_dir_fops; 85 + inode->i_op = &ceph_snapdir_iops; 86 + inode->i_fop = &ceph_snapdir_fops; 87 87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 88 88 ci->i_rbytes = 0; 89 89 return inode; ··· 838 838 ceph_vinop(inode), inode->i_mode); 839 839 } 840 840 841 - /* set dir completion flag? */ 842 - if (S_ISDIR(inode->i_mode) && 843 - ci->i_files == 0 && ci->i_subdirs == 0 && 844 - ceph_snap(inode) == CEPH_NOSNAP && 845 - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 846 - (issued & CEPH_CAP_FILE_EXCL) == 0 && 847 - !__ceph_dir_is_complete(ci)) { 848 - dout(" marking %p complete (empty)\n", inode); 849 - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), 850 - ci->i_ordered_count); 851 - } 852 - 853 841 /* were we issued a capability? */ 854 842 if (info->cap.caps) { 855 843 if (ceph_snap(inode) == CEPH_NOSNAP) { 844 + unsigned caps = le32_to_cpu(info->cap.caps); 856 845 ceph_add_cap(inode, session, 857 846 le64_to_cpu(info->cap.cap_id), 858 - cap_fmode, 859 - le32_to_cpu(info->cap.caps), 847 + cap_fmode, caps, 860 848 le32_to_cpu(info->cap.wanted), 861 849 le32_to_cpu(info->cap.seq), 862 850 le32_to_cpu(info->cap.mseq), 863 851 le64_to_cpu(info->cap.realm), 864 852 info->cap.flags, &new_cap); 853 + 854 + /* set dir completion flag? 
*/ 855 + if (S_ISDIR(inode->i_mode) && 856 + ci->i_files == 0 && ci->i_subdirs == 0 && 857 + (caps & CEPH_CAP_FILE_SHARED) && 858 + (issued & CEPH_CAP_FILE_EXCL) == 0 && 859 + !__ceph_dir_is_complete(ci)) { 860 + dout(" marking %p complete (empty)\n", inode); 861 + __ceph_dir_set_complete(ci, 862 + atomic_read(&ci->i_release_count), 863 + ci->i_ordered_count); 864 + } 865 + 865 866 wake = true; 866 867 } else { 867 868 dout(" %p got snap_caps %s\n", inode, ··· 1447 1446 } 1448 1447 1449 1448 if (!dn->d_inode) { 1450 - dn = splice_dentry(dn, in, NULL); 1451 - if (IS_ERR(dn)) { 1452 - err = PTR_ERR(dn); 1449 + struct dentry *realdn = splice_dentry(dn, in, NULL); 1450 + if (IS_ERR(realdn)) { 1451 + err = PTR_ERR(realdn); 1452 + d_drop(dn); 1453 1453 dn = NULL; 1454 1454 goto next_item; 1455 1455 } 1456 + dn = realdn; 1456 1457 } 1457 1458 1458 1459 di = dn->d_fsdata;
+93 -34
fs/ceph/mds_client.c
··· 480 480 mdsc->max_sessions = newmax; 481 481 } 482 482 mdsc->sessions[mds] = s; 483 + atomic_inc(&mdsc->num_sessions); 483 484 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 484 485 485 486 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ··· 504 503 mdsc->sessions[s->s_mds] = NULL; 505 504 ceph_con_close(&s->s_con); 506 505 ceph_put_mds_session(s); 506 + atomic_dec(&mdsc->num_sessions); 507 507 } 508 508 509 509 /* ··· 844 842 struct ceph_options *opt = mdsc->fsc->client->options; 845 843 void *p; 846 844 847 - const char* metadata[3][2] = { 845 + const char* metadata[][2] = { 848 846 {"hostname", utsname()->nodename}, 847 + {"kernel_version", utsname()->release}, 849 848 {"entity_id", opt->name ? opt->name : ""}, 850 849 {NULL, NULL} 851 850 }; ··· 1467 1464 return err; 1468 1465 } 1469 1466 1467 + static int check_cap_flush(struct inode *inode, u64 want_flush_seq) 1468 + { 1469 + struct ceph_inode_info *ci = ceph_inode(inode); 1470 + int ret; 1471 + spin_lock(&ci->i_ceph_lock); 1472 + if (ci->i_flushing_caps) 1473 + ret = ci->i_cap_flush_seq >= want_flush_seq; 1474 + else 1475 + ret = 1; 1476 + spin_unlock(&ci->i_ceph_lock); 1477 + return ret; 1478 + } 1479 + 1470 1480 /* 1471 1481 * flush all dirty inode data to disk. 
1472 1482 * 1473 1483 * returns true if we've flushed through want_flush_seq 1474 1484 */ 1475 - static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1485 + static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1476 1486 { 1477 - int mds, ret = 1; 1487 + int mds; 1478 1488 1479 1489 dout("check_cap_flush want %lld\n", want_flush_seq); 1480 1490 mutex_lock(&mdsc->mutex); 1481 - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { 1491 + for (mds = 0; mds < mdsc->max_sessions; mds++) { 1482 1492 struct ceph_mds_session *session = mdsc->sessions[mds]; 1493 + struct inode *inode = NULL; 1483 1494 1484 1495 if (!session) 1485 1496 continue; ··· 1506 1489 list_entry(session->s_cap_flushing.next, 1507 1490 struct ceph_inode_info, 1508 1491 i_flushing_item); 1509 - struct inode *inode = &ci->vfs_inode; 1510 1492 1511 - spin_lock(&ci->i_ceph_lock); 1512 - if (ci->i_cap_flush_seq <= want_flush_seq) { 1493 + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { 1513 1494 dout("check_cap_flush still flushing %p " 1514 - "seq %lld <= %lld to mds%d\n", inode, 1515 - ci->i_cap_flush_seq, want_flush_seq, 1516 - session->s_mds); 1517 - ret = 0; 1495 + "seq %lld <= %lld to mds%d\n", 1496 + &ci->vfs_inode, ci->i_cap_flush_seq, 1497 + want_flush_seq, session->s_mds); 1498 + inode = igrab(&ci->vfs_inode); 1518 1499 } 1519 - spin_unlock(&ci->i_ceph_lock); 1520 1500 } 1521 1501 mutex_unlock(&session->s_mutex); 1522 1502 ceph_put_mds_session(session); 1523 1503 1524 - if (!ret) 1525 - return ret; 1504 + if (inode) { 1505 + wait_event(mdsc->cap_flushing_wq, 1506 + check_cap_flush(inode, want_flush_seq)); 1507 + iput(inode); 1508 + } 1509 + 1526 1510 mutex_lock(&mdsc->mutex); 1527 1511 } 1528 1512 1529 1513 mutex_unlock(&mdsc->mutex); 1530 1514 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1531 - return ret; 1532 1515 } 1533 1516 1534 1517 /* ··· 1940 1923 head->num_releases = cpu_to_le16(releases); 1941 1924 1942 1925 
/* time stamp */ 1943 - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1926 + { 1927 + struct ceph_timespec ts; 1928 + ceph_encode_timespec(&ts, &req->r_stamp); 1929 + ceph_encode_copy(&p, &ts, sizeof(ts)); 1930 + } 1944 1931 1945 1932 BUG_ON(p > end); 1946 1933 msg->front.iov_len = p - msg->front.iov_base; ··· 2033 2012 2034 2013 /* time stamp */ 2035 2014 p = msg->front.iov_base + req->r_request_release_offset; 2036 - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 2015 + { 2016 + struct ceph_timespec ts; 2017 + ceph_encode_timespec(&ts, &req->r_stamp); 2018 + ceph_encode_copy(&p, &ts, sizeof(ts)); 2019 + } 2037 2020 2038 2021 msg->front.iov_len = p - msg->front.iov_base; 2039 2022 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ··· 2184 2159 p = rb_next(p); 2185 2160 if (req->r_got_unsafe) 2186 2161 continue; 2162 + if (req->r_attempts > 0) 2163 + continue; /* only new requests */ 2187 2164 if (req->r_session && 2188 2165 req->r_session->s_mds == mds) { 2189 2166 dout(" kicking tid %llu\n", req->r_tid); ··· 2313 2286 struct ceph_mds_request *req; 2314 2287 struct ceph_mds_reply_head *head = msg->front.iov_base; 2315 2288 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2289 + struct ceph_snap_realm *realm; 2316 2290 u64 tid; 2317 2291 int err, result; 2318 2292 int mds = session->s_mds; ··· 2429 2401 } 2430 2402 2431 2403 /* snap trace */ 2404 + realm = NULL; 2432 2405 if (rinfo->snapblob_len) { 2433 2406 down_write(&mdsc->snap_rwsem); 2434 2407 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2435 - rinfo->snapblob + rinfo->snapblob_len, 2436 - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2408 + rinfo->snapblob + rinfo->snapblob_len, 2409 + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, 2410 + &realm); 2437 2411 downgrade_write(&mdsc->snap_rwsem); 2438 2412 } else { 2439 2413 down_read(&mdsc->snap_rwsem); ··· 2453 2423 mutex_unlock(&req->r_fill_mutex); 2454 2424 2455 2425 up_read(&mdsc->snap_rwsem); 2426 + if (realm) 
2427 + ceph_put_snap_realm(mdsc, realm); 2456 2428 out_err: 2457 2429 mutex_lock(&mdsc->mutex); 2458 2430 if (!req->r_aborted) { ··· 2519 2487 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2520 2488 BUG_ON(req->r_err); 2521 2489 BUG_ON(req->r_got_result); 2490 + req->r_attempts = 0; 2522 2491 req->r_num_fwd = fwd_seq; 2523 2492 req->r_resend_mds = next_mds; 2524 2493 put_request_session(req); ··· 2613 2580 send_flushmsg_ack(mdsc, session, seq); 2614 2581 break; 2615 2582 2583 + case CEPH_SESSION_FORCE_RO: 2584 + dout("force_session_readonly %p\n", session); 2585 + spin_lock(&session->s_cap_lock); 2586 + session->s_readonly = true; 2587 + spin_unlock(&session->s_cap_lock); 2588 + wake_up_session_caps(session, 0); 2589 + break; 2590 + 2616 2591 default: 2617 2592 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2618 2593 WARN_ON(1); ··· 2651 2610 struct ceph_mds_session *session) 2652 2611 { 2653 2612 struct ceph_mds_request *req, *nreq; 2613 + struct rb_node *p; 2654 2614 int err; 2655 2615 2656 2616 dout("replay_unsafe_requests mds%d\n", session->s_mds); ··· 2662 2620 if (!err) { 2663 2621 ceph_msg_get(req->r_request); 2664 2622 ceph_con_send(&session->s_con, req->r_request); 2623 + } 2624 + } 2625 + 2626 + /* 2627 + * also re-send old requests when MDS enters reconnect stage. So that MDS 2628 + * can process completed request in clientreplay stage. 
2629 + */ 2630 + p = rb_first(&mdsc->request_tree); 2631 + while (p) { 2632 + req = rb_entry(p, struct ceph_mds_request, r_node); 2633 + p = rb_next(p); 2634 + if (req->r_got_unsafe) 2635 + continue; 2636 + if (req->r_attempts == 0) 2637 + continue; /* only old requests */ 2638 + if (req->r_session && 2639 + req->r_session->s_mds == session->s_mds) { 2640 + err = __prepare_send_request(mdsc, req, session->s_mds); 2641 + if (!err) { 2642 + ceph_msg_get(req->r_request); 2643 + ceph_con_send(&session->s_con, req->r_request); 2644 + } 2665 2645 } 2666 2646 } 2667 2647 mutex_unlock(&mdsc->mutex); ··· 2851 2787 spin_unlock(&session->s_gen_ttl_lock); 2852 2788 2853 2789 spin_lock(&session->s_cap_lock); 2790 + /* don't know if session is readonly */ 2791 + session->s_readonly = 0; 2854 2792 /* 2855 2793 * notify __ceph_remove_cap() that we are composing cap reconnect. 2856 2794 * If a cap get released before being added to the cap reconnect, ··· 2999 2933 mutex_unlock(&s->s_mutex); 3000 2934 s->s_state = CEPH_MDS_SESSION_RESTARTING; 3001 2935 } 3002 - 3003 - /* kick any requests waiting on the recovering mds */ 3004 - kick_requests(mdsc, i); 3005 2936 } else if (oldstate == newstate) { 3006 2937 continue; /* nothing new with this mds */ 3007 2938 } ··· 3358 3295 init_waitqueue_head(&mdsc->session_close_wq); 3359 3296 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3360 3297 mdsc->sessions = NULL; 3298 + atomic_set(&mdsc->num_sessions, 0); 3361 3299 mdsc->max_sessions = 0; 3362 3300 mdsc->stopping = 0; 3363 3301 init_rwsem(&mdsc->snap_rwsem); ··· 3492 3428 dout("sync\n"); 3493 3429 mutex_lock(&mdsc->mutex); 3494 3430 want_tid = mdsc->last_tid; 3495 - want_flush = mdsc->cap_flush_seq; 3496 3431 mutex_unlock(&mdsc->mutex); 3497 - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3498 3432 3499 3433 ceph_flush_dirty_caps(mdsc); 3434 + spin_lock(&mdsc->cap_dirty_lock); 3435 + want_flush = mdsc->cap_flush_seq; 3436 + spin_unlock(&mdsc->cap_dirty_lock); 3437 + 3438 + 
dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3500 3439 3501 3440 wait_unsafe_requests(mdsc, want_tid); 3502 - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3441 + wait_caps_flush(mdsc, want_flush); 3503 3442 } 3504 3443 3505 3444 /* ··· 3510 3443 */ 3511 3444 static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3512 3445 { 3513 - int i, n = 0; 3514 - 3515 3446 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3516 3447 return true; 3517 - 3518 - mutex_lock(&mdsc->mutex); 3519 - for (i = 0; i < mdsc->max_sessions; i++) 3520 - if (mdsc->sessions[i]) 3521 - n++; 3522 - mutex_unlock(&mdsc->mutex); 3523 - return n == 0; 3448 + return atomic_read(&mdsc->num_sessions) == 0; 3524 3449 } 3525 3450 3526 3451 /*
+2
fs/ceph/mds_client.h
··· 137 137 int s_nr_caps, s_trim_caps; 138 138 int s_num_cap_releases; 139 139 int s_cap_reconnect; 140 + int s_readonly; 140 141 struct list_head s_cap_releases; /* waiting cap_release messages */ 141 142 struct list_head s_cap_releases_done; /* ready to send */ 142 143 struct ceph_cap *s_cap_iterator; ··· 273 272 struct list_head waiting_for_map; 274 273 275 274 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 275 + atomic_t num_sessions; 276 276 int max_sessions; /* len of s_mds_sessions */ 277 277 int stopping; /* true if shutting down */ 278 278
+38 -16
fs/ceph/snap.c
··· 70 70 * safe. we do need to protect against concurrent empty list 71 71 * additions, however. 72 72 */ 73 - if (atomic_read(&realm->nref) == 0) { 73 + if (atomic_inc_return(&realm->nref) == 1) { 74 74 spin_lock(&mdsc->snap_empty_lock); 75 75 list_del_init(&realm->empty_item); 76 76 spin_unlock(&mdsc->snap_empty_lock); 77 77 } 78 - 79 - atomic_inc(&realm->nref); 80 78 } 81 79 82 80 static void __insert_snap_realm(struct rb_root *root, ··· 114 116 if (!realm) 115 117 return ERR_PTR(-ENOMEM); 116 118 117 - atomic_set(&realm->nref, 0); /* tree does not take a ref */ 119 + atomic_set(&realm->nref, 1); /* for caller */ 118 120 realm->ino = ino; 119 121 INIT_LIST_HEAD(&realm->children); 120 122 INIT_LIST_HEAD(&realm->child_item); ··· 132 134 * 133 135 * caller must hold snap_rwsem for write. 134 136 */ 135 - struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 136 - u64 ino) 137 + static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, 138 + u64 ino) 137 139 { 138 140 struct rb_node *n = mdsc->snap_realms.rb_node; 139 141 struct ceph_snap_realm *r; ··· 150 152 } 151 153 } 152 154 return NULL; 155 + } 156 + 157 + struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 158 + u64 ino) 159 + { 160 + struct ceph_snap_realm *r; 161 + r = __lookup_snap_realm(mdsc, ino); 162 + if (r) 163 + ceph_get_snap_realm(mdsc, r); 164 + return r; 153 165 } 154 166 155 167 static void __put_snap_realm(struct ceph_mds_client *mdsc, ··· 281 273 } 282 274 realm->parent_ino = parentino; 283 275 realm->parent = parent; 284 - ceph_get_snap_realm(mdsc, parent); 285 276 list_add(&realm->child_item, &parent->children); 286 277 return 1; 287 278 } ··· 638 631 * Caller must hold snap_rwsem for write. 
639 632 */ 640 633 int ceph_update_snap_trace(struct ceph_mds_client *mdsc, 641 - void *p, void *e, bool deletion) 634 + void *p, void *e, bool deletion, 635 + struct ceph_snap_realm **realm_ret) 642 636 { 643 637 struct ceph_mds_snap_realm *ri; /* encoded */ 644 638 __le64 *snaps; /* encoded */ 645 639 __le64 *prior_parent_snaps; /* encoded */ 646 - struct ceph_snap_realm *realm; 640 + struct ceph_snap_realm *realm = NULL; 641 + struct ceph_snap_realm *first_realm = NULL; 647 642 int invalidate = 0; 648 643 int err = -ENOMEM; 649 644 LIST_HEAD(dirty_realms); ··· 713 704 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 714 705 realm, invalidate, p, e); 715 706 707 + /* invalidate when we reach the _end_ (root) of the trace */ 708 + if (invalidate && p >= e) 709 + rebuild_snap_realms(realm); 710 + 711 + if (!first_realm) 712 + first_realm = realm; 713 + else 714 + ceph_put_snap_realm(mdsc, realm); 715 + 716 716 if (p < e) 717 717 goto more; 718 - 719 - /* invalidate when we reach the _end_ (root) of the trace */ 720 - if (invalidate) 721 - rebuild_snap_realms(realm); 722 718 723 719 /* 724 720 * queue cap snaps _after_ we've built the new snap contexts, ··· 735 721 queue_realm_cap_snaps(realm); 736 722 } 737 723 724 + if (realm_ret) 725 + *realm_ret = first_realm; 726 + else 727 + ceph_put_snap_realm(mdsc, first_realm); 728 + 738 729 __cleanup_empty_realms(mdsc); 739 730 return 0; 740 731 741 732 bad: 742 733 err = -EINVAL; 743 734 fail: 735 + if (realm && !IS_ERR(realm)) 736 + ceph_put_snap_realm(mdsc, realm); 737 + if (first_realm) 738 + ceph_put_snap_realm(mdsc, first_realm); 744 739 pr_err("update_snap_trace error %d\n", err); 745 740 return err; 746 741 } ··· 867 844 if (IS_ERR(realm)) 868 845 goto out; 869 846 } 870 - ceph_get_snap_realm(mdsc, realm); 871 847 872 848 dout("splitting snap_realm %llx %p\n", realm->ino, realm); 873 849 for (i = 0; i < num_split_inos; i++) { ··· 927 905 /* we may have taken some of the old realm's children. 
*/ 928 906 for (i = 0; i < num_split_realms; i++) { 929 907 struct ceph_snap_realm *child = 930 - ceph_lookup_snap_realm(mdsc, 908 + __lookup_snap_realm(mdsc, 931 909 le64_to_cpu(split_realms[i])); 932 910 if (!child) 933 911 continue; ··· 940 918 * snap, we can avoid queueing cap_snaps. 941 919 */ 942 920 ceph_update_snap_trace(mdsc, p, e, 943 - op == CEPH_SNAP_OP_DESTROY); 921 + op == CEPH_SNAP_OP_DESTROY, NULL); 944 922 945 923 if (op == CEPH_SNAP_OP_SPLIT) 946 924 /* we took a reference when we created the realm, above */
+4
fs/ceph/super.c
··· 414 414 seq_puts(m, ",noshare"); 415 415 if (opt->flags & CEPH_OPT_NOCRC) 416 416 seq_puts(m, ",nocrc"); 417 + if (opt->flags & CEPH_OPT_NOMSGAUTH) 418 + seq_puts(m, ",nocephx_require_signatures"); 419 + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) 420 + seq_puts(m, ",notcp_nodelay"); 417 421 418 422 if (opt->name) 419 423 seq_printf(m, ",name=%s", opt->name);
+4 -1
fs/ceph/super.h
··· 693 693 extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, 694 694 struct ceph_snap_realm *realm); 695 695 extern int ceph_update_snap_trace(struct ceph_mds_client *m, 696 - void *p, void *e, bool deletion); 696 + void *p, void *e, bool deletion, 697 + struct ceph_snap_realm **realm_ret); 697 698 extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 698 699 struct ceph_mds_session *session, 699 700 struct ceph_msg *msg); ··· 893 892 int ceph_uninline_data(struct file *filp, struct page *locked_page); 894 893 /* dir.c */ 895 894 extern const struct file_operations ceph_dir_fops; 895 + extern const struct file_operations ceph_snapdir_fops; 896 896 extern const struct inode_operations ceph_dir_iops; 897 + extern const struct inode_operations ceph_snapdir_iops; 897 898 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 898 899 ceph_snapdir_dentry_ops; 899 900
+1 -36
include/linux/ceph/ceph_fs.h
··· 158 158 }; 159 159 160 160 161 - /* pool operations */ 162 - enum { 163 - POOL_OP_CREATE = 0x01, 164 - POOL_OP_DELETE = 0x02, 165 - POOL_OP_AUID_CHANGE = 0x03, 166 - POOL_OP_CREATE_SNAP = 0x11, 167 - POOL_OP_DELETE_SNAP = 0x12, 168 - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, 169 - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, 170 - }; 171 - 172 161 struct ceph_mon_request_header { 173 162 __le64 have_version; 174 163 __le16 session_mon; ··· 178 189 struct ceph_fsid fsid; 179 190 __le64 version; 180 191 struct ceph_statfs st; 181 - } __attribute__ ((packed)); 182 - 183 - const char *ceph_pool_op_name(int op); 184 - 185 - struct ceph_mon_poolop { 186 - struct ceph_mon_request_header monhdr; 187 - struct ceph_fsid fsid; 188 - __le32 pool; 189 - __le32 op; 190 - __le64 auid; 191 - __le64 snapid; 192 - __le32 name_len; 193 - } __attribute__ ((packed)); 194 - 195 - struct ceph_mon_poolop_reply { 196 - struct ceph_mon_request_header monhdr; 197 - struct ceph_fsid fsid; 198 - __le32 reply_code; 199 - __le32 epoch; 200 - char has_data; 201 - char data[0]; 202 - } __attribute__ ((packed)); 203 - 204 - struct ceph_mon_unmanaged_snap { 205 - __le64 snapid; 206 192 } __attribute__ ((packed)); 207 193 208 194 struct ceph_osd_getmap { ··· 271 307 CEPH_SESSION_RECALL_STATE, 272 308 CEPH_SESSION_FLUSHMSG, 273 309 CEPH_SESSION_FLUSHMSG_ACK, 310 + CEPH_SESSION_FORCE_RO, 274 311 }; 275 312 276 313 extern const char *ceph_session_op_name(int op);
+2 -1
include/linux/ceph/libceph.h
··· 30 30 #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ 31 31 #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ 32 32 #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ 33 + #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ 33 34 34 - #define CEPH_OPT_DEFAULT (0) 35 + #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) 35 36 36 37 #define ceph_set_opt(client, opt) \ 37 38 (client)->options->flags |= CEPH_OPT_##opt;
+3 -1
include/linux/ceph/messenger.h
··· 57 57 58 58 atomic_t stopping; 59 59 bool nocrc; 60 + bool tcp_nodelay; 60 61 61 62 /* 62 63 * the global_seq counts connections i (attempt to) initiate ··· 265 264 struct ceph_entity_addr *myaddr, 266 265 u64 supported_features, 267 266 u64 required_features, 268 - bool nocrc); 267 + bool nocrc, 268 + bool tcp_nodelay); 269 269 270 270 extern void ceph_con_init(struct ceph_connection *con, void *private, 271 271 const struct ceph_connection_operations *ops,
+1 -8
include/linux/ceph/mon_client.h
··· 40 40 }; 41 41 42 42 /* 43 - * ceph_mon_generic_request is being used for the statfs, poolop and 43 + * ceph_mon_generic_request is being used for the statfs and 44 44 * mon_get_version requests which are being done a bit differently 45 45 * because we need to get data back to the caller 46 46 */ ··· 50 50 struct rb_node node; 51 51 int result; 52 52 void *buf; 53 - int buf_len; 54 53 struct completion completion; 55 54 struct ceph_msg *request; /* original request */ 56 55 struct ceph_msg *reply; /* and reply */ ··· 115 116 extern int ceph_monc_open_session(struct ceph_mon_client *monc); 116 117 117 118 extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 118 - 119 - extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, 120 - u32 pool, u64 *snapid); 121 - 122 - extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 123 - u32 pool, u64 snapid); 124 119 125 120 #endif
+15 -1
net/ceph/ceph_common.c
··· 239 239 Opt_nocrc, 240 240 Opt_cephx_require_signatures, 241 241 Opt_nocephx_require_signatures, 242 + Opt_tcp_nodelay, 243 + Opt_notcp_nodelay, 242 244 }; 243 245 244 246 static match_table_t opt_tokens = { ··· 261 259 {Opt_nocrc, "nocrc"}, 262 260 {Opt_cephx_require_signatures, "cephx_require_signatures"}, 263 261 {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, 262 + {Opt_tcp_nodelay, "tcp_nodelay"}, 263 + {Opt_notcp_nodelay, "notcp_nodelay"}, 264 264 {-1, NULL} 265 265 }; 266 266 ··· 461 457 case Opt_nocrc: 462 458 opt->flags |= CEPH_OPT_NOCRC; 463 459 break; 460 + 464 461 case Opt_cephx_require_signatures: 465 462 opt->flags &= ~CEPH_OPT_NOMSGAUTH; 466 463 break; 467 464 case Opt_nocephx_require_signatures: 468 465 opt->flags |= CEPH_OPT_NOMSGAUTH; 466 + break; 467 + 468 + case Opt_tcp_nodelay: 469 + opt->flags |= CEPH_OPT_TCP_NODELAY; 470 + break; 471 + case Opt_notcp_nodelay: 472 + opt->flags &= ~CEPH_OPT_TCP_NODELAY; 469 473 break; 470 474 471 475 default: ··· 530 518 /* msgr */ 531 519 if (ceph_test_opt(client, MYIP)) 532 520 myaddr = &client->options->my_addr; 521 + 533 522 ceph_messenger_init(&client->msgr, myaddr, 534 523 client->supported_features, 535 524 client->required_features, 536 - ceph_test_opt(client, NOCRC)); 525 + ceph_test_opt(client, NOCRC), 526 + ceph_test_opt(client, TCP_NODELAY)); 537 527 538 528 /* subsystems */ 539 529 err = ceph_monc_init(&client->monc, client);
-14
net/ceph/ceph_strings.c
··· 42 42 return "???"; 43 43 } 44 44 } 45 - 46 - const char *ceph_pool_op_name(int op) 47 - { 48 - switch (op) { 49 - case POOL_OP_CREATE: return "create"; 50 - case POOL_OP_DELETE: return "delete"; 51 - case POOL_OP_AUID_CHANGE: return "auid change"; 52 - case POOL_OP_CREATE_SNAP: return "create snap"; 53 - case POOL_OP_DELETE_SNAP: return "delete snap"; 54 - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; 55 - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; 56 - } 57 - return "???"; 58 - }
-2
net/ceph/debugfs.c
··· 127 127 op = le16_to_cpu(req->request->hdr.type); 128 128 if (op == CEPH_MSG_STATFS) 129 129 seq_printf(s, "%llu statfs\n", req->tid); 130 - else if (op == CEPH_MSG_POOLOP) 131 - seq_printf(s, "%llu poolop\n", req->tid); 132 130 else if (op == CEPH_MSG_MON_GET_VERSION) 133 131 seq_printf(s, "%llu mon_get_version", req->tid); 134 132 else
+13 -1
net/ceph/messenger.c
··· 510 510 return ret; 511 511 } 512 512 513 + if (con->msgr->tcp_nodelay) { 514 + int optval = 1; 515 + 516 + ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, 517 + (char *)&optval, sizeof(optval)); 518 + if (ret) 519 + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", 520 + ret); 521 + } 522 + 513 523 sk_set_memalloc(sock->sk); 514 524 515 525 con->sock = sock; ··· 2932 2922 struct ceph_entity_addr *myaddr, 2933 2923 u64 supported_features, 2934 2924 u64 required_features, 2935 - bool nocrc) 2925 + bool nocrc, 2926 + bool tcp_nodelay) 2936 2927 { 2937 2928 msgr->supported_features = supported_features; 2938 2929 msgr->required_features = required_features; ··· 2948 2937 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); 2949 2938 encode_my_addr(msgr); 2950 2939 msgr->nocrc = nocrc; 2940 + msgr->tcp_nodelay = tcp_nodelay; 2951 2941 2952 2942 atomic_set(&msgr->stopping, 0); 2953 2943
+5 -134
net/ceph/mon_client.c
··· 410 410 } 411 411 412 412 /* 413 - * generic requests (e.g., statfs, poolop) 413 + * generic requests (currently statfs, mon_get_version) 414 414 */ 415 415 static struct ceph_mon_generic_request *__lookup_generic_req( 416 416 struct ceph_mon_client *monc, u64 tid) ··· 569 569 return; 570 570 571 571 bad: 572 - pr_err("corrupt generic reply, tid %llu\n", tid); 572 + pr_err("corrupt statfs reply, tid %llu\n", tid); 573 573 ceph_msg_dump(msg); 574 574 } 575 575 ··· 588 588 589 589 kref_init(&req->kref); 590 590 req->buf = buf; 591 - req->buf_len = sizeof(*buf); 592 591 init_completion(&req->completion); 593 592 594 593 err = -ENOMEM; ··· 610 611 err = do_generic_request(monc, req); 611 612 612 613 out: 613 - kref_put(&req->kref, release_generic_request); 614 + put_generic_request(req); 614 615 return err; 615 616 } 616 617 EXPORT_SYMBOL(ceph_monc_do_statfs); ··· 646 647 647 648 return; 648 649 bad: 649 - pr_err("corrupt mon_get_version reply\n"); 650 + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 650 651 ceph_msg_dump(msg); 651 652 } 652 653 ··· 669 670 670 671 kref_init(&req->kref); 671 672 req->buf = newest; 672 - req->buf_len = sizeof(*newest); 673 673 init_completion(&req->completion); 674 674 675 675 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, ··· 699 701 700 702 mutex_unlock(&monc->mutex); 701 703 out: 702 - kref_put(&req->kref, release_generic_request); 704 + put_generic_request(req); 703 705 return err; 704 706 } 705 707 EXPORT_SYMBOL(ceph_monc_do_get_version); 706 - 707 - /* 708 - * pool ops 709 - */ 710 - static int get_poolop_reply_buf(const char *src, size_t src_len, 711 - char *dst, size_t dst_len) 712 - { 713 - u32 buf_len; 714 - 715 - if (src_len != sizeof(u32) + dst_len) 716 - return -EINVAL; 717 - 718 - buf_len = le32_to_cpu(*(__le32 *)src); 719 - if (buf_len != dst_len) 720 - return -EINVAL; 721 - 722 - memcpy(dst, src + sizeof(u32), dst_len); 723 - return 0; 724 - } 725 - 726 - static void handle_poolop_reply(struct 
ceph_mon_client *monc, 727 - struct ceph_msg *msg) 728 - { 729 - struct ceph_mon_generic_request *req; 730 - struct ceph_mon_poolop_reply *reply = msg->front.iov_base; 731 - u64 tid = le64_to_cpu(msg->hdr.tid); 732 - 733 - if (msg->front.iov_len < sizeof(*reply)) 734 - goto bad; 735 - dout("handle_poolop_reply %p tid %llu\n", msg, tid); 736 - 737 - mutex_lock(&monc->mutex); 738 - req = __lookup_generic_req(monc, tid); 739 - if (req) { 740 - if (req->buf_len && 741 - get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), 742 - msg->front.iov_len - sizeof(*reply), 743 - req->buf, req->buf_len) < 0) { 744 - mutex_unlock(&monc->mutex); 745 - goto bad; 746 - } 747 - req->result = le32_to_cpu(reply->reply_code); 748 - get_generic_request(req); 749 - } 750 - mutex_unlock(&monc->mutex); 751 - if (req) { 752 - complete(&req->completion); 753 - put_generic_request(req); 754 - } 755 - return; 756 - 757 - bad: 758 - pr_err("corrupt generic reply, tid %llu\n", tid); 759 - ceph_msg_dump(msg); 760 - } 761 - 762 - /* 763 - * Do a synchronous pool op. 
764 - */ 765 - static int do_poolop(struct ceph_mon_client *monc, u32 op, 766 - u32 pool, u64 snapid, 767 - char *buf, int len) 768 - { 769 - struct ceph_mon_generic_request *req; 770 - struct ceph_mon_poolop *h; 771 - int err; 772 - 773 - req = kzalloc(sizeof(*req), GFP_NOFS); 774 - if (!req) 775 - return -ENOMEM; 776 - 777 - kref_init(&req->kref); 778 - req->buf = buf; 779 - req->buf_len = len; 780 - init_completion(&req->completion); 781 - 782 - err = -ENOMEM; 783 - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 784 - true); 785 - if (!req->request) 786 - goto out; 787 - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 788 - true); 789 - if (!req->reply) 790 - goto out; 791 - 792 - /* fill out request */ 793 - req->request->hdr.version = cpu_to_le16(2); 794 - h = req->request->front.iov_base; 795 - h->monhdr.have_version = 0; 796 - h->monhdr.session_mon = cpu_to_le16(-1); 797 - h->monhdr.session_mon_tid = 0; 798 - h->fsid = monc->monmap->fsid; 799 - h->pool = cpu_to_le32(pool); 800 - h->op = cpu_to_le32(op); 801 - h->auid = 0; 802 - h->snapid = cpu_to_le64(snapid); 803 - h->name_len = 0; 804 - 805 - err = do_generic_request(monc, req); 806 - 807 - out: 808 - kref_put(&req->kref, release_generic_request); 809 - return err; 810 - } 811 - 812 - int ceph_monc_create_snapid(struct ceph_mon_client *monc, 813 - u32 pool, u64 *snapid) 814 - { 815 - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 816 - pool, 0, (char *)snapid, sizeof(*snapid)); 817 - 818 - } 819 - EXPORT_SYMBOL(ceph_monc_create_snapid); 820 - 821 - int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 822 - u32 pool, u64 snapid) 823 - { 824 - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 825 - pool, snapid, NULL, 0); 826 - 827 - } 828 708 829 709 /* 830 710 * Resend pending generic requests. 
··· 988 1112 handle_get_version_reply(monc, msg); 989 1113 break; 990 1114 991 - case CEPH_MSG_POOLOP_REPLY: 992 - handle_poolop_reply(monc, msg); 993 - break; 994 - 995 1115 case CEPH_MSG_MON_MAP: 996 1116 ceph_monc_handle_map(monc, msg); 997 1117 break; ··· 1026 1154 case CEPH_MSG_MON_SUBSCRIBE_ACK: 1027 1155 m = ceph_msg_get(monc->m_subscribe_ack); 1028 1156 break; 1029 - case CEPH_MSG_POOLOP_REPLY: 1030 1157 case CEPH_MSG_STATFS_REPLY: 1031 1158 return get_generic_reply(con, hdr, skip); 1032 1159 case CEPH_MSG_AUTH_REPLY:
+21 -10
net/ceph/osd_client.c
··· 1035 1035 { 1036 1036 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1037 1037 atomic_read(&osd->o_ref) - 1); 1038 - if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 1038 + if (atomic_dec_and_test(&osd->o_ref)) { 1039 1039 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 1040 1040 1041 - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); 1041 + if (osd->o_auth.authorizer) 1042 + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); 1042 1043 kfree(osd); 1043 1044 } 1044 1045 } ··· 1049 1048 */ 1050 1049 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1051 1050 { 1052 - dout("__remove_osd %p\n", osd); 1051 + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1053 1052 WARN_ON(!list_empty(&osd->o_requests)); 1054 1053 WARN_ON(!list_empty(&osd->o_linger_requests)); 1055 1054 1056 - rb_erase(&osd->o_node, &osdc->osds); 1057 1055 list_del_init(&osd->o_osd_lru); 1058 - ceph_con_close(&osd->o_con); 1059 - put_osd(osd); 1056 + rb_erase(&osd->o_node, &osdc->osds); 1057 + RB_CLEAR_NODE(&osd->o_node); 1058 + } 1059 + 1060 + static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1061 + { 1062 + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1063 + 1064 + if (!RB_EMPTY_NODE(&osd->o_node)) { 1065 + ceph_con_close(&osd->o_con); 1066 + __remove_osd(osdc, osd); 1067 + put_osd(osd); 1068 + } 1060 1069 } 1061 1070 1062 1071 static void remove_all_osds(struct ceph_osd_client *osdc) ··· 1076 1065 while (!RB_EMPTY_ROOT(&osdc->osds)) { 1077 1066 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), 1078 1067 struct ceph_osd, o_node); 1079 - __remove_osd(osdc, osd); 1068 + remove_osd(osdc, osd); 1080 1069 } 1081 1070 mutex_unlock(&osdc->request_mutex); 1082 1071 } ··· 1117 1106 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1118 1107 if (time_before(jiffies, osd->lru_ttl)) 1119 1108 break; 1120 - __remove_osd(osdc, osd); 1109 + remove_osd(osdc, osd); 1121 1110 } 
1122 1111 mutex_unlock(&osdc->request_mutex); 1123 1112 } ··· 1132 1121 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1133 1122 if (list_empty(&osd->o_requests) && 1134 1123 list_empty(&osd->o_linger_requests)) { 1135 - __remove_osd(osdc, osd); 1136 - 1124 + remove_osd(osdc, osd); 1137 1125 return -ENODEV; 1138 1126 } 1139 1127 ··· 1936 1926 { 1937 1927 struct rb_node *p, *n; 1938 1928 1929 + dout("%s %p\n", __func__, osdc); 1939 1930 for (p = rb_first(&osdc->osds); p; p = n) { 1940 1931 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 1941 1932