Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
"The biggest chunk is a series of patches from Ilya that add support
for new Ceph osd and crush map features, including some new tunables,
primary affinity, and the new encoding that is needed for erasure
coding support. This brings things into parity with the server side
and the looming firefly release. There is also support for allocation
hints in RBD that help limit fragmentation on the server side.

There is also a series of patches from Zheng fixing NFS reexport,
directory fragmentation support, flock vs fcntl behavior, and some
issues with clustered MDS.

Finally, there are some miscellaneous fixes from Yunchuan Wen for
fscache, Fabian Frederick for ACLs, and from me for fsync(dirfd)
behavior"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (79 commits)
ceph: skip invalid dentry during dcache readdir
libceph: dump pool {read,write}_tier to debugfs
libceph: output primary affinity values on osdmap updates
ceph: flush cap release queue when trimming session caps
ceph: don't grabs open file reference for aborted request
ceph: drop extra open file reference in ceph_atomic_open()
ceph: preallocate buffer for readdir reply
libceph: enable PRIMARY_AFFINITY feature bit
libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting()
libceph: add support for osd primary affinity
libceph: add support for primary_temp mappings
libceph: return primary from ceph_calc_pg_acting()
libceph: switch ceph_calc_pg_acting() to new helpers
libceph: introduce apply_temps() helper
libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers
libceph: ceph_can_shift_osds(pool) and pool type defines
libceph: ceph_osd_{exists,is_up,is_down}(osd) definitions
libceph: enable OSDMAP_ENC feature bit
libceph: primary_affinity decode bits
libceph: primary_affinity infrastructure
...

+1424 -638
+59 -28
drivers/block/rbd.c
··· 1654 1654 if (osd_req->r_result < 0) 1655 1655 obj_request->result = osd_req->r_result; 1656 1656 1657 - BUG_ON(osd_req->r_num_ops > 2); 1657 + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1658 1658 1659 1659 /* 1660 1660 * We support a 64-bit length, but ultimately it has to be ··· 1662 1662 */ 1663 1663 obj_request->xferred = osd_req->r_reply_op_len[0]; 1664 1664 rbd_assert(obj_request->xferred < (u64)UINT_MAX); 1665 + 1665 1666 opcode = osd_req->r_ops[0].op; 1666 1667 switch (opcode) { 1667 1668 case CEPH_OSD_OP_READ: 1668 1669 rbd_osd_read_callback(obj_request); 1669 1670 break; 1671 + case CEPH_OSD_OP_SETALLOCHINT: 1672 + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); 1673 + /* fall through */ 1670 1674 case CEPH_OSD_OP_WRITE: 1671 1675 rbd_osd_write_callback(obj_request); 1672 1676 break; ··· 1719 1715 snapc, CEPH_NOSNAP, &mtime); 1720 1716 } 1721 1717 1718 + /* 1719 + * Create an osd request. A read request has one osd op (read). 1720 + * A write request has either one (watch) or two (hint+write) osd ops. 1721 + * (All rbd data writes are prefixed with an allocation hint op, but 1722 + * technically osd watch is a write request, hence this distinction.) 1723 + */ 1722 1724 static struct ceph_osd_request *rbd_osd_req_create( 1723 1725 struct rbd_device *rbd_dev, 1724 1726 bool write_request, 1727 + unsigned int num_ops, 1725 1728 struct rbd_obj_request *obj_request) 1726 1729 { 1727 1730 struct ceph_snap_context *snapc = NULL; ··· 1744 1733 snapc = img_request->snapc; 1745 1734 } 1746 1735 1747 - /* Allocate and initialize the request, for the single op */ 1736 + rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); 1737 + 1738 + /* Allocate and initialize the request, for the num_ops ops */ 1748 1739 1749 1740 osdc = &rbd_dev->rbd_client->client->osdc; 1750 - osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1741 + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 1742 + GFP_ATOMIC); 1751 1743 if (!osd_req) 1752 1744 return NULL; /* ENOMEM */ 1753 1745 ··· 1770 1756 1771 1757 /* 1772 1758 * Create a copyup osd request based on the information in the 1773 - * object request supplied. A copyup request has two osd ops, 1774 - * a copyup method call, and a "normal" write request. 1759 + * object request supplied. A copyup request has three osd ops, 1760 + * a copyup method call, a hint op, and a write op. 
1775 1761 */ 1776 1762 static struct ceph_osd_request * 1777 1763 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) ··· 1787 1773 rbd_assert(img_request); 1788 1774 rbd_assert(img_request_write_test(img_request)); 1789 1775 1790 - /* Allocate and initialize the request, for the two ops */ 1776 + /* Allocate and initialize the request, for the three ops */ 1791 1777 1792 1778 snapc = img_request->snapc; 1793 1779 rbd_dev = img_request->rbd_dev; 1794 1780 osdc = &rbd_dev->rbd_client->client->osdc; 1795 - osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); 1781 + osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); 1796 1782 if (!osd_req) 1797 1783 return NULL; /* ENOMEM */ 1798 1784 ··· 2192 2178 const char *object_name; 2193 2179 u64 offset; 2194 2180 u64 length; 2181 + unsigned int which = 0; 2195 2182 2196 2183 object_name = rbd_segment_name(rbd_dev, img_offset); 2197 2184 if (!object_name) ··· 2205 2190 rbd_segment_name_free(object_name); 2206 2191 if (!obj_request) 2207 2192 goto out_unwind; 2193 + 2208 2194 /* 2209 2195 * set obj_request->img_request before creating the 2210 2196 * osd_request so that it gets the right snapc ··· 2223 2207 clone_size, 2224 2208 GFP_ATOMIC); 2225 2209 if (!obj_request->bio_list) 2226 - goto out_partial; 2210 + goto out_unwind; 2227 2211 } else { 2228 2212 unsigned int page_count; 2229 2213 ··· 2236 2220 } 2237 2221 2238 2222 osd_req = rbd_osd_req_create(rbd_dev, write_request, 2239 - obj_request); 2223 + (write_request ? 2 : 1), 2224 + obj_request); 2240 2225 if (!osd_req) 2241 - goto out_partial; 2226 + goto out_unwind; 2242 2227 obj_request->osd_req = osd_req; 2243 2228 obj_request->callback = rbd_img_obj_callback; 2244 2229 2245 - osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 2246 - 0, 0); 2230 + if (write_request) { 2231 + osd_req_op_alloc_hint_init(osd_req, which, 2232 + rbd_obj_bytes(&rbd_dev->header), 2233 + rbd_obj_bytes(&rbd_dev->header)); 2234 + which++; 2235 + } 2236 + 2237 + osd_req_op_extent_init(osd_req, which, opcode, offset, length, 2238 + 0, 0); 2247 2239 if (type == OBJ_REQUEST_BIO) 2248 - osd_req_op_extent_osd_data_bio(osd_req, 0, 2240 + osd_req_op_extent_osd_data_bio(osd_req, which, 2249 2241 obj_request->bio_list, length); 2250 2242 else 2251 - osd_req_op_extent_osd_data_pages(osd_req, 0, 2243 + osd_req_op_extent_osd_data_pages(osd_req, which, 2252 2244 obj_request->pages, length, 2253 2245 offset & ~PAGE_MASK, false, false); 2254 2246 ··· 2273 2249 2274 2250 return 0; 2275 2251 2276 - out_partial: 2277 - rbd_obj_request_put(obj_request); 2278 2252 out_unwind: 2279 2253 for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2280 - rbd_obj_request_put(obj_request); 2254 + rbd_img_obj_request_del(img_request, obj_request); 2281 2255 2282 2256 return -ENOMEM; 2283 2257 } ··· 2375 2353 2376 2354 /* 2377 2355 * The original osd request is of no use to use any more. 2378 - * We need a new one that can hold the two ops in a copyup 2356 + * We need a new one that can hold the three ops in a copyup 2379 2357 * request. Allocate the new copyup osd request for the 2380 2358 * original request, and release the old one. 
2381 2359 */ ··· 2394 2372 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 2395 2373 false, false); 2396 2374 2397 - /* Then the original write request op */ 2375 + /* Then the hint op */ 2376 + 2377 + osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), 2378 + rbd_obj_bytes(&rbd_dev->header)); 2379 + 2380 + /* And the original write request op */ 2398 2381 2399 2382 offset = orig_request->offset; 2400 2383 length = orig_request->length; 2401 - osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 2384 + osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, 2402 2385 offset, length, 0, 0); 2403 2386 if (orig_request->type == OBJ_REQUEST_BIO) 2404 - osd_req_op_extent_osd_data_bio(osd_req, 1, 2387 + osd_req_op_extent_osd_data_bio(osd_req, 2, 2405 2388 orig_request->bio_list, length); 2406 2389 else 2407 - osd_req_op_extent_osd_data_pages(osd_req, 1, 2390 + osd_req_op_extent_osd_data_pages(osd_req, 2, 2408 2391 orig_request->pages, length, 2409 2392 offset & ~PAGE_MASK, false, false); 2410 2393 ··· 2630 2603 2631 2604 rbd_assert(obj_request->img_request); 2632 2605 rbd_dev = obj_request->img_request->rbd_dev; 2633 - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2634 - stat_request); 2606 + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2607 + stat_request); 2635 2608 if (!stat_request->osd_req) 2636 2609 goto out; 2637 2610 stat_request->callback = rbd_img_obj_exists_callback; ··· 2834 2807 return -ENOMEM; 2835 2808 2836 2809 ret = -ENOMEM; 2837 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2810 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2811 + obj_request); 2838 2812 if (!obj_request->osd_req) 2839 2813 goto out; 2840 2814 ··· 2898 2870 if (!obj_request) 2899 2871 goto out_cancel; 2900 2872 2901 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2873 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2874 + obj_request); 2902 2875 if (!obj_request->osd_req) 2903 2876 goto out_cancel; 2904 2877 ··· 3007 2978 obj_request->pages = pages; 3008 2979 obj_request->page_count = page_count; 3009 2980 3010 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2981 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2982 + obj_request); 3011 2983 if (!obj_request->osd_req) 3012 2984 goto out; 3013 2985 ··· 3241 3211 obj_request->pages = pages; 3242 3212 obj_request->page_count = page_count; 3243 3213 3244 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 3214 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3215 + obj_request); 3245 3216 if (!obj_request->osd_req) 3246 3217 goto out; 3247 3218
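As a quick illustration of what the rbd.c changes above amount to (a condensed sketch built from the same helpers as the diff, not a verbatim excerpt): every rbd data write now becomes a two-op OSD request, an allocation hint sized to the rbd object followed by the actual extent write, which is why rbd_osd_req_create() grew a num_ops argument and copyup requests grew to three ops.

    /* Condensed sketch of the new write path in rbd_img_request_fill(). */
    unsigned int which = 0;
    struct ceph_osd_request *osd_req;

    /* write_request == true, so two ops: hint + write */
    osd_req = rbd_osd_req_create(rbd_dev, true, 2, obj_request);

    /* op 0: tell the OSD to expect object-sized objects and writes */
    osd_req_op_alloc_hint_init(osd_req, which,
                               rbd_obj_bytes(&rbd_dev->header),  /* expected_object_size */
                               rbd_obj_bytes(&rbd_dev->header)); /* expected_write_size */
    which++;

    /* op 1: the write itself, with its data attached at the same op index */
    osd_req_op_extent_init(osd_req, which, CEPH_OSD_OP_WRITE, offset, length, 0, 0);
    osd_req_op_extent_osd_data_bio(osd_req, which, obj_request->bio_list, length);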
+1
fs/ceph/cache.c
··· 205 205 ci->fscache = fscache_acquire_cookie(fsc->fscache, 206 206 &ceph_fscache_inode_object_def, 207 207 ci, true); 208 + fscache_check_consistency(ci->fscache); 208 209 done: 209 210 mutex_unlock(&inode->i_mutex); 210 211
+10
fs/ceph/cache.h
··· 48 48 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); 49 49 void ceph_queue_revalidate(struct inode *inode); 50 50 51 + static inline void ceph_fscache_update_objectsize(struct inode *inode) 52 + { 53 + struct ceph_inode_info *ci = ceph_inode(inode); 54 + fscache_attr_changed(ci->fscache); 55 + } 56 + 51 57 static inline void ceph_fscache_invalidate(struct inode *inode) 52 58 { 53 59 fscache_invalidate(ceph_inode(inode)->fscache); ··· 138 132 139 133 static inline void ceph_readpage_to_fscache(struct inode *inode, 140 134 struct page *page) 135 + { 136 + } 137 + 138 + static inline void ceph_fscache_update_objectsize(struct inode *inode) 141 139 { 142 140 } 143 141
+7 -2
fs/ceph/caps.c
··· 622 622 623 623 if (flags & CEPH_CAP_FLAG_AUTH) { 624 624 if (ci->i_auth_cap == NULL || 625 - ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 625 + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { 626 626 ci->i_auth_cap = cap; 627 + cap->mds_wanted = wanted; 628 + } 627 629 ci->i_cap_exporting_issued = 0; 628 630 } else { 629 631 WARN_ON(ci->i_auth_cap == cap); ··· 887 885 cap = rb_entry(p, struct ceph_cap, ci_node); 888 886 if (!__cap_is_valid(cap)) 889 887 continue; 890 - mds_wanted |= cap->mds_wanted; 888 + if (cap == ci->i_auth_cap) 889 + mds_wanted |= cap->mds_wanted; 890 + else 891 + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); 891 892 } 892 893 return mds_wanted; 893 894 }
+4 -1
fs/ceph/debugfs.c
··· 93 93 } else if (req->r_path1) { 94 94 seq_printf(s, " #%llx/%s", req->r_ino1.ino, 95 95 req->r_path1); 96 + } else { 97 + seq_printf(s, " #%llx", req->r_ino1.ino); 96 98 } 97 99 98 100 if (req->r_old_dentry) { ··· 104 102 path = NULL; 105 103 spin_lock(&req->r_old_dentry->d_lock); 106 104 seq_printf(s, " #%llx/%.*s (%s)", 107 - ceph_ino(req->r_old_dentry_dir), 105 + req->r_old_dentry_dir ? 106 + ceph_ino(req->r_old_dentry_dir) : 0, 108 107 req->r_old_dentry->d_name.len, 109 108 req->r_old_dentry->d_name.name, 110 109 path ? path : "");
+28 -25
fs/ceph/dir.c
··· 119 119 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 120 120 * the MDS if/when the directory is modified). 121 121 */ 122 - static int __dcache_readdir(struct file *file, struct dir_context *ctx) 122 + static int __dcache_readdir(struct file *file, struct dir_context *ctx, 123 + u32 shared_gen) 123 124 { 124 125 struct ceph_file_info *fi = file->private_data; 125 126 struct dentry *parent = file->f_dentry; ··· 134 133 last = fi->dentry; 135 134 fi->dentry = NULL; 136 135 137 - dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, 138 - last); 136 + dout("__dcache_readdir %p v%u at %llu (last %p)\n", 137 + dir, shared_gen, ctx->pos, last); 139 138 140 139 spin_lock(&parent->d_lock); 141 140 ··· 162 161 goto out_unlock; 163 162 } 164 163 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 165 - if (!d_unhashed(dentry) && dentry->d_inode && 164 + if (di->lease_shared_gen == shared_gen && 165 + !d_unhashed(dentry) && dentry->d_inode && 166 166 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 167 167 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 168 168 fpos_cmp(ctx->pos, di->offset) <= 0) ··· 192 190 if (last) { 193 191 /* remember our position */ 194 192 fi->dentry = last; 195 - fi->next_offset = di->offset; 193 + fi->next_offset = fpos_off(di->offset); 196 194 } 197 195 dput(dentry); 198 196 return 0; ··· 254 252 int err; 255 253 u32 ftype; 256 254 struct ceph_mds_reply_info_parsed *rinfo; 257 - const int max_entries = fsc->mount_options->max_readdir; 258 - const int max_bytes = fsc->mount_options->max_readdir_bytes; 259 255 260 256 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 261 257 if (fi->flags & CEPH_F_ATEND) ··· 291 291 ceph_snap(inode) != CEPH_SNAPDIR && 292 292 __ceph_dir_is_complete(ci) && 293 293 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 294 + u32 shared_gen = ci->i_shared_gen; 294 295 spin_unlock(&ci->i_ceph_lock); 295 - err = __dcache_readdir(file, ctx); 296 + err = __dcache_readdir(file, ctx, shared_gen); 296 297 if (err != -EAGAIN) 297 298 return err; 298 299 } else { ··· 323 322 fi->last_readdir = NULL; 324 323 } 325 324 326 - /* requery frag tree, as the frag topology may have changed */ 327 - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 328 - 329 325 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 330 326 ceph_vinop(inode), frag, fi->last_name); 331 327 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 332 328 if (IS_ERR(req)) 333 329 return PTR_ERR(req); 330 + err = ceph_alloc_readdir_reply_buffer(req, inode); 331 + if (err) { 332 + ceph_mdsc_put_request(req); 333 + return err; 334 + } 334 335 req->r_inode = inode; 335 336 ihold(inode); 336 337 req->r_dentry = dget(file->f_dentry); ··· 343 340 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 344 341 req->r_readdir_offset = fi->next_offset; 345 342 req->r_args.readdir.frag = cpu_to_le32(frag); 346 - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 347 - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 348 - req->r_num_caps = max_entries + 1; 349 343 err = ceph_mdsc_do_request(mdsc, NULL, req); 350 344 if (err < 0) { 351 345 ceph_mdsc_put_request(req); ··· 369 369 fi->next_offset = 0; 370 370 off = fi->next_offset; 371 371 } 372 + fi->frag = frag; 372 373 fi->offset = fi->next_offset; 373 374 fi->last_readdir = req; 374 - fi->frag = frag; 375 375 376 376 if (req->r_reply_info.dir_end) { 377 377 kfree(fi->last_name); ··· 454 454 return 0; 455 455 } 456 456 457 - static void reset_readdir(struct 
ceph_file_info *fi) 457 + static void reset_readdir(struct ceph_file_info *fi, unsigned frag) 458 458 { 459 459 if (fi->last_readdir) { 460 460 ceph_mdsc_put_request(fi->last_readdir); ··· 462 462 } 463 463 kfree(fi->last_name); 464 464 fi->last_name = NULL; 465 - fi->next_offset = 2; /* compensate for . and .. */ 465 + if (ceph_frag_is_leftmost(frag)) 466 + fi->next_offset = 2; /* compensate for . and .. */ 467 + else 468 + fi->next_offset = 0; 466 469 if (fi->dentry) { 467 470 dput(fi->dentry); 468 471 fi->dentry = NULL; ··· 477 474 { 478 475 struct ceph_file_info *fi = file->private_data; 479 476 struct inode *inode = file->f_mapping->host; 480 - loff_t old_offset = offset; 477 + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); 481 478 loff_t retval; 482 479 483 480 mutex_lock(&inode->i_mutex); ··· 494 491 goto out; 495 492 } 496 493 497 - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 494 + if (offset >= 0) { 498 495 if (offset != file->f_pos) { 499 496 file->f_pos = offset; 500 497 file->f_version = 0; ··· 507 504 * seek to new frag, or seek prior to current chunk. 508 505 */ 509 506 if (offset == 0 || 510 - fpos_frag(offset) != fpos_frag(old_offset) || 507 + fpos_frag(offset) != fi->frag || 511 508 fpos_off(offset) < fi->offset) { 512 509 dout("dir_llseek dropping %p content\n", file); 513 - reset_readdir(fi); 510 + reset_readdir(fi, fpos_frag(offset)); 514 511 } 515 512 516 513 /* bump dir_release_count if we did a forward seek */ 517 - if (offset > old_offset) 514 + if (fpos_cmp(offset, old_offset) > 0) 518 515 fi->dir_release_count--; 519 516 } 520 517 out: ··· 815 812 } 816 813 req->r_dentry = dget(dentry); 817 814 req->r_num_caps = 2; 818 - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 819 - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 815 + req->r_old_dentry = dget(old_dentry); 820 816 req->r_locked_dir = dir; 821 817 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 822 818 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; ··· 913 911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 914 912 if (IS_ERR(req)) 915 913 return PTR_ERR(req); 914 + ihold(old_dir); 916 915 req->r_dentry = dget(new_dentry); 917 916 req->r_num_caps = 2; 918 917 req->r_old_dentry = dget(old_dentry); 919 - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 918 + req->r_old_dentry_dir = old_dir; 920 919 req->r_locked_dir = new_dir; 921 920 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 922 921 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
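The dir.c changes above consistently treat a directory file position as a (frag, offset) pair rather than a flat byte offset, which is why llseek now compares against fi->frag and uses fpos_cmp() instead of plain integer comparison. For reference, a rough sketch of the helpers involved (the real definitions live in fs/ceph/super.h and are not part of this diff):

    /* Sketch only: a readdir position packs the frag and the in-frag offset. */
    static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
    {
            return ((loff_t)frag << 32) | (loff_t)off;   /* frag in the high 32 bits */
    }

    static inline unsigned fpos_frag(loff_t p)
    {
            return p >> 32;                              /* which directory fragment */
    }

    static inline unsigned fpos_off(loff_t p)
    {
            return p & 0xffffffff;                       /* offset within the fragment */
    }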
+123 -150
fs/ceph/export.c
··· 8 8 #include "mds_client.h" 9 9 10 10 /* 11 - * NFS export support 12 - * 13 - * NFS re-export of a ceph mount is, at present, only semireliable. 14 - * The basic issue is that the Ceph architectures doesn't lend itself 15 - * well to generating filehandles that will remain valid forever. 16 - * 17 - * So, we do our best. If you're lucky, your inode will be in the 18 - * client's cache. If it's not, and you have a connectable fh, then 19 - * the MDS server may be able to find it for you. Otherwise, you get 20 - * ESTALE. 21 - * 22 - * There are ways to this more reliable, but in the non-connectable fh 23 - * case, we won't every work perfectly, and in the connectable case, 24 - * some changes are needed on the MDS side to work better. 25 - */ 26 - 27 - /* 28 11 * Basic fh 29 12 */ 30 13 struct ceph_nfs_fh { ··· 15 32 } __attribute__ ((packed)); 16 33 17 34 /* 18 - * Larger 'connectable' fh that includes parent ino and name hash. 19 - * Use this whenever possible, as it works more reliably. 35 + * Larger fh that includes parent ino. 20 36 */ 21 37 struct ceph_nfs_confh { 22 38 u64 ino, parent_ino; 23 - u32 parent_name_hash; 24 39 } __attribute__ ((packed)); 25 40 26 - /* 27 - * The presence of @parent_inode here tells us whether NFS wants a 28 - * connectable file handle. However, we want to make a connectionable 29 - * file handle unconditionally so that the MDS gets as much of a hint 30 - * as possible. That means we only use @parent_dentry to indicate 31 - * whether nfsd wants a connectable fh, and whether we should indicate 32 - * failure from a too-small @max_len. 33 - */ 34 41 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, 35 42 struct inode *parent_inode) 36 43 { ··· 29 56 struct ceph_nfs_confh *cfh = (void *)rawfh; 30 57 int connected_handle_length = sizeof(*cfh)/4; 31 58 int handle_length = sizeof(*fh)/4; 32 - struct dentry *dentry; 33 - struct dentry *parent; 34 59 35 60 /* don't re-export snaps */ 36 61 if (ceph_snap(inode) != CEPH_NOSNAP) 37 62 return -EINVAL; 38 63 39 - dentry = d_find_alias(inode); 40 - 41 - /* if we found an alias, generate a connectable fh */ 42 - if (*max_len >= connected_handle_length && dentry) { 43 - dout("encode_fh %p connectable\n", dentry); 44 - spin_lock(&dentry->d_lock); 45 - parent = dentry->d_parent; 46 - cfh->ino = ceph_ino(inode); 47 - cfh->parent_ino = ceph_ino(parent->d_inode); 48 - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, 49 - dentry); 64 + if (parent_inode && (*max_len < connected_handle_length)) { 50 65 *max_len = connected_handle_length; 51 - type = 2; 52 - spin_unlock(&dentry->d_lock); 53 - } else if (*max_len >= handle_length) { 54 - if (parent_inode) { 55 - /* nfsd wants connectable */ 56 - *max_len = connected_handle_length; 57 - type = FILEID_INVALID; 58 - } else { 59 - dout("encode_fh %p\n", dentry); 60 - fh->ino = ceph_ino(inode); 61 - *max_len = handle_length; 62 - type = 1; 63 - } 64 - } else { 66 + return FILEID_INVALID; 67 + } else if (*max_len < handle_length) { 65 68 *max_len = handle_length; 66 - type = FILEID_INVALID; 69 + return FILEID_INVALID; 67 70 } 68 - if (dentry) 69 - dput(dentry); 71 + 72 + if (parent_inode) { 73 + dout("encode_fh %llx with parent %llx\n", 74 + ceph_ino(inode), ceph_ino(parent_inode)); 75 + cfh->ino = ceph_ino(inode); 76 + cfh->parent_ino = ceph_ino(parent_inode); 77 + *max_len = connected_handle_length; 78 + type = FILEID_INO32_GEN_PARENT; 79 + } else { 80 + dout("encode_fh %llx\n", ceph_ino(inode)); 81 + fh->ino = ceph_ino(inode); 82 + *max_len = 
handle_length; 83 + type = FILEID_INO32_GEN; 84 + } 70 85 return type; 71 86 } 72 87 73 - /* 74 - * convert regular fh to dentry 75 - * 76 - * FIXME: we should try harder by querying the mds for the ino. 77 - */ 78 - static struct dentry *__fh_to_dentry(struct super_block *sb, 79 - struct ceph_nfs_fh *fh, int fh_len) 88 + static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) 80 89 { 81 90 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 82 91 struct inode *inode; ··· 66 111 struct ceph_vino vino; 67 112 int err; 68 113 69 - if (fh_len < sizeof(*fh) / 4) 70 - return ERR_PTR(-ESTALE); 71 - 72 - dout("__fh_to_dentry %llx\n", fh->ino); 73 - vino.ino = fh->ino; 114 + vino.ino = ino; 74 115 vino.snap = CEPH_NOSNAP; 75 116 inode = ceph_find_inode(sb, vino); 76 117 if (!inode) { ··· 90 139 91 140 dentry = d_obtain_alias(inode); 92 141 if (IS_ERR(dentry)) { 93 - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 94 - fh->ino, inode); 95 142 iput(inode); 96 143 return dentry; 97 144 } 98 145 err = ceph_init_dentry(dentry); 99 146 if (err < 0) { 100 - iput(inode); 147 + dput(dentry); 101 148 return ERR_PTR(err); 102 149 } 103 - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); 150 + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); 104 151 return dentry; 105 152 } 106 153 107 154 /* 108 - * convert connectable fh to dentry 155 + * convert regular fh to dentry 109 156 */ 110 - static struct dentry *__cfh_to_dentry(struct super_block *sb, 111 - struct ceph_nfs_confh *cfh, int fh_len) 157 + static struct dentry *ceph_fh_to_dentry(struct super_block *sb, 158 + struct fid *fid, 159 + int fh_len, int fh_type) 160 + { 161 + struct ceph_nfs_fh *fh = (void *)fid->raw; 162 + 163 + if (fh_type != FILEID_INO32_GEN && 164 + fh_type != FILEID_INO32_GEN_PARENT) 165 + return NULL; 166 + if (fh_len < sizeof(*fh) / 4) 167 + return NULL; 168 + 169 + dout("fh_to_dentry %llx\n", fh->ino); 170 + return __fh_to_dentry(sb, fh->ino); 171 + } 172 + 173 + static struct dentry *__get_parent(struct super_block *sb, 174 + struct dentry *child, u64 ino) 112 175 { 113 176 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 177 + struct ceph_mds_request *req; 114 178 struct inode *inode; 115 179 struct dentry *dentry; 116 - struct ceph_vino vino; 117 180 int err; 118 181 119 - if (fh_len < sizeof(*cfh) / 4) 120 - return ERR_PTR(-ESTALE); 182 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, 183 + USE_ANY_MDS); 184 + if (IS_ERR(req)) 185 + return ERR_CAST(req); 121 186 122 - dout("__cfh_to_dentry %llx (%llx/%x)\n", 123 - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); 124 - 125 - vino.ino = cfh->ino; 126 - vino.snap = CEPH_NOSNAP; 127 - inode = ceph_find_inode(sb, vino); 128 - if (!inode) { 129 - struct ceph_mds_request *req; 130 - 131 - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 132 - USE_ANY_MDS); 133 - if (IS_ERR(req)) 134 - return ERR_CAST(req); 135 - 136 - req->r_ino1 = vino; 137 - req->r_ino2.ino = cfh->parent_ino; 138 - req->r_ino2.snap = CEPH_NOSNAP; 139 - req->r_path2 = kmalloc(16, GFP_NOFS); 140 - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); 141 - req->r_num_caps = 1; 142 - err = ceph_mdsc_do_request(mdsc, NULL, req); 143 - inode = req->r_target_inode; 144 - if (inode) 145 - ihold(inode); 146 - ceph_mdsc_put_request(req); 147 - if (!inode) 148 - return ERR_PTR(err ? 
err : -ESTALE); 187 + if (child) { 188 + req->r_inode = child->d_inode; 189 + ihold(child->d_inode); 190 + } else { 191 + req->r_ino1 = (struct ceph_vino) { 192 + .ino = ino, 193 + .snap = CEPH_NOSNAP, 194 + }; 149 195 } 196 + req->r_num_caps = 1; 197 + err = ceph_mdsc_do_request(mdsc, NULL, req); 198 + inode = req->r_target_inode; 199 + if (inode) 200 + ihold(inode); 201 + ceph_mdsc_put_request(req); 202 + if (!inode) 203 + return ERR_PTR(-ENOENT); 150 204 151 205 dentry = d_obtain_alias(inode); 152 206 if (IS_ERR(dentry)) { 153 - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 154 - cfh->ino, inode); 155 207 iput(inode); 156 208 return dentry; 157 209 } 158 210 err = ceph_init_dentry(dentry); 159 211 if (err < 0) { 160 - iput(inode); 212 + dput(dentry); 161 213 return ERR_PTR(err); 162 214 } 163 - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); 215 + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", 216 + child ? ceph_ino(child->d_inode) : ino, 217 + dentry, ceph_vinop(inode)); 164 218 return dentry; 165 219 } 166 220 167 - static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, 168 - int fh_len, int fh_type) 221 + struct dentry *ceph_get_parent(struct dentry *child) 169 222 { 170 - if (fh_type == 1) 171 - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, 172 - fh_len); 173 - else 174 - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, 175 - fh_len); 223 + /* don't re-export snaps */ 224 + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) 225 + return ERR_PTR(-EINVAL); 226 + 227 + dout("get_parent %p ino %llx.%llx\n", 228 + child, ceph_vinop(child->d_inode)); 229 + return __get_parent(child->d_sb, child, 0); 176 230 } 177 231 178 232 /* 179 - * get parent, if possible. 180 - * 181 - * FIXME: we could do better by querying the mds to discover the 182 - * parent. 
233 + * convert regular fh to parent 183 234 */ 184 235 static struct dentry *ceph_fh_to_parent(struct super_block *sb, 185 - struct fid *fid, 236 + struct fid *fid, 186 237 int fh_len, int fh_type) 187 238 { 188 239 struct ceph_nfs_confh *cfh = (void *)fid->raw; 189 - struct ceph_vino vino; 190 - struct inode *inode; 191 240 struct dentry *dentry; 241 + 242 + if (fh_type != FILEID_INO32_GEN_PARENT) 243 + return NULL; 244 + if (fh_len < sizeof(*cfh) / 4) 245 + return NULL; 246 + 247 + dout("fh_to_parent %llx\n", cfh->parent_ino); 248 + dentry = __get_parent(sb, NULL, cfh->ino); 249 + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) 250 + dentry = __fh_to_dentry(sb, cfh->parent_ino); 251 + return dentry; 252 + } 253 + 254 + static int ceph_get_name(struct dentry *parent, char *name, 255 + struct dentry *child) 256 + { 257 + struct ceph_mds_client *mdsc; 258 + struct ceph_mds_request *req; 192 259 int err; 193 260 194 - if (fh_type == 1) 195 - return ERR_PTR(-ESTALE); 196 - if (fh_len < sizeof(*cfh) / 4) 197 - return ERR_PTR(-ESTALE); 261 + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; 262 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, 263 + USE_ANY_MDS); 264 + if (IS_ERR(req)) 265 + return PTR_ERR(req); 198 266 199 - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 200 - cfh->parent_name_hash); 267 + mutex_lock(&parent->d_inode->i_mutex); 201 268 202 - vino.ino = cfh->ino; 203 - vino.snap = CEPH_NOSNAP; 204 - inode = ceph_find_inode(sb, vino); 205 - if (!inode) 206 - return ERR_PTR(-ESTALE); 269 + req->r_inode = child->d_inode; 270 + ihold(child->d_inode); 271 + req->r_ino2 = ceph_vino(parent->d_inode); 272 + req->r_locked_dir = parent->d_inode; 273 + req->r_num_caps = 2; 274 + err = ceph_mdsc_do_request(mdsc, NULL, req); 207 275 208 - dentry = d_obtain_alias(inode); 209 - if (IS_ERR(dentry)) { 210 - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 211 - cfh->ino, inode); 212 - iput(inode); 213 - return dentry; 276 + mutex_unlock(&parent->d_inode->i_mutex); 277 + 278 + if (!err) { 279 + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 280 + memcpy(name, rinfo->dname, rinfo->dname_len); 281 + name[rinfo->dname_len] = 0; 282 + dout("get_name %p ino %llx.%llx name %s\n", 283 + child, ceph_vinop(child->d_inode), name); 284 + } else { 285 + dout("get_name %p ino %llx.%llx err %d\n", 286 + child, ceph_vinop(child->d_inode), err); 214 287 } 215 - err = ceph_init_dentry(dentry); 216 - if (err < 0) { 217 - iput(inode); 218 - return ERR_PTR(err); 219 - } 220 - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); 221 - return dentry; 288 + 289 + ceph_mdsc_put_request(req); 290 + return err; 222 291 } 223 292 224 293 const struct export_operations ceph_export_ops = { 225 294 .encode_fh = ceph_encode_fh, 226 295 .fh_to_dentry = ceph_fh_to_dentry, 227 296 .fh_to_parent = ceph_fh_to_parent, 297 + .get_parent = ceph_get_parent, 298 + .get_name = ceph_get_name, 228 299 };
+6 -2
fs/ceph/file.c
··· 210 210 ihold(inode); 211 211 212 212 req->r_num_caps = 1; 213 - if (flags & (O_CREAT|O_TRUNC)) 213 + if (flags & O_CREAT) 214 214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 215 215 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 216 216 iput(parent_inode); ··· 291 291 } 292 292 err = finish_open(file, dentry, ceph_open, opened); 293 293 } 294 - 295 294 out_err: 295 + if (!req->r_err && req->r_target_inode) 296 + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); 296 297 ceph_mdsc_put_request(req); 297 298 dout("atomic_open result=%d\n", err); 298 299 return err; ··· 971 970 goto retry_snap; 972 971 } 973 972 } else { 973 + loff_t old_size = inode->i_size; 974 974 /* 975 975 * No need to acquire the i_truncate_mutex. Because 976 976 * the MDS revokes Fwb caps before sending truncate ··· 982 980 written = generic_file_buffered_write(iocb, iov, nr_segs, 983 981 pos, &iocb->ki_pos, 984 982 count, 0); 983 + if (inode->i_size > old_size) 984 + ceph_fscache_update_objectsize(inode); 985 985 mutex_unlock(&inode->i_mutex); 986 986 } 987 987
+60 -16
fs/ceph/inode.c
··· 659 659 le32_to_cpu(info->time_warp_seq), 660 660 &ctime, &mtime, &atime); 661 661 662 - /* only update max_size on auth cap */ 663 - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 664 - ci->i_max_size != le64_to_cpu(info->max_size)) { 665 - dout("max_size %lld -> %llu\n", ci->i_max_size, 666 - le64_to_cpu(info->max_size)); 667 - ci->i_max_size = le64_to_cpu(info->max_size); 668 - } 669 - 670 662 ci->i_layout = info->layout; 671 663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 672 664 ··· 747 755 ci->i_max_offset = 2; 748 756 } 749 757 no_change: 758 + /* only update max_size on auth cap */ 759 + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 760 + ci->i_max_size != le64_to_cpu(info->max_size)) { 761 + dout("max_size %lld -> %llu\n", ci->i_max_size, 762 + le64_to_cpu(info->max_size)); 763 + ci->i_max_size = le64_to_cpu(info->max_size); 764 + } 765 + 750 766 spin_unlock(&ci->i_ceph_lock); 751 767 752 768 /* queue truncate if we saw i_size decrease */ ··· 1044 1044 session, req->r_request_started, -1, 1045 1045 &req->r_caps_reservation); 1046 1046 if (err < 0) 1047 - return err; 1047 + goto done; 1048 1048 } else { 1049 1049 WARN_ON_ONCE(1); 1050 + } 1051 + 1052 + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { 1053 + struct qstr dname; 1054 + struct dentry *dn, *parent; 1055 + 1056 + BUG_ON(!rinfo->head->is_target); 1057 + BUG_ON(req->r_dentry); 1058 + 1059 + parent = d_find_any_alias(dir); 1060 + BUG_ON(!parent); 1061 + 1062 + dname.name = rinfo->dname; 1063 + dname.len = rinfo->dname_len; 1064 + dname.hash = full_name_hash(dname.name, dname.len); 1065 + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); 1066 + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1067 + retry_lookup: 1068 + dn = d_lookup(parent, &dname); 1069 + dout("d_lookup on parent=%p name=%.*s got %p\n", 1070 + parent, dname.len, dname.name, dn); 1071 + 1072 + if (!dn) { 1073 + dn = d_alloc(parent, &dname); 1074 + dout("d_alloc %p '%.*s' = %p\n", parent, 1075 + dname.len, dname.name, dn); 1076 + if (dn == NULL) { 1077 + dput(parent); 1078 + err = -ENOMEM; 1079 + goto done; 1080 + } 1081 + err = ceph_init_dentry(dn); 1082 + if (err < 0) { 1083 + dput(dn); 1084 + dput(parent); 1085 + goto done; 1086 + } 1087 + } else if (dn->d_inode && 1088 + (ceph_ino(dn->d_inode) != vino.ino || 1089 + ceph_snap(dn->d_inode) != vino.snap)) { 1090 + dout(" dn %p points to wrong inode %p\n", 1091 + dn, dn->d_inode); 1092 + d_delete(dn); 1093 + dput(dn); 1094 + goto retry_lookup; 1095 + } 1096 + 1097 + req->r_dentry = dn; 1098 + dput(parent); 1050 1099 } 1051 1100 } 1052 1101 ··· 1112 1063 1113 1064 err = fill_inode(in, &rinfo->targeti, NULL, 1114 1065 session, req->r_request_started, 1115 - (le32_to_cpu(rinfo->head->result) == 0) ? 1066 + (!req->r_aborted && rinfo->head->result == 0) ? 
1116 1067 req->r_fmode : -1, 1117 1068 &req->r_caps_reservation); 1118 1069 if (err < 0) { ··· 1665 1616 .getxattr = ceph_getxattr, 1666 1617 .listxattr = ceph_listxattr, 1667 1618 .removexattr = ceph_removexattr, 1668 - .get_acl = ceph_get_acl, 1669 - .set_acl = ceph_set_acl, 1670 1619 }; 1671 1620 1672 1621 /* ··· 1674 1627 { 1675 1628 struct inode *inode = dentry->d_inode; 1676 1629 struct ceph_inode_info *ci = ceph_inode(inode); 1677 - struct inode *parent_inode; 1678 1630 const unsigned int ia_valid = attr->ia_valid; 1679 1631 struct ceph_mds_request *req; 1680 1632 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; ··· 1865 1819 req->r_inode_drop = release; 1866 1820 req->r_args.setattr.mask = cpu_to_le32(mask); 1867 1821 req->r_num_caps = 1; 1868 - parent_inode = ceph_get_dentry_parent_inode(dentry); 1869 - err = ceph_mdsc_do_request(mdsc, parent_inode, req); 1870 - iput(parent_inode); 1822 + err = ceph_mdsc_do_request(mdsc, NULL, req); 1871 1823 } 1872 1824 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, 1873 1825 ceph_cap_string(dirtied), mask);
+1 -4
fs/ceph/ioctl.c
··· 64 64 static long ceph_ioctl_set_layout(struct file *file, void __user *arg) 65 65 { 66 66 struct inode *inode = file_inode(file); 67 - struct inode *parent_inode; 68 67 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 69 68 struct ceph_mds_request *req; 70 69 struct ceph_ioctl_layout l; ··· 120 121 cpu_to_le32(l.object_size); 121 122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); 122 123 123 - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 124 - err = ceph_mdsc_do_request(mdsc, parent_inode, req); 125 - iput(parent_inode); 124 + err = ceph_mdsc_do_request(mdsc, NULL, req); 126 125 ceph_mdsc_put_request(req); 127 126 return err; 128 127 }
+62 -32
fs/ceph/locks.c
··· 2 2 3 3 #include <linux/file.h> 4 4 #include <linux/namei.h> 5 + #include <linux/random.h> 5 6 6 7 #include "super.h" 7 8 #include "mds_client.h" 8 9 #include <linux/ceph/pagelist.h> 10 + 11 + static u64 lock_secret; 12 + 13 + static inline u64 secure_addr(void *addr) 14 + { 15 + u64 v = lock_secret ^ (u64)(unsigned long)addr; 16 + /* 17 + * Set the most significant bit, so that MDS knows the 'owner' 18 + * is sufficient to identify the owner of lock. (old code uses 19 + * both 'owner' and 'pid') 20 + */ 21 + v |= (1ULL << 63); 22 + return v; 23 + } 24 + 25 + void __init ceph_flock_init(void) 26 + { 27 + get_random_bytes(&lock_secret, sizeof(lock_secret)); 28 + } 9 29 10 30 /** 11 31 * Implement fcntl and flock locking functions. ··· 34 14 int cmd, u8 wait, struct file_lock *fl) 35 15 { 36 16 struct inode *inode = file_inode(file); 37 - struct ceph_mds_client *mdsc = 38 - ceph_sb_to_client(inode->i_sb)->mdsc; 17 + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 39 18 struct ceph_mds_request *req; 40 19 int err; 41 20 u64 length = 0; 21 + u64 owner; 42 22 43 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 44 24 if (IS_ERR(req)) ··· 52 32 else 53 33 length = fl->fl_end - fl->fl_start + 1; 54 34 55 - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 56 - "length: %llu, wait: %d, type: %d", (int)lock_type, 57 - (int)operation, (u64)fl->fl_pid, fl->fl_start, 58 - length, wait, fl->fl_type); 35 + if (lock_type == CEPH_LOCK_FCNTL) 36 + owner = secure_addr(fl->fl_owner); 37 + else 38 + owner = secure_addr(fl->fl_file); 39 + 40 + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " 41 + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, 42 + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, 43 + wait, fl->fl_type); 59 44 60 45 req->r_args.filelock_change.rule = lock_type; 61 46 req->r_args.filelock_change.type = cmd; 47 + req->r_args.filelock_change.owner = cpu_to_le64(owner); 62 48 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); 63 - /* This should be adjusted, but I'm not sure if 64 - namespaces actually get id numbers*/ 65 - req->r_args.filelock_change.pid_namespace = 66 - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); 67 49 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); 68 50 req->r_args.filelock_change.length = cpu_to_le64(length); 69 51 req->r_args.filelock_change.wait = wait; 70 52 71 53 err = ceph_mdsc_do_request(mdsc, inode, req); 72 54 73 - if ( operation == CEPH_MDS_OP_GETFILELOCK){ 55 + if (operation == CEPH_MDS_OP_GETFILELOCK) { 74 56 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); 75 57 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) 76 58 fl->fl_type = F_RDLCK; ··· 109 87 u8 wait = 0; 110 88 u16 op = CEPH_MDS_OP_SETFILELOCK; 111 89 112 - fl->fl_nspid = get_pid(task_tgid(current)); 113 - dout("ceph_lock, fl_pid:%d", fl->fl_pid); 90 + if (!(fl->fl_flags & FL_POSIX)) 91 + return -ENOLCK; 92 + /* No mandatory locks */ 93 + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) 94 + return -ENOLCK; 95 + 96 + dout("ceph_lock, fl_owner: %p", fl->fl_owner); 114 97 115 98 /* set wait bit as appropriate, then make command as Ceph expects it*/ 116 - if (F_SETLKW == cmd) 117 - wait = 1; 118 - if (F_GETLK == cmd) 99 + if (IS_GETLK(cmd)) 119 100 op = CEPH_MDS_OP_GETFILELOCK; 101 + else if (IS_SETLKW(cmd)) 102 + wait = 1; 120 103 121 104 if (F_RDLCK == fl->fl_type) 122 105 lock_cmd = CEPH_LOCK_SHARED; ··· 132 105 133 
106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); 134 107 if (!err) { 135 - if ( op != CEPH_MDS_OP_GETFILELOCK ){ 108 + if (op != CEPH_MDS_OP_GETFILELOCK) { 136 109 dout("mds locked, locking locally"); 137 110 err = posix_lock_file(file, fl, NULL); 138 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { ··· 158 131 { 159 132 u8 lock_cmd; 160 133 int err; 161 - u8 wait = 1; 134 + u8 wait = 0; 162 135 163 - fl->fl_nspid = get_pid(task_tgid(current)); 164 - dout("ceph_flock, fl_pid:%d", fl->fl_pid); 136 + if (!(fl->fl_flags & FL_FLOCK)) 137 + return -ENOLCK; 138 + /* No mandatory locks */ 139 + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) 140 + return -ENOLCK; 165 141 166 - /* set wait bit, then clear it out of cmd*/ 167 - if (cmd & LOCK_NB) 168 - wait = 0; 169 - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); 170 - /* set command sequence that Ceph wants to see: 171 - shared lock, exclusive lock, or unlock */ 172 - if (LOCK_SH == cmd) 142 + dout("ceph_flock, fl_file: %p", fl->fl_file); 143 + 144 + if (IS_SETLKW(cmd)) 145 + wait = 1; 146 + 147 + if (F_RDLCK == fl->fl_type) 173 148 lock_cmd = CEPH_LOCK_SHARED; 174 - else if (LOCK_EX == cmd) 149 + else if (F_WRLCK == fl->fl_type) 175 150 lock_cmd = CEPH_LOCK_EXCL; 176 151 else 177 152 lock_cmd = CEPH_LOCK_UNLOCK; ··· 309 280 struct ceph_filelock *cephlock) 310 281 { 311 282 int err = 0; 312 - 313 283 cephlock->start = cpu_to_le64(lock->fl_start); 314 284 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 315 285 cephlock->client = cpu_to_le64(0); 316 - cephlock->pid = cpu_to_le64(lock->fl_pid); 317 - cephlock->pid_namespace = 318 - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); 286 + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); 287 + if (lock->fl_flags & FL_POSIX) 288 + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); 289 + else 290 + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file)); 319 291 320 292 switch (lock->fl_type) { 321 293 case F_RDLCK:
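The locks.c rework above stops sending a (pid, pid_namespace) pair to the MDS and instead sends a single 64-bit owner token: fl_owner for fcntl locks, fl_file for flock, XORed with a boot-time random secret and with bit 63 set so the MDS knows the owner field alone identifies the lock holder. A standalone, userspace-style illustration of the property this buys (values hypothetical; the kernel draws lock_secret from get_random_bytes() at init):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t lock_secret = 0x243f6a8885a308d3ULL;  /* random at boot in the kernel */

    static uint64_t secure_addr(void *addr)
    {
            uint64_t v = lock_secret ^ (uint64_t)(uintptr_t)addr;
            return v | (1ULL << 63);  /* high bit: 'owner' alone identifies the holder */
    }

    int main(void)
    {
            int a, b;
            /* same owner -> same token on every request; different owners differ */
            printf("%llx\n%llx\n%llx\n",
                   (unsigned long long)secure_addr(&a),
                   (unsigned long long)secure_addr(&a),
                   (unsigned long long)secure_addr(&b));
            return 0;
    }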
+70 -27
fs/ceph/mds_client.c
··· 3 3 #include <linux/fs.h> 4 4 #include <linux/wait.h> 5 5 #include <linux/slab.h> 6 + #include <linux/gfp.h> 6 7 #include <linux/sched.h> 7 8 #include <linux/debugfs.h> 8 9 #include <linux/seq_file.h> ··· 166 165 if (num == 0) 167 166 goto done; 168 167 169 - /* alloc large array */ 170 - info->dir_nr = num; 171 - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + 172 - sizeof(*info->dir_dname) + 173 - sizeof(*info->dir_dname_len) + 174 - sizeof(*info->dir_dlease), 175 - GFP_NOFS); 176 - if (info->dir_in == NULL) { 177 - err = -ENOMEM; 178 - goto out_bad; 179 - } 168 + BUG_ON(!info->dir_in); 180 169 info->dir_dname = (void *)(info->dir_in + num); 181 170 info->dir_dname_len = (void *)(info->dir_dname + num); 182 171 info->dir_dlease = (void *)(info->dir_dname_len + num); 172 + if ((unsigned long)(info->dir_dlease + num) > 173 + (unsigned long)info->dir_in + info->dir_buf_size) { 174 + pr_err("dir contents are larger than expected\n"); 175 + WARN_ON(1); 176 + goto bad; 177 + } 183 178 179 + info->dir_nr = num; 184 180 while (num) { 185 181 /* dentry */ 186 182 ceph_decode_need(p, end, sizeof(u32)*2, bad); ··· 325 327 326 328 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 327 329 { 328 - kfree(info->dir_in); 330 + if (!info->dir_in) 331 + return; 332 + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); 329 333 } 330 334 331 335 ··· 512 512 struct ceph_mds_request *req = container_of(kref, 513 513 struct ceph_mds_request, 514 514 r_kref); 515 + destroy_reply_info(&req->r_reply_info); 515 516 if (req->r_request) 516 517 ceph_msg_put(req->r_request); 517 - if (req->r_reply) { 518 + if (req->r_reply) 518 519 ceph_msg_put(req->r_reply); 519 - destroy_reply_info(&req->r_reply_info); 520 - } 521 520 if (req->r_inode) { 522 521 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 523 522 iput(req->r_inode); ··· 527 528 iput(req->r_target_inode); 528 529 if (req->r_dentry) 529 530 dput(req->r_dentry); 530 - if (req->r_old_dentry) { 531 + if (req->r_old_dentry) 532 + dput(req->r_old_dentry); 533 + if (req->r_old_dentry_dir) { 531 534 /* 532 535 * track (and drop pins for) r_old_dentry_dir 533 536 * separately, since r_old_dentry's d_parent may have ··· 538 537 */ 539 538 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 540 539 CEPH_CAP_PIN); 541 - dput(req->r_old_dentry); 542 540 iput(req->r_old_dentry_dir); 543 541 } 544 542 kfree(req->r_path1); ··· 1311 1311 trim_caps - session->s_trim_caps); 1312 1312 session->s_trim_caps = 0; 1313 1313 } 1314 + 1315 + ceph_add_cap_releases(mdsc, session); 1316 + ceph_send_cap_releases(mdsc, session); 1314 1317 return 0; 1315 1318 } 1316 1319 ··· 1464 1461 1465 1462 dout("discard_cap_releases mds%d\n", session->s_mds); 1466 1463 1467 - /* zero out the in-progress message */ 1468 - msg = list_first_entry(&session->s_cap_releases, 1469 - struct ceph_msg, list_head); 1470 - head = msg->front.iov_base; 1471 - num = le32_to_cpu(head->num); 1472 - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1473 - head->num = cpu_to_le32(0); 1474 - msg->front.iov_len = sizeof(*head); 1475 - session->s_num_cap_releases += num; 1464 + if (!list_empty(&session->s_cap_releases)) { 1465 + /* zero out the in-progress message */ 1466 + msg = list_first_entry(&session->s_cap_releases, 1467 + struct ceph_msg, list_head); 1468 + head = msg->front.iov_base; 1469 + num = le32_to_cpu(head->num); 1470 + dout("discard_cap_releases mds%d %p %u\n", 1471 + session->s_mds, msg, num); 1472 + head->num = cpu_to_le32(0); 
1473 + msg->front.iov_len = sizeof(*head); 1474 + session->s_num_cap_releases += num; 1475 + } 1476 1476 1477 1477 /* requeue completed messages */ 1478 1478 while (!list_empty(&session->s_cap_releases_done)) { ··· 1497 1491 /* 1498 1492 * requests 1499 1493 */ 1494 + 1495 + int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, 1496 + struct inode *dir) 1497 + { 1498 + struct ceph_inode_info *ci = ceph_inode(dir); 1499 + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1500 + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; 1501 + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + 1502 + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); 1503 + int order, num_entries; 1504 + 1505 + spin_lock(&ci->i_ceph_lock); 1506 + num_entries = ci->i_files + ci->i_subdirs; 1507 + spin_unlock(&ci->i_ceph_lock); 1508 + num_entries = max(num_entries, 1); 1509 + num_entries = min(num_entries, opt->max_readdir); 1510 + 1511 + order = get_order(size * num_entries); 1512 + while (order >= 0) { 1513 + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, 1514 + order); 1515 + if (rinfo->dir_in) 1516 + break; 1517 + order--; 1518 + } 1519 + if (!rinfo->dir_in) 1520 + return -ENOMEM; 1521 + 1522 + num_entries = (PAGE_SIZE << order) / size; 1523 + num_entries = min(num_entries, opt->max_readdir); 1524 + 1525 + rinfo->dir_buf_size = PAGE_SIZE << order; 1526 + req->r_num_caps = num_entries + 1; 1527 + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); 1528 + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); 1529 + return 0; 1530 + } 1500 1531 1501 1532 /* 1502 1533 * Create an mds request. ··· 2096 2053 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 2097 2054 if (req->r_locked_dir) 2098 2055 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 2099 - if (req->r_old_dentry) 2056 + if (req->r_old_dentry_dir) 2100 2057 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), 2101 2058 CEPH_CAP_PIN); 2102 2059
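To get a feel for the sizing logic in ceph_alloc_readdir_reply_buffer() above (the numbers here are hypothetical, since the per-entry size depends on the struct layouts): with, say, ~40 bytes of bookkeeping per entry and a directory whose i_files + i_subdirs reports 1000 entries, get_order(40 * 1000) requests an order-4 allocation (64 KiB with 4 KiB pages); if that fails, the loop retries at order 3 and below rather than failing the readdir outright, and the max_entries ultimately advertised to the MDS is whatever fits in the pages actually obtained, still capped by the max_readdir mount option. This replaces the fixed max_entries/max_bytes request that dir.c used to fill in itself.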
+3 -1
fs/ceph/mds_client.h
··· 67 67 /* for readdir results */ 68 68 struct { 69 69 struct ceph_mds_reply_dirfrag *dir_dir; 70 + size_t dir_buf_size; 70 71 int dir_nr; 71 72 char **dir_dname; 72 73 u32 *dir_dname_len; ··· 347 346 struct dentry *dn); 348 347 349 348 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 350 - 349 + extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, 350 + struct inode *dir); 351 351 extern struct ceph_mds_request * 352 352 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 353 353 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+1
fs/ceph/strings.c
··· 54 54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 55 55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 56 56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; 57 + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; 57 58 case CEPH_MDS_OP_GETATTR: return "getattr"; 58 59 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 59 60 case CEPH_MDS_OP_SETATTR: return "setattr";
+1
fs/ceph/super.c
··· 1026 1026 if (ret) 1027 1027 goto out; 1028 1028 1029 + ceph_flock_init(); 1029 1030 ceph_xattr_init(); 1030 1031 ret = register_filesystem(&ceph_fs_type); 1031 1032 if (ret)
+2 -1
fs/ceph/super.h
··· 577 577 578 578 /* readdir: position within a frag */ 579 579 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 580 - u64 next_offset; /* offset of next chunk (last_name's + 1) */ 580 + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ 581 581 char *last_name; /* last entry in previous chunk */ 582 582 struct dentry *dentry; /* next dentry (for dcache readdir) */ 583 583 int dir_release_count; ··· 871 871 extern const struct export_operations ceph_export_ops; 872 872 873 873 /* locks.c */ 874 + extern __init void ceph_flock_init(void); 874 875 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 875 876 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 876 877 extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
+29 -19
fs/ceph/xattr.c
··· 64 64 } 65 65 66 66 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 67 - size_t size) 67 + size_t size) 68 68 { 69 69 int ret; 70 70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 71 71 struct ceph_osd_client *osdc = &fsc->client->osdc; 72 72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 73 73 const char *pool_name; 74 + char buf[128]; 74 75 75 76 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 76 77 down_read(&osdc->map_sem); 77 78 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 78 - if (pool_name) 79 - ret = snprintf(val, size, 80 - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", 79 + if (pool_name) { 80 + size_t len = strlen(pool_name); 81 + ret = snprintf(buf, sizeof(buf), 82 + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", 81 83 (unsigned long long)ceph_file_layout_su(ci->i_layout), 82 84 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 83 - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 84 - pool_name); 85 - else 86 - ret = snprintf(val, size, 85 + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); 86 + if (!size) { 87 + ret += len; 88 + } else if (ret + len > size) { 89 + ret = -ERANGE; 90 + } else { 91 + memcpy(val, buf, ret); 92 + memcpy(val + ret, pool_name, len); 93 + ret += len; 94 + } 95 + } else { 96 + ret = snprintf(buf, sizeof(buf), 87 97 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", 88 98 (unsigned long long)ceph_file_layout_su(ci->i_layout), 89 99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 90 100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 91 101 (unsigned long long)pool); 92 - 102 + if (size) { 103 + if (ret <= size) 104 + memcpy(val, buf, ret); 105 + else 106 + ret = -ERANGE; 107 + } 108 + } 93 109 up_read(&osdc->map_sem); 94 110 return ret; 95 111 } ··· 231 215 .name_size = sizeof("ceph.dir.layout"), 232 216 .getxattr_cb = ceph_vxattrcb_layout, 233 217 .readonly = false, 234 - .hidden = false, 218 + .hidden = true, 235 219 .exists_cb = ceph_vxattrcb_layout_exists, 236 220 }, 237 221 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), ··· 258 242 .name_size = sizeof("ceph.file.layout"), 259 243 .getxattr_cb = ceph_vxattrcb_layout, 260 244 .readonly = false, 261 - .hidden = false, 245 + .hidden = true, 262 246 .exists_cb = ceph_vxattrcb_layout_exists, 263 247 }, 264 248 XATTR_LAYOUT_FIELD(file, layout, stripe_unit), ··· 858 842 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 859 843 struct inode *inode = dentry->d_inode; 860 844 struct ceph_inode_info *ci = ceph_inode(inode); 861 - struct inode *parent_inode; 862 845 struct ceph_mds_request *req; 863 846 struct ceph_mds_client *mdsc = fsc->mdsc; 864 847 int err; ··· 908 893 req->r_data_len = size; 909 894 910 895 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 911 - parent_inode = ceph_get_dentry_parent_inode(dentry); 912 - err = ceph_mdsc_do_request(mdsc, parent_inode, req); 913 - iput(parent_inode); 896 + err = ceph_mdsc_do_request(mdsc, NULL, req); 914 897 ceph_mdsc_put_request(req); 915 898 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 916 899 ··· 1032 1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1033 1020 struct ceph_mds_client *mdsc = fsc->mdsc; 1034 1021 struct inode *inode = dentry->d_inode; 1035 - struct inode *parent_inode; 1036 1022 struct ceph_mds_request *req; 1037 1023 int err; 1038 1024 ··· 1045 1033 req->r_num_caps = 1; 1046 1034 req->r_path2 = 
kstrdup(name, GFP_NOFS); 1047 1035 1048 - parent_inode = ceph_get_dentry_parent_inode(dentry); 1049 - err = ceph_mdsc_do_request(mdsc, parent_inode, req); 1050 - iput(parent_inode); 1036 + err = ceph_mdsc_do_request(mdsc, NULL, req); 1051 1037 ceph_mdsc_put_request(req); 1052 1038 return err; 1053 1039 }
+11 -1
include/linux/ceph/ceph_features.h
··· 43 43 #define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 44 44 #define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 45 45 #define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 46 + #define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */ 47 + /* The process supports new-style OSDMap encoding. Monitors also use 48 + this bit to determine if peers support NAK messages. */ 49 + #define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39) 50 + #define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) 51 + #define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) 52 + #define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ 46 53 47 54 /* 48 55 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature ··· 89 82 CEPH_FEATURE_OSDHASHPSPOOL | \ 90 83 CEPH_FEATURE_OSD_CACHEPOOL | \ 91 84 CEPH_FEATURE_CRUSH_V2 | \ 92 - CEPH_FEATURE_EXPORT_PEER) 85 + CEPH_FEATURE_EXPORT_PEER | \ 86 + CEPH_FEATURE_OSDMAP_ENC | \ 87 + CEPH_FEATURE_CRUSH_TUNABLES3 | \ 88 + CEPH_FEATURE_OSD_PRIMARY_AFFINITY) 93 89 94 90 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 95 91 (CEPH_FEATURE_NOSRCADDR | \
+3 -2
include/linux/ceph/ceph_fs.h
··· 332 332 CEPH_MDS_OP_LOOKUPHASH = 0x00102, 333 333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103, 334 334 CEPH_MDS_OP_LOOKUPINO = 0x00104, 335 + CEPH_MDS_OP_LOOKUPNAME = 0x00105, 335 336 336 337 CEPH_MDS_OP_SETXATTR = 0x01105, 337 338 CEPH_MDS_OP_RMXATTR = 0x01106, ··· 421 420 struct { 422 421 __u8 rule; /* currently fcntl or flock */ 423 422 __u8 type; /* shared, exclusive, remove*/ 423 + __le64 owner; /* owner of the lock */ 424 424 __le64 pid; /* process id requesting the lock */ 425 - __le64 pid_namespace; 426 425 __le64 start; /* initial location to lock */ 427 426 __le64 length; /* num bytes to lock from start */ 428 427 __u8 wait; /* will caller wait for lock to become available? */ ··· 533 532 __le64 start;/* file offset to start lock at */ 534 533 __le64 length; /* num bytes to lock; 0 for all following start */ 535 534 __le64 client; /* which client holds the lock */ 535 + __le64 owner; /* owner the lock */ 536 536 __le64 pid; /* process id holding the lock on the client */ 537 - __le64 pid_namespace; 538 537 __u8 type; /* shared lock, exclusive lock, or unlock */ 539 538 } __attribute__ ((packed)); 540 539
+10 -1
include/linux/ceph/osd_client.h
··· 43 43 }; 44 44 45 45 46 - #define CEPH_OSD_MAX_OP 2 46 + #define CEPH_OSD_MAX_OP 3 47 47 48 48 enum ceph_osd_data_type { 49 49 CEPH_OSD_DATA_TYPE_NONE = 0, ··· 76 76 77 77 struct ceph_osd_req_op { 78 78 u16 op; /* CEPH_OSD_OP_* */ 79 + u32 flags; /* CEPH_OSD_OP_FLAG_* */ 79 80 u32 payload_len; 80 81 union { 81 82 struct ceph_osd_data raw_data_in; ··· 103 102 u32 timeout; 104 103 __u8 flag; 105 104 } watch; 105 + struct { 106 + u64 expected_object_size; 107 + u64 expected_write_size; 108 + } alloc_hint; 106 109 }; 107 110 }; 108 111 ··· 298 293 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 299 294 unsigned int which, u16 opcode, 300 295 u64 cookie, u64 version, int flag); 296 + extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 297 + unsigned int which, 298 + u64 expected_object_size, 299 + u64 expected_write_size); 301 300 302 301 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 303 302 struct ceph_snap_context *snapc,
+45 -5
include/linux/ceph/osdmap.h
··· 41 41 char *name; 42 42 }; 43 43 44 + static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) 45 + { 46 + switch (pool->type) { 47 + case CEPH_POOL_TYPE_REP: 48 + return true; 49 + case CEPH_POOL_TYPE_EC: 50 + return false; 51 + default: 52 + BUG_ON(1); 53 + } 54 + } 55 + 44 56 struct ceph_object_locator { 45 57 s64 pool; 46 58 }; ··· 72 60 struct ceph_pg_mapping { 73 61 struct rb_node node; 74 62 struct ceph_pg pgid; 75 - int len; 76 - int osds[]; 63 + 64 + union { 65 + struct { 66 + int len; 67 + int osds[]; 68 + } pg_temp; 69 + struct { 70 + int osd; 71 + } primary_temp; 72 + }; 77 73 }; 78 74 79 75 struct ceph_osdmap { ··· 98 78 struct ceph_entity_addr *osd_addr; 99 79 100 80 struct rb_root pg_temp; 81 + struct rb_root primary_temp; 82 + 83 + u32 *osd_primary_affinity; 84 + 101 85 struct rb_root pg_pools; 102 86 u32 pool_max; 103 87 104 88 /* the CRUSH map specifies the mapping of placement groups to 105 89 * the list of osds that store+replicate them. */ 106 90 struct crush_map *crush; 91 + 92 + struct mutex crush_scratch_mutex; 93 + int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 107 94 }; 108 95 109 96 static inline void ceph_oid_set_name(struct ceph_object_id *oid, ··· 137 110 dest->name_len = src->name_len; 138 111 } 139 112 113 + static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) 114 + { 115 + return osd >= 0 && osd < map->max_osd && 116 + (map->osd_state[osd] & CEPH_OSD_EXISTS); 117 + } 118 + 140 119 static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 141 120 { 142 - return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); 121 + return ceph_osd_exists(map, osd) && 122 + (map->osd_state[osd] & CEPH_OSD_UP); 123 + } 124 + 125 + static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) 126 + { 127 + return !ceph_osd_is_up(map, osd); 143 128 } 144 129 145 130 static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) ··· 160 121 } 161 122 162 123 extern char *ceph_osdmap_state_str(char *str, int len, int state); 124 + extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); 163 125 164 126 static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, 165 127 int osd) ··· 193 153 return 0; 194 154 } 195 155 196 - extern struct ceph_osdmap *osdmap_decode(void **p, void *end); 156 + extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); 197 157 extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 198 158 struct ceph_osdmap *map, 199 159 struct ceph_messenger *msgr); ··· 212 172 213 173 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 214 174 struct ceph_pg pgid, 215 - int *acting); 175 + int *osds, int *primary); 216 176 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 217 177 struct ceph_pg pgid); 218 178
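The new ceph_osd_exists/is_up/is_down helpers are thin predicates over the per-osd state flags, and ceph_can_shift_osds() captures the difference between replicated pools (down osds can simply be dropped from a set) and erasure-coded pools (positions must be preserved). Below is a small userspace model of the state predicates; the flag values are assumptions for illustration, the real ones live in rados.h.

    /* Userspace model of the osd-state predicates.  The flag values are
     * assumed for illustration; the logic mirrors the inline helpers in
     * the hunk above. */
    #include <stdio.h>
    #include <stdint.h>

    #define OSD_EXISTS (1 << 0)   /* stand-in for CEPH_OSD_EXISTS */
    #define OSD_UP     (1 << 1)   /* stand-in for CEPH_OSD_UP */

    static uint8_t osd_state[4] = {
    	OSD_EXISTS | OSD_UP,  /* osd0: up */
    	OSD_EXISTS,           /* osd1: exists but down */
    	0,                    /* osd2: never created / destroyed */
    	OSD_EXISTS | OSD_UP,  /* osd3: up */
    };

    static int osd_exists(int max_osd, int osd)
    {
    	return osd >= 0 && osd < max_osd && (osd_state[osd] & OSD_EXISTS);
    }

    static int osd_is_up(int max_osd, int osd)
    {
    	return osd_exists(max_osd, osd) && (osd_state[osd] & OSD_UP);
    }

    int main(void)
    {
    	int osd;

    	for (osd = 0; osd < 4; osd++)
    		printf("osd%d exists=%d up=%d down=%d\n", osd,
    		       osd_exists(4, osd), osd_is_up(4, osd),
    		       !osd_is_up(4, osd));
    	return 0;
    }

Note that is_down is simply the negation of is_up, so an osd that does not exist also counts as down; the raw_to_up_osds() helper added later in osdmap.c relies on exactly that.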
+15 -3
include/linux/ceph/rados.h
··· 81 81 */ 82 82 #define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ 83 83 84 - #define CEPH_PG_TYPE_REP 1 85 - #define CEPH_PG_TYPE_RAID4 2 84 + #define CEPH_POOL_TYPE_REP 1 85 + #define CEPH_POOL_TYPE_RAID4 2 /* never implemented */ 86 + #define CEPH_POOL_TYPE_EC 3 86 87 87 88 /* 88 89 * stable_mod func is used to control number of placement groups. ··· 133 132 /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 134 133 #define CEPH_OSD_IN 0x10000 135 134 #define CEPH_OSD_OUT 0 135 + 136 + /* osd primary-affinity. fixed point value: 0x10000 == baseline */ 137 + #define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 138 + #define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 136 139 137 140 138 141 /* ··· 231 226 CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23, 232 227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, 233 228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, 229 + 230 + /* hints */ 231 + CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35, 234 232 235 233 /** multi **/ 236 234 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, ··· 390 382 */ 391 383 struct ceph_osd_op { 392 384 __le16 op; /* CEPH_OSD_OP_* */ 393 - __le32 flags; /* CEPH_OSD_FLAG_* */ 385 + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ 394 386 union { 395 387 struct { 396 388 __le64 offset, length; ··· 424 416 __le64 offset, length; 425 417 __le64 src_offset; 426 418 } __attribute__ ((packed)) clonerange; 419 + struct { 420 + __le64 expected_object_size; 421 + __le64 expected_write_size; 422 + } __attribute__ ((packed)) alloc_hint; 427 423 }; 428 424 __le32 payload_len; 429 425 } __attribute__ ((packed));
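Osd weights and the new primary-affinity values are both 16.16 fixed point, with 0x10000 meaning 1.0 ("in" / baseline). The debugfs dump later in this series turns them into percentages with a multiply and a shift; a standalone version of that conversion:

    /* 16.16 fixed point to percent, the same (value * 100) >> 16 used by
     * the debugfs dump later in the series. */
    #include <stdio.h>
    #include <stdint.h>

    static unsigned int fixed_to_percent(uint32_t v)
    {
    	return (unsigned int)(((uint64_t)v * 100) >> 16);
    }

    int main(void)
    {
    	printf("0x10000 -> %u%%\n", fixed_to_percent(0x10000)); /* 100%% */
    	printf("0x08000 -> %u%%\n", fixed_to_percent(0x8000));  /* 50%%  */
    	printf("0x00000 -> %u%%\n", fixed_to_percent(0));       /* 0%%   */
    	return 0;
    }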
+7
include/linux/crush/crush.h
··· 51 51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 52 52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 53 53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 54 + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 54 55 }; 55 56 56 57 /* ··· 174 173 * apply to a collision: in that case we will retry as we used 175 174 * to. */ 176 175 __u32 chooseleaf_descend_once; 176 + 177 + /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) 178 + * bits. a value of 1 is best for new clusters. for legacy clusters 179 + * that want to limit reshuffling, a value of 3 or 4 will make the 180 + * mappings line up a bit better with previous mappings. */ 181 + __u8 chooseleaf_vary_r; 177 182 }; 178 183 179 184
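chooseleaf_vary_r changes which r value the recursive chooseleaf descent starts from: rather than always 0, the child gets r >> (vary_r - 1), as the mapper.c hunk below implements. A small table of what that expression yields for a few retry counts; sub_r and the program around it are illustrative.

    /* What the recursive chooseleaf call receives for a given retry count
     * r and tunable vary_r, using the sub_r expression from mapper.c. */
    #include <stdio.h>

    static int sub_r(int r, int vary_r)
    {
    	return vary_r ? r >> (vary_r - 1) : 0;
    }

    int main(void)
    {
    	int r;

    	for (r = 0; r < 8; r++)
    		printf("r=%d  vary_r=0:%d  vary_r=1:%d  vary_r=3:%d\n",
    		       r, sub_r(r, 0), sub_r(r, 1), sub_r(r, 3));
    	/*
    	 * vary_r=1 passes r straight through (best for new clusters);
    	 * vary_r=3 shifts by two, so the child's r only changes every
    	 * fourth retry, keeping mappings close to the legacy vary_r=0
    	 * behaviour.
    	 */
    	return 0;
    }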
+58 -27
net/ceph/crush/mapper.c
··· 292 292 * @outpos: our position in that vector 293 293 * @tries: number of attempts to make 294 294 * @recurse_tries: number of attempts to have recursive chooseleaf make 295 - * @local_tries: localized retries 296 - * @local_fallback_tries: localized fallback retries 295 + * @local_retries: localized retries 296 + * @local_fallback_retries: localized fallback retries 297 297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 298 + * @vary_r: pass r to recursive calls 298 299 * @out2: second output vector for leaf items (if @recurse_to_leaf) 300 + * @parent_r: r value passed from the parent 299 301 */ 300 302 static int crush_choose_firstn(const struct crush_map *map, 301 303 struct crush_bucket *bucket, ··· 306 304 int *out, int outpos, 307 305 unsigned int tries, 308 306 unsigned int recurse_tries, 309 - unsigned int local_tries, 310 - unsigned int local_fallback_tries, 307 + unsigned int local_retries, 308 + unsigned int local_fallback_retries, 311 309 int recurse_to_leaf, 312 - int *out2) 310 + unsigned int vary_r, 311 + int *out2, 312 + int parent_r) 313 313 { 314 314 int rep; 315 315 unsigned int ftotal, flocal; ··· 323 319 int itemtype; 324 320 int collide, reject; 325 321 326 - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 327 - bucket->id, x, outpos, numrep); 322 + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 323 + recurse_to_leaf ? "_LEAF" : "", 324 + bucket->id, x, outpos, numrep, 325 + tries, recurse_tries, local_retries, local_fallback_retries, 326 + parent_r); 328 327 329 328 for (rep = outpos; rep < numrep; rep++) { 330 329 /* keep trying until we get a non-out, non-colliding item */ ··· 342 335 do { 343 336 collide = 0; 344 337 retry_bucket = 0; 345 - r = rep; 338 + r = rep + parent_r; 346 339 /* r' = r + f_total */ 347 340 r += ftotal; 348 341 ··· 351 344 reject = 1; 352 345 goto reject; 353 346 } 354 - if (local_fallback_tries > 0 && 347 + if (local_fallback_retries > 0 && 355 348 flocal >= (in->size>>1) && 356 - flocal > local_fallback_tries) 349 + flocal > local_fallback_retries) 357 350 item = bucket_perm_choose(in, x, r); 358 351 else 359 352 item = crush_bucket_choose(in, x, r); ··· 394 387 reject = 0; 395 388 if (!collide && recurse_to_leaf) { 396 389 if (item < 0) { 390 + int sub_r; 391 + if (vary_r) 392 + sub_r = r >> (vary_r-1); 393 + else 394 + sub_r = 0; 397 395 if (crush_choose_firstn(map, 398 396 map->buckets[-1-item], 399 397 weight, weight_max, 400 398 x, outpos+1, 0, 401 399 out2, outpos, 402 400 recurse_tries, 0, 403 - local_tries, 404 - local_fallback_tries, 401 + local_retries, 402 + local_fallback_retries, 405 403 0, 406 - NULL) <= outpos) 404 + vary_r, 405 + NULL, 406 + sub_r) <= outpos) 407 407 /* didn't get leaf */ 408 408 reject = 1; 409 409 } else { ··· 434 420 ftotal++; 435 421 flocal++; 436 422 437 - if (collide && flocal <= local_tries) 423 + if (collide && flocal <= local_retries) 438 424 /* retry locally a few times */ 439 425 retry_bucket = 1; 440 - else if (local_fallback_tries > 0 && 441 - flocal <= in->size + local_fallback_tries) 426 + else if (local_fallback_retries > 0 && 427 + flocal <= in->size + local_fallback_retries) 442 428 /* exhaustive bucket search */ 443 429 retry_bucket = 1; 444 - else if (ftotal <= tries) 430 + else if (ftotal < tries) 445 431 /* then retry descent */ 446 432 retry_descent = 1; 447 433 else ··· 654 640 __u32 step; 
655 641 int i, j; 656 642 int numrep; 657 - int choose_tries = map->choose_total_tries; 658 - int choose_local_tries = map->choose_local_tries; 659 - int choose_local_fallback_tries = map->choose_local_fallback_tries; 643 + /* 644 + * the original choose_total_tries value was off by one (it 645 + * counted "retries" and not "tries"). add one. 646 + */ 647 + int choose_tries = map->choose_total_tries + 1; 660 648 int choose_leaf_tries = 0; 649 + /* 650 + * the local tries values were counted as "retries", though, 651 + * and need no adjustment 652 + */ 653 + int choose_local_retries = map->choose_local_tries; 654 + int choose_local_fallback_retries = map->choose_local_fallback_tries; 655 + 656 + int vary_r = map->chooseleaf_vary_r; 661 657 662 658 if ((__u32)ruleno >= map->max_rules) { 663 659 dprintk(" bad ruleno %d\n", ruleno); ··· 700 676 break; 701 677 702 678 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 703 - if (curstep->arg1 > 0) 704 - choose_local_tries = curstep->arg1; 679 + if (curstep->arg1 >= 0) 680 + choose_local_retries = curstep->arg1; 705 681 break; 706 682 707 683 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 708 - if (curstep->arg1 > 0) 709 - choose_local_fallback_tries = curstep->arg1; 684 + if (curstep->arg1 >= 0) 685 + choose_local_fallback_retries = curstep->arg1; 686 + break; 687 + 688 + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: 689 + if (curstep->arg1 >= 0) 690 + vary_r = curstep->arg1; 710 691 break; 711 692 712 693 case CRUSH_RULE_CHOOSELEAF_FIRSTN: ··· 763 734 o+osize, j, 764 735 choose_tries, 765 736 recurse_tries, 766 - choose_local_tries, 767 - choose_local_fallback_tries, 737 + choose_local_retries, 738 + choose_local_fallback_retries, 768 739 recurse_to_leaf, 769 - c+osize); 740 + vary_r, 741 + c+osize, 742 + 0); 770 743 } else { 771 744 crush_choose_indep( 772 745 map,
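Besides renaming the local counters from *_tries to *_retries to match what they actually count, this hunk converts choose_total_tries from a retry count into a try count: one is added up front and the loop condition changes from ftotal <= tries to ftotal < tries, so the number of descents stays the same. A quick standalone check of that equivalence; the loop here is a stripped-down stand-in for the real retry loop.

    /* Show that "ftotal <= total_tries" (old) and
     * "ftotal < total_tries + 1" (new) allow the same number of descents. */
    #include <stdio.h>

    static int descents_old(int total_tries)
    {
    	int ftotal = 0, n = 0;

    	do {
    		n++;            /* one descent attempt */
    		ftotal++;
    	} while (ftotal <= total_tries);
    	return n;
    }

    static int descents_new(int total_tries)
    {
    	int tries = total_tries + 1;   /* the off-by-one adjustment */
    	int ftotal = 0, n = 0;

    	do {
    		n++;
    		ftotal++;
    	} while (ftotal < tries);
    	return n;
    }

    int main(void)
    {
    	int t;

    	for (t = 0; t <= 5; t++)
    		printf("total_tries=%d  old=%d  new=%d\n",
    		       t, descents_old(t), descents_new(t));
    	return 0;
    }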
+38 -17
net/ceph/debugfs.c
··· 53 53 { 54 54 int i; 55 55 struct ceph_client *client = s->private; 56 + struct ceph_osdmap *map = client->osdc.osdmap; 56 57 struct rb_node *n; 57 58 58 - if (client->osdc.osdmap == NULL) 59 + if (map == NULL) 59 60 return 0; 60 - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); 61 + 62 + seq_printf(s, "epoch %d\n", map->epoch); 61 63 seq_printf(s, "flags%s%s\n", 62 - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? 63 - " NEARFULL" : "", 64 - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? 65 - " FULL" : ""); 66 - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { 64 + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", 65 + (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); 66 + 67 + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { 67 68 struct ceph_pg_pool_info *pool = 68 69 rb_entry(n, struct ceph_pg_pool_info, node); 69 - seq_printf(s, "pg_pool %llu pg_num %d / %d\n", 70 - (unsigned long long)pool->id, pool->pg_num, 71 - pool->pg_num_mask); 70 + 71 + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", 72 + pool->id, pool->pg_num, pool->pg_num_mask, 73 + pool->read_tier, pool->write_tier); 72 74 } 73 - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 74 - struct ceph_entity_addr *addr = 75 - &client->osdc.osdmap->osd_addr[i]; 76 - int state = client->osdc.osdmap->osd_state[i]; 75 + for (i = 0; i < map->max_osd; i++) { 76 + struct ceph_entity_addr *addr = &map->osd_addr[i]; 77 + int state = map->osd_state[i]; 77 78 char sb[64]; 78 79 79 - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", 80 + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", 80 81 i, ceph_pr_addr(&addr->in_addr), 81 - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), 82 - ceph_osdmap_state_str(sb, sizeof(sb), state)); 82 + ((map->osd_weight[i]*100) >> 16), 83 + ceph_osdmap_state_str(sb, sizeof(sb), state), 84 + ((ceph_get_primary_affinity(map, i)*100) >> 16)); 83 85 } 86 + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { 87 + struct ceph_pg_mapping *pg = 88 + rb_entry(n, struct ceph_pg_mapping, node); 89 + 90 + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, 91 + pg->pgid.seed); 92 + for (i = 0; i < pg->pg_temp.len; i++) 93 + seq_printf(s, "%s%d", (i == 0 ? "" : ","), 94 + pg->pg_temp.osds[i]); 95 + seq_printf(s, "]\n"); 96 + } 97 + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { 98 + struct ceph_pg_mapping *pg = 99 + rb_entry(n, struct ceph_pg_mapping, node); 100 + 101 + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, 102 + pg->pgid.seed, pg->primary_temp.osd); 103 + } 104 + 84 105 return 0; 85 106 } 86 107
+6
net/ceph/messenger.c
··· 919 919 if (!bytes || cursor->page_offset) 920 920 return false; /* more bytes to process in the current page */ 921 921 922 + if (!cursor->resid) 923 + return false; /* no more data */ 924 + 922 925 /* Move on to the next page; offset is already at 0 */ 923 926 924 927 BUG_ON(cursor->page_index >= cursor->page_count); ··· 1006 1003 /* offset of first page in pagelist is always 0 */ 1007 1004 if (!bytes || cursor->offset & ~PAGE_MASK) 1008 1005 return false; /* more bytes to process in the current page */ 1006 + 1007 + if (!cursor->resid) 1008 + return false; /* no more data */ 1009 1009 1010 1010 /* Move on to the next page */ 1011 1011
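Both advance paths gain an early "no more data" bail-out so that consuming a zero-length or exactly page-sized data item does not step the cursor onto a page (or pagelist entry) that was never attached. A toy model of the advance step; the struct and sizes are invented, only the order of the checks mirrors the hunk.

    /* Toy model of a data cursor advancing over fixed-size pages.  The
     * "resid" early-out mirrors the checks added above: once nothing
     * remains, don't step to a next page that may not exist. */
    #include <stdio.h>
    #include <stdbool.h>

    #define PAGE_SZ 16

    struct cursor {
    	int resid;        /* bytes left in the data item */
    	int page_index;   /* current page */
    	int page_offset;  /* offset within current page */
    	int page_count;   /* pages actually attached */
    };

    static bool cursor_advance(struct cursor *c, int bytes)
    {
    	c->resid -= bytes;
    	c->page_offset = (c->page_offset + bytes) % PAGE_SZ;

    	if (!bytes || c->page_offset)
    		return false;   /* still inside the current page */

    	if (!c->resid)
    		return false;   /* no more data: don't touch page_index */

    	c->page_index++;        /* move on to the next page */
    	return true;
    }

    int main(void)
    {
    	struct cursor c = { .resid = PAGE_SZ, .page_count = 1 };

    	/* consume exactly one page; without the resid check this would
    	 * step to page 1, which was never allocated */
    	cursor_advance(&c, PAGE_SZ);
    	printf("page_index=%d (page_count=%d)\n", c.page_index, c.page_count);
    	return 0;
    }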
+34 -7
net/ceph/osd_client.c
··· 436 436 case CEPH_OSD_OP_OMAPCLEAR: 437 437 case CEPH_OSD_OP_OMAPRMKEYS: 438 438 case CEPH_OSD_OP_OMAP_CMP: 439 + case CEPH_OSD_OP_SETALLOCHINT: 439 440 case CEPH_OSD_OP_CLONERANGE: 440 441 case CEPH_OSD_OP_ASSERT_SRC_VERSION: 441 442 case CEPH_OSD_OP_SRC_CMPXATTR: ··· 592 591 } 593 592 EXPORT_SYMBOL(osd_req_op_watch_init); 594 593 594 + void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 595 + unsigned int which, 596 + u64 expected_object_size, 597 + u64 expected_write_size) 598 + { 599 + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, 600 + CEPH_OSD_OP_SETALLOCHINT); 601 + 602 + op->alloc_hint.expected_object_size = expected_object_size; 603 + op->alloc_hint.expected_write_size = expected_write_size; 604 + 605 + /* 606 + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed 607 + * not worth a feature bit. Set FAILOK per-op flag to make 608 + * sure older osds don't trip over an unsupported opcode. 609 + */ 610 + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; 611 + } 612 + EXPORT_SYMBOL(osd_req_op_alloc_hint_init); 613 + 595 614 static void ceph_osdc_msg_data_add(struct ceph_msg *msg, 596 615 struct ceph_osd_data *osd_data) 597 616 { ··· 702 681 dst->watch.ver = cpu_to_le64(src->watch.ver); 703 682 dst->watch.flag = src->watch.flag; 704 683 break; 684 + case CEPH_OSD_OP_SETALLOCHINT: 685 + dst->alloc_hint.expected_object_size = 686 + cpu_to_le64(src->alloc_hint.expected_object_size); 687 + dst->alloc_hint.expected_write_size = 688 + cpu_to_le64(src->alloc_hint.expected_write_size); 689 + break; 705 690 default: 706 691 pr_err("unsupported osd opcode %s\n", 707 692 ceph_osd_op_name(src->op)); ··· 715 688 716 689 return 0; 717 690 } 691 + 718 692 dst->op = cpu_to_le16(src->op); 693 + dst->flags = cpu_to_le32(src->flags); 719 694 dst->payload_len = cpu_to_le32(src->payload_len); 720 695 721 696 return request_data_len; ··· 1333 1304 { 1334 1305 struct ceph_pg pgid; 1335 1306 int acting[CEPH_PG_MAX_SIZE]; 1336 - int o = -1, num = 0; 1307 + int num, o; 1337 1308 int err; 1338 1309 bool was_paused; 1339 1310 ··· 1346 1317 } 1347 1318 req->r_pgid = pgid; 1348 1319 1349 - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 1350 - if (err > 0) { 1351 - o = acting[0]; 1352 - num = err; 1353 - } 1320 + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); 1321 + if (num < 0) 1322 + num = 0; 1354 1323 1355 1324 was_paused = req->r_paused; 1356 1325 req->r_paused = __req_should_be_paused(osdc, req); ··· 2060 2033 int skipped_map = 0; 2061 2034 2062 2035 dout("taking full map %u len %d\n", epoch, maplen); 2063 - newmap = osdmap_decode(&p, p+maplen); 2036 + newmap = ceph_osdmap_decode(&p, p+maplen); 2064 2037 if (IS_ERR(newmap)) { 2065 2038 err = PTR_ERR(newmap); 2066 2039 goto bad;
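osd_req_op_alloc_hint_init() tags the hint op with CEPH_OSD_OP_FLAG_FAILOK, so an osd that predates SETALLOCHINT fails only that op instead of the whole request. Below is a toy model of the server-side behaviour this relies on; the opcode numbers, flag value and dispatch function are all invented for illustration.

    /* Toy model of per-op FAILOK handling: an unknown advisory op is
     * skipped instead of failing the whole request.  Everything here
     * (opcode numbers, flag value, dispatch logic) is illustrative. */
    #include <stdio.h>

    #define OP_WRITE        1
    #define OP_SETALLOCHINT 2          /* unknown to an "old" server */
    #define OP_FLAG_FAILOK  0x1

    struct op {
    	int op;
    	int flags;
    };

    static int old_server_handle(const struct op *ops, int nops)
    {
    	int i;

    	for (i = 0; i < nops; i++) {
    		if (ops[i].op == OP_WRITE) {
    			printf("op %d: write ok\n", i);
    		} else if (ops[i].flags & OP_FLAG_FAILOK) {
    			printf("op %d: unknown opcode %d, FAILOK set, skipping\n",
    			       i, ops[i].op);
    		} else {
    			printf("op %d: unknown opcode %d, failing request\n",
    			       i, ops[i].op);
    			return -1;
    		}
    	}
    	return 0;
    }

    int main(void)
    {
    	struct op ops[] = {
    		{ OP_SETALLOCHINT, OP_FLAG_FAILOK },  /* advisory hint */
    		{ OP_WRITE, 0 },
    	};

    	return old_server_handle(ops, 2) ? 1 : 0;
    }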
+730 -267
net/ceph/osdmap.c
··· 343 343 344 344 /* 345 345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 346 - * to a set of osds) 346 + * to a set of osds) and primary_temp (explicit primary setting) 347 347 */ 348 348 static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 349 349 { ··· 506 506 kfree(pi); 507 507 } 508 508 509 - static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 509 + static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 510 510 { 511 511 u8 ev, cv; 512 512 unsigned len, num; ··· 587 587 return -EINVAL; 588 588 } 589 589 590 - static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 590 + static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 591 591 { 592 592 struct ceph_pg_pool_info *pi; 593 593 u32 num, len; ··· 633 633 rb_erase(&pg->node, &map->pg_temp); 634 634 kfree(pg); 635 635 } 636 + while (!RB_EMPTY_ROOT(&map->primary_temp)) { 637 + struct ceph_pg_mapping *pg = 638 + rb_entry(rb_first(&map->primary_temp), 639 + struct ceph_pg_mapping, node); 640 + rb_erase(&pg->node, &map->primary_temp); 641 + kfree(pg); 642 + } 636 643 while (!RB_EMPTY_ROOT(&map->pg_pools)) { 637 644 struct ceph_pg_pool_info *pi = 638 645 rb_entry(rb_first(&map->pg_pools), ··· 649 642 kfree(map->osd_state); 650 643 kfree(map->osd_weight); 651 644 kfree(map->osd_addr); 645 + kfree(map->osd_primary_affinity); 652 646 kfree(map); 653 647 } 654 648 655 649 /* 656 - * adjust max osd value. reallocate arrays. 650 + * Adjust max_osd value, (re)allocate arrays. 651 + * 652 + * The new elements are properly initialized. 657 653 */ 658 654 static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 659 655 { 660 656 u8 *state; 661 - struct ceph_entity_addr *addr; 662 657 u32 *weight; 658 + struct ceph_entity_addr *addr; 659 + int i; 663 660 664 - state = kcalloc(max, sizeof(*state), GFP_NOFS); 665 - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 666 - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 667 - if (state == NULL || addr == NULL || weight == NULL) { 661 + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); 662 + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); 663 + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); 664 + if (!state || !weight || !addr) { 668 665 kfree(state); 669 - kfree(addr); 670 666 kfree(weight); 667 + kfree(addr); 668 + 671 669 return -ENOMEM; 672 670 } 673 671 674 - /* copy old? 
*/ 675 - if (map->osd_state) { 676 - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 677 - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 678 - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); 679 - kfree(map->osd_state); 680 - kfree(map->osd_addr); 681 - kfree(map->osd_weight); 672 + for (i = map->max_osd; i < max; i++) { 673 + state[i] = 0; 674 + weight[i] = CEPH_OSD_OUT; 675 + memset(addr + i, 0, sizeof(*addr)); 682 676 } 683 677 684 678 map->osd_state = state; 685 679 map->osd_weight = weight; 686 680 map->osd_addr = addr; 681 + 682 + if (map->osd_primary_affinity) { 683 + u32 *affinity; 684 + 685 + affinity = krealloc(map->osd_primary_affinity, 686 + max*sizeof(*affinity), GFP_NOFS); 687 + if (!affinity) 688 + return -ENOMEM; 689 + 690 + for (i = map->max_osd; i < max; i++) 691 + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 692 + 693 + map->osd_primary_affinity = affinity; 694 + } 695 + 687 696 map->max_osd = max; 697 + 688 698 return 0; 699 + } 700 + 701 + #define OSDMAP_WRAPPER_COMPAT_VER 7 702 + #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 703 + 704 + /* 705 + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, 706 + * to struct_v of the client_data section for new (v7 and above) 707 + * osdmaps. 708 + */ 709 + static int get_osdmap_client_data_v(void **p, void *end, 710 + const char *prefix, u8 *v) 711 + { 712 + u8 struct_v; 713 + 714 + ceph_decode_8_safe(p, end, struct_v, e_inval); 715 + if (struct_v >= 7) { 716 + u8 struct_compat; 717 + 718 + ceph_decode_8_safe(p, end, struct_compat, e_inval); 719 + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { 720 + pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", 721 + struct_v, struct_compat, 722 + OSDMAP_WRAPPER_COMPAT_VER, prefix); 723 + return -EINVAL; 724 + } 725 + *p += 4; /* ignore wrapper struct_len */ 726 + 727 + ceph_decode_8_safe(p, end, struct_v, e_inval); 728 + ceph_decode_8_safe(p, end, struct_compat, e_inval); 729 + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { 730 + pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", 731 + struct_v, struct_compat, 732 + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); 733 + return -EINVAL; 734 + } 735 + *p += 4; /* ignore client data struct_len */ 736 + } else { 737 + u16 version; 738 + 739 + *p -= 1; 740 + ceph_decode_16_safe(p, end, version, e_inval); 741 + if (version < 6) { 742 + pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, 743 + prefix); 744 + return -EINVAL; 745 + } 746 + 747 + /* old osdmap enconding */ 748 + struct_v = 0; 749 + } 750 + 751 + *v = struct_v; 752 + return 0; 753 + 754 + e_inval: 755 + return -EINVAL; 756 + } 757 + 758 + static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, 759 + bool incremental) 760 + { 761 + u32 n; 762 + 763 + ceph_decode_32_safe(p, end, n, e_inval); 764 + while (n--) { 765 + struct ceph_pg_pool_info *pi; 766 + u64 pool; 767 + int ret; 768 + 769 + ceph_decode_64_safe(p, end, pool, e_inval); 770 + 771 + pi = __lookup_pg_pool(&map->pg_pools, pool); 772 + if (!incremental || !pi) { 773 + pi = kzalloc(sizeof(*pi), GFP_NOFS); 774 + if (!pi) 775 + return -ENOMEM; 776 + 777 + pi->id = pool; 778 + 779 + ret = __insert_pg_pool(&map->pg_pools, pi); 780 + if (ret) { 781 + kfree(pi); 782 + return ret; 783 + } 784 + } 785 + 786 + ret = decode_pool(p, end, pi); 787 + if (ret) 788 + return ret; 789 + } 790 + 791 + return 0; 792 + 793 + e_inval: 794 + return -EINVAL; 795 + } 796 + 797 + static int decode_pools(void **p, void *end, struct ceph_osdmap *map) 798 + { 799 + 
return __decode_pools(p, end, map, false); 800 + } 801 + 802 + static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) 803 + { 804 + return __decode_pools(p, end, map, true); 805 + } 806 + 807 + static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, 808 + bool incremental) 809 + { 810 + u32 n; 811 + 812 + ceph_decode_32_safe(p, end, n, e_inval); 813 + while (n--) { 814 + struct ceph_pg pgid; 815 + u32 len, i; 816 + int ret; 817 + 818 + ret = ceph_decode_pgid(p, end, &pgid); 819 + if (ret) 820 + return ret; 821 + 822 + ceph_decode_32_safe(p, end, len, e_inval); 823 + 824 + ret = __remove_pg_mapping(&map->pg_temp, pgid); 825 + BUG_ON(!incremental && ret != -ENOENT); 826 + 827 + if (!incremental || len > 0) { 828 + struct ceph_pg_mapping *pg; 829 + 830 + ceph_decode_need(p, end, len*sizeof(u32), e_inval); 831 + 832 + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 833 + return -EINVAL; 834 + 835 + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); 836 + if (!pg) 837 + return -ENOMEM; 838 + 839 + pg->pgid = pgid; 840 + pg->pg_temp.len = len; 841 + for (i = 0; i < len; i++) 842 + pg->pg_temp.osds[i] = ceph_decode_32(p); 843 + 844 + ret = __insert_pg_mapping(pg, &map->pg_temp); 845 + if (ret) { 846 + kfree(pg); 847 + return ret; 848 + } 849 + } 850 + } 851 + 852 + return 0; 853 + 854 + e_inval: 855 + return -EINVAL; 856 + } 857 + 858 + static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) 859 + { 860 + return __decode_pg_temp(p, end, map, false); 861 + } 862 + 863 + static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) 864 + { 865 + return __decode_pg_temp(p, end, map, true); 866 + } 867 + 868 + static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, 869 + bool incremental) 870 + { 871 + u32 n; 872 + 873 + ceph_decode_32_safe(p, end, n, e_inval); 874 + while (n--) { 875 + struct ceph_pg pgid; 876 + u32 osd; 877 + int ret; 878 + 879 + ret = ceph_decode_pgid(p, end, &pgid); 880 + if (ret) 881 + return ret; 882 + 883 + ceph_decode_32_safe(p, end, osd, e_inval); 884 + 885 + ret = __remove_pg_mapping(&map->primary_temp, pgid); 886 + BUG_ON(!incremental && ret != -ENOENT); 887 + 888 + if (!incremental || osd != (u32)-1) { 889 + struct ceph_pg_mapping *pg; 890 + 891 + pg = kzalloc(sizeof(*pg), GFP_NOFS); 892 + if (!pg) 893 + return -ENOMEM; 894 + 895 + pg->pgid = pgid; 896 + pg->primary_temp.osd = osd; 897 + 898 + ret = __insert_pg_mapping(pg, &map->primary_temp); 899 + if (ret) { 900 + kfree(pg); 901 + return ret; 902 + } 903 + } 904 + } 905 + 906 + return 0; 907 + 908 + e_inval: 909 + return -EINVAL; 910 + } 911 + 912 + static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) 913 + { 914 + return __decode_primary_temp(p, end, map, false); 915 + } 916 + 917 + static int decode_new_primary_temp(void **p, void *end, 918 + struct ceph_osdmap *map) 919 + { 920 + return __decode_primary_temp(p, end, map, true); 921 + } 922 + 923 + u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) 924 + { 925 + BUG_ON(osd >= map->max_osd); 926 + 927 + if (!map->osd_primary_affinity) 928 + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 929 + 930 + return map->osd_primary_affinity[osd]; 931 + } 932 + 933 + static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) 934 + { 935 + BUG_ON(osd >= map->max_osd); 936 + 937 + if (!map->osd_primary_affinity) { 938 + int i; 939 + 940 + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), 941 + GFP_NOFS); 942 + if 
(!map->osd_primary_affinity) 943 + return -ENOMEM; 944 + 945 + for (i = 0; i < map->max_osd; i++) 946 + map->osd_primary_affinity[i] = 947 + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 948 + } 949 + 950 + map->osd_primary_affinity[osd] = aff; 951 + 952 + return 0; 953 + } 954 + 955 + static int decode_primary_affinity(void **p, void *end, 956 + struct ceph_osdmap *map) 957 + { 958 + u32 len, i; 959 + 960 + ceph_decode_32_safe(p, end, len, e_inval); 961 + if (len == 0) { 962 + kfree(map->osd_primary_affinity); 963 + map->osd_primary_affinity = NULL; 964 + return 0; 965 + } 966 + if (len != map->max_osd) 967 + goto e_inval; 968 + 969 + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); 970 + 971 + for (i = 0; i < map->max_osd; i++) { 972 + int ret; 973 + 974 + ret = set_primary_affinity(map, i, ceph_decode_32(p)); 975 + if (ret) 976 + return ret; 977 + } 978 + 979 + return 0; 980 + 981 + e_inval: 982 + return -EINVAL; 983 + } 984 + 985 + static int decode_new_primary_affinity(void **p, void *end, 986 + struct ceph_osdmap *map) 987 + { 988 + u32 n; 989 + 990 + ceph_decode_32_safe(p, end, n, e_inval); 991 + while (n--) { 992 + u32 osd, aff; 993 + int ret; 994 + 995 + ceph_decode_32_safe(p, end, osd, e_inval); 996 + ceph_decode_32_safe(p, end, aff, e_inval); 997 + 998 + ret = set_primary_affinity(map, osd, aff); 999 + if (ret) 1000 + return ret; 1001 + 1002 + pr_info("osd%d primary-affinity 0x%x\n", osd, aff); 1003 + } 1004 + 1005 + return 0; 1006 + 1007 + e_inval: 1008 + return -EINVAL; 689 1009 } 690 1010 691 1011 /* 692 1012 * decode a full map. 693 1013 */ 694 - struct ceph_osdmap *osdmap_decode(void **p, void *end) 1014 + static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) 695 1015 { 696 - struct ceph_osdmap *map; 697 - u16 version; 698 - u32 len, max, i; 699 - int err = -EINVAL; 1016 + u8 struct_v; 1017 + u32 epoch = 0; 700 1018 void *start = *p; 701 - struct ceph_pg_pool_info *pi; 1019 + u32 max; 1020 + u32 len, i; 1021 + int err; 702 1022 703 - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 1023 + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); 704 1024 705 - map = kzalloc(sizeof(*map), GFP_NOFS); 706 - if (map == NULL) 707 - return ERR_PTR(-ENOMEM); 708 - map->pg_temp = RB_ROOT; 709 - 710 - ceph_decode_16_safe(p, end, version, bad); 711 - if (version > 6) { 712 - pr_warning("got unknown v %d > 6 of osdmap\n", version); 1025 + err = get_osdmap_client_data_v(p, end, "full", &struct_v); 1026 + if (err) 713 1027 goto bad; 714 - } 715 - if (version < 6) { 716 - pr_warning("got old v %d < 6 of osdmap\n", version); 717 - goto bad; 718 - } 719 1028 720 - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); 1029 + /* fsid, epoch, created, modified */ 1030 + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + 1031 + sizeof(map->created) + sizeof(map->modified), e_inval); 721 1032 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); 722 - map->epoch = ceph_decode_32(p); 1033 + epoch = map->epoch = ceph_decode_32(p); 723 1034 ceph_decode_copy(p, &map->created, sizeof(map->created)); 724 1035 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 725 1036 726 - ceph_decode_32_safe(p, end, max, bad); 727 - while (max--) { 728 - ceph_decode_need(p, end, 8 + 2, bad); 729 - err = -ENOMEM; 730 - pi = kzalloc(sizeof(*pi), GFP_NOFS); 731 - if (!pi) 732 - goto bad; 733 - pi->id = ceph_decode_64(p); 734 - err = __decode_pool(p, end, pi); 735 - if (err < 0) { 736 - kfree(pi); 737 - goto bad; 738 - } 739 - 
__insert_pg_pool(&map->pg_pools, pi); 740 - } 741 - 742 - err = __decode_pool_names(p, end, map); 743 - if (err < 0) { 744 - dout("fail to decode pool names"); 1037 + /* pools */ 1038 + err = decode_pools(p, end, map); 1039 + if (err) 745 1040 goto bad; 746 - } 747 1041 748 - ceph_decode_32_safe(p, end, map->pool_max, bad); 1042 + /* pool_name */ 1043 + err = decode_pool_names(p, end, map); 1044 + if (err) 1045 + goto bad; 749 1046 750 - ceph_decode_32_safe(p, end, map->flags, bad); 1047 + ceph_decode_32_safe(p, end, map->pool_max, e_inval); 751 1048 752 - max = ceph_decode_32(p); 1049 + ceph_decode_32_safe(p, end, map->flags, e_inval); 1050 + 1051 + /* max_osd */ 1052 + ceph_decode_32_safe(p, end, max, e_inval); 753 1053 754 1054 /* (re)alloc osd arrays */ 755 1055 err = osdmap_set_max_osd(map, max); 756 - if (err < 0) 1056 + if (err) 757 1057 goto bad; 758 - dout("osdmap_decode max_osd = %d\n", map->max_osd); 759 1058 760 - /* osds */ 761 - err = -EINVAL; 1059 + /* osd_state, osd_weight, osd_addrs->client_addr */ 762 1060 ceph_decode_need(p, end, 3*sizeof(u32) + 763 1061 map->max_osd*(1 + sizeof(*map->osd_weight) + 764 - sizeof(*map->osd_addr)), bad); 765 - *p += 4; /* skip length field (should match max) */ 1062 + sizeof(*map->osd_addr)), e_inval); 1063 + 1064 + if (ceph_decode_32(p) != map->max_osd) 1065 + goto e_inval; 1066 + 766 1067 ceph_decode_copy(p, map->osd_state, map->max_osd); 767 1068 768 - *p += 4; /* skip length field (should match max) */ 1069 + if (ceph_decode_32(p) != map->max_osd) 1070 + goto e_inval; 1071 + 769 1072 for (i = 0; i < map->max_osd; i++) 770 1073 map->osd_weight[i] = ceph_decode_32(p); 771 1074 772 - *p += 4; /* skip length field (should match max) */ 1075 + if (ceph_decode_32(p) != map->max_osd) 1076 + goto e_inval; 1077 + 773 1078 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); 774 1079 for (i = 0; i < map->max_osd; i++) 775 1080 ceph_decode_addr(&map->osd_addr[i]); 776 1081 777 1082 /* pg_temp */ 778 - ceph_decode_32_safe(p, end, len, bad); 779 - for (i = 0; i < len; i++) { 780 - int n, j; 781 - struct ceph_pg pgid; 782 - struct ceph_pg_mapping *pg; 1083 + err = decode_pg_temp(p, end, map); 1084 + if (err) 1085 + goto bad; 783 1086 784 - err = ceph_decode_pgid(p, end, &pgid); 1087 + /* primary_temp */ 1088 + if (struct_v >= 1) { 1089 + err = decode_primary_temp(p, end, map); 785 1090 if (err) 786 1091 goto bad; 787 - ceph_decode_need(p, end, sizeof(u32), bad); 788 - n = ceph_decode_32(p); 789 - err = -EINVAL; 790 - if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 791 - goto bad; 792 - ceph_decode_need(p, end, n * sizeof(u32), bad); 793 - err = -ENOMEM; 794 - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); 795 - if (!pg) 796 - goto bad; 797 - pg->pgid = pgid; 798 - pg->len = n; 799 - for (j = 0; j < n; j++) 800 - pg->osds[j] = ceph_decode_32(p); 1092 + } 801 1093 802 - err = __insert_pg_mapping(pg, &map->pg_temp); 1094 + /* primary_affinity */ 1095 + if (struct_v >= 2) { 1096 + err = decode_primary_affinity(p, end, map); 803 1097 if (err) 804 1098 goto bad; 805 - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, 806 - len); 1099 + } else { 1100 + /* XXX can this happen? 
*/ 1101 + kfree(map->osd_primary_affinity); 1102 + map->osd_primary_affinity = NULL; 807 1103 } 808 1104 809 1105 /* crush */ 810 - ceph_decode_32_safe(p, end, len, bad); 811 - dout("osdmap_decode crush len %d from off 0x%x\n", len, 812 - (int)(*p - start)); 813 - ceph_decode_need(p, end, len, bad); 814 - map->crush = crush_decode(*p, end); 815 - *p += len; 1106 + ceph_decode_32_safe(p, end, len, e_inval); 1107 + map->crush = crush_decode(*p, min(*p + len, end)); 816 1108 if (IS_ERR(map->crush)) { 817 1109 err = PTR_ERR(map->crush); 818 1110 map->crush = NULL; 819 1111 goto bad; 820 1112 } 1113 + *p += len; 821 1114 822 - /* ignore the rest of the map */ 1115 + /* ignore the rest */ 823 1116 *p = end; 824 1117 825 - dout("osdmap_decode done %p %p\n", *p, end); 826 - return map; 1118 + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); 1119 + return 0; 827 1120 1121 + e_inval: 1122 + err = -EINVAL; 828 1123 bad: 829 - dout("osdmap_decode fail err %d\n", err); 830 - ceph_osdmap_destroy(map); 831 - return ERR_PTR(err); 1124 + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", 1125 + err, epoch, (int)(*p - start), *p, start, end); 1126 + print_hex_dump(KERN_DEBUG, "osdmap: ", 1127 + DUMP_PREFIX_OFFSET, 16, 1, 1128 + start, end - start, true); 1129 + return err; 1130 + } 1131 + 1132 + /* 1133 + * Allocate and decode a full map. 1134 + */ 1135 + struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) 1136 + { 1137 + struct ceph_osdmap *map; 1138 + int ret; 1139 + 1140 + map = kzalloc(sizeof(*map), GFP_NOFS); 1141 + if (!map) 1142 + return ERR_PTR(-ENOMEM); 1143 + 1144 + map->pg_temp = RB_ROOT; 1145 + map->primary_temp = RB_ROOT; 1146 + mutex_init(&map->crush_scratch_mutex); 1147 + 1148 + ret = osdmap_decode(p, end, map); 1149 + if (ret) { 1150 + ceph_osdmap_destroy(map); 1151 + return ERR_PTR(ret); 1152 + } 1153 + 1154 + return map; 832 1155 } 833 1156 834 1157 /* ··· 1177 840 __s64 new_pool_max; 1178 841 __s32 new_flags, max; 1179 842 void *start = *p; 1180 - int err = -EINVAL; 1181 - u16 version; 843 + int err; 844 + u8 struct_v; 1182 845 1183 - ceph_decode_16_safe(p, end, version, bad); 1184 - if (version != 6) { 1185 - pr_warning("got unknown v %d != 6 of inc osdmap\n", version); 846 + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); 847 + 848 + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); 849 + if (err) 1186 850 goto bad; 1187 - } 1188 851 1189 - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), 1190 - bad); 852 + /* fsid, epoch, modified, new_pool_max, new_flags */ 853 + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + 854 + sizeof(u64) + sizeof(u32), e_inval); 1191 855 ceph_decode_copy(p, &fsid, sizeof(fsid)); 1192 856 epoch = ceph_decode_32(p); 1193 857 BUG_ON(epoch != map->epoch+1); ··· 1197 859 new_flags = ceph_decode_32(p); 1198 860 1199 861 /* full map? */ 1200 - ceph_decode_32_safe(p, end, len, bad); 862 + ceph_decode_32_safe(p, end, len, e_inval); 1201 863 if (len > 0) { 1202 864 dout("apply_incremental full map len %d, %p to %p\n", 1203 865 len, *p, end); 1204 - return osdmap_decode(p, min(*p+len, end)); 866 + return ceph_osdmap_decode(p, min(*p+len, end)); 1205 867 } 1206 868 1207 869 /* new crush? 
*/ 1208 - ceph_decode_32_safe(p, end, len, bad); 870 + ceph_decode_32_safe(p, end, len, e_inval); 1209 871 if (len > 0) { 1210 - dout("apply_incremental new crush map len %d, %p to %p\n", 1211 - len, *p, end); 1212 872 newcrush = crush_decode(*p, min(*p+len, end)); 1213 - if (IS_ERR(newcrush)) 1214 - return ERR_CAST(newcrush); 873 + if (IS_ERR(newcrush)) { 874 + err = PTR_ERR(newcrush); 875 + newcrush = NULL; 876 + goto bad; 877 + } 1215 878 *p += len; 1216 879 } 1217 880 ··· 1222 883 if (new_pool_max >= 0) 1223 884 map->pool_max = new_pool_max; 1224 885 1225 - ceph_decode_need(p, end, 5*sizeof(u32), bad); 1226 - 1227 886 /* new max? */ 1228 - max = ceph_decode_32(p); 887 + ceph_decode_32_safe(p, end, max, e_inval); 1229 888 if (max >= 0) { 1230 889 err = osdmap_set_max_osd(map, max); 1231 - if (err < 0) 890 + if (err) 1232 891 goto bad; 1233 892 } 1234 893 ··· 1239 902 newcrush = NULL; 1240 903 } 1241 904 1242 - /* new_pool */ 1243 - ceph_decode_32_safe(p, end, len, bad); 1244 - while (len--) { 1245 - struct ceph_pg_pool_info *pi; 905 + /* new_pools */ 906 + err = decode_new_pools(p, end, map); 907 + if (err) 908 + goto bad; 1246 909 1247 - ceph_decode_64_safe(p, end, pool, bad); 1248 - pi = __lookup_pg_pool(&map->pg_pools, pool); 1249 - if (!pi) { 1250 - pi = kzalloc(sizeof(*pi), GFP_NOFS); 1251 - if (!pi) { 1252 - err = -ENOMEM; 1253 - goto bad; 1254 - } 1255 - pi->id = pool; 1256 - __insert_pg_pool(&map->pg_pools, pi); 1257 - } 1258 - err = __decode_pool(p, end, pi); 1259 - if (err < 0) 1260 - goto bad; 1261 - } 1262 - if (version >= 5) { 1263 - err = __decode_pool_names(p, end, map); 1264 - if (err < 0) 1265 - goto bad; 1266 - } 910 + /* new_pool_names */ 911 + err = decode_pool_names(p, end, map); 912 + if (err) 913 + goto bad; 1267 914 1268 915 /* old_pool */ 1269 - ceph_decode_32_safe(p, end, len, bad); 916 + ceph_decode_32_safe(p, end, len, e_inval); 1270 917 while (len--) { 1271 918 struct ceph_pg_pool_info *pi; 1272 919 1273 - ceph_decode_64_safe(p, end, pool, bad); 920 + ceph_decode_64_safe(p, end, pool, e_inval); 1274 921 pi = __lookup_pg_pool(&map->pg_pools, pool); 1275 922 if (pi) 1276 923 __remove_pg_pool(&map->pg_pools, pi); 1277 924 } 1278 925 1279 926 /* new_up */ 1280 - err = -EINVAL; 1281 - ceph_decode_32_safe(p, end, len, bad); 927 + ceph_decode_32_safe(p, end, len, e_inval); 1282 928 while (len--) { 1283 929 u32 osd; 1284 930 struct ceph_entity_addr addr; 1285 - ceph_decode_32_safe(p, end, osd, bad); 1286 - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); 931 + ceph_decode_32_safe(p, end, osd, e_inval); 932 + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); 1287 933 ceph_decode_addr(&addr); 1288 934 pr_info("osd%d up\n", osd); 1289 935 BUG_ON(osd >= map->max_osd); ··· 1275 955 } 1276 956 1277 957 /* new_state */ 1278 - ceph_decode_32_safe(p, end, len, bad); 958 + ceph_decode_32_safe(p, end, len, e_inval); 1279 959 while (len--) { 1280 960 u32 osd; 1281 961 u8 xorstate; 1282 - ceph_decode_32_safe(p, end, osd, bad); 962 + ceph_decode_32_safe(p, end, osd, e_inval); 1283 963 xorstate = **(u8 **)p; 1284 964 (*p)++; /* clean flag */ 1285 965 if (xorstate == 0) ··· 1291 971 } 1292 972 1293 973 /* new_weight */ 1294 - ceph_decode_32_safe(p, end, len, bad); 974 + ceph_decode_32_safe(p, end, len, e_inval); 1295 975 while (len--) { 1296 976 u32 osd, off; 1297 - ceph_decode_need(p, end, sizeof(u32)*2, bad); 977 + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); 1298 978 osd = ceph_decode_32(p); 1299 979 off = ceph_decode_32(p); 1300 980 pr_info("osd%d weight 
0x%x %s\n", osd, off, ··· 1305 985 } 1306 986 1307 987 /* new_pg_temp */ 1308 - ceph_decode_32_safe(p, end, len, bad); 1309 - while (len--) { 1310 - struct ceph_pg_mapping *pg; 1311 - int j; 1312 - struct ceph_pg pgid; 1313 - u32 pglen; 988 + err = decode_new_pg_temp(p, end, map); 989 + if (err) 990 + goto bad; 1314 991 1315 - err = ceph_decode_pgid(p, end, &pgid); 992 + /* new_primary_temp */ 993 + if (struct_v >= 1) { 994 + err = decode_new_primary_temp(p, end, map); 1316 995 if (err) 1317 996 goto bad; 1318 - ceph_decode_need(p, end, sizeof(u32), bad); 1319 - pglen = ceph_decode_32(p); 1320 - if (pglen) { 1321 - ceph_decode_need(p, end, pglen*sizeof(u32), bad); 997 + } 1322 998 1323 - /* removing existing (if any) */ 1324 - (void) __remove_pg_mapping(&map->pg_temp, pgid); 1325 - 1326 - /* insert */ 1327 - err = -EINVAL; 1328 - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 1329 - goto bad; 1330 - err = -ENOMEM; 1331 - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); 1332 - if (!pg) 1333 - goto bad; 1334 - pg->pgid = pgid; 1335 - pg->len = pglen; 1336 - for (j = 0; j < pglen; j++) 1337 - pg->osds[j] = ceph_decode_32(p); 1338 - err = __insert_pg_mapping(pg, &map->pg_temp); 1339 - if (err) { 1340 - kfree(pg); 1341 - goto bad; 1342 - } 1343 - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, 1344 - pgid.seed, pglen); 1345 - } else { 1346 - /* remove */ 1347 - __remove_pg_mapping(&map->pg_temp, pgid); 1348 - } 999 + /* new_primary_affinity */ 1000 + if (struct_v >= 2) { 1001 + err = decode_new_primary_affinity(p, end, map); 1002 + if (err) 1003 + goto bad; 1349 1004 } 1350 1005 1351 1006 /* ignore the rest */ 1352 1007 *p = end; 1008 + 1009 + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); 1353 1010 return map; 1354 1011 1012 + e_inval: 1013 + err = -EINVAL; 1355 1014 bad: 1356 - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", 1357 - epoch, (int)(*p - start), *p, start, end); 1015 + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", 1016 + err, epoch, (int)(*p - start), *p, start, end); 1358 1017 print_hex_dump(KERN_DEBUG, "osdmap: ", 1359 1018 DUMP_PREFIX_OFFSET, 16, 1, 1360 1019 start, end - start, true); ··· 1441 1142 } 1442 1143 EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1443 1144 1444 - static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, 1445 - int *result, int result_max, 1446 - const __u32 *weight, int weight_max) 1145 + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1146 + int *result, int result_max, 1147 + const __u32 *weight, int weight_max) 1447 1148 { 1448 - int scratch[result_max * 3]; 1149 + int r; 1449 1150 1450 - return crush_do_rule(map, ruleno, x, result, result_max, 1451 - weight, weight_max, scratch); 1151 + BUG_ON(result_max > CEPH_PG_MAX_SIZE); 1152 + 1153 + mutex_lock(&map->crush_scratch_mutex); 1154 + r = crush_do_rule(map->crush, ruleno, x, result, result_max, 1155 + weight, weight_max, map->crush_scratch_ary); 1156 + mutex_unlock(&map->crush_scratch_mutex); 1157 + 1158 + return r; 1452 1159 } 1453 1160 1454 1161 /* 1455 - * Calculate raw osd vector for the given pgid. Return pointer to osd 1456 - * array, or NULL on failure. 1162 + * Calculate raw (crush) set for given pgid. 1163 + * 1164 + * Return raw set length, or error. 
1457 1165 */ 1458 - static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1459 - int *osds, int *num) 1166 + static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1167 + struct ceph_pg_pool_info *pool, 1168 + struct ceph_pg pgid, u32 pps, int *osds) 1460 1169 { 1461 - struct ceph_pg_mapping *pg; 1462 - struct ceph_pg_pool_info *pool; 1463 1170 int ruleno; 1464 - int r; 1465 - u32 pps; 1466 - 1467 - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1468 - if (!pool) 1469 - return NULL; 1470 - 1471 - /* pg_temp? */ 1472 - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1473 - pool->pg_num_mask); 1474 - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1475 - if (pg) { 1476 - *num = pg->len; 1477 - return pg->osds; 1478 - } 1171 + int len; 1479 1172 1480 1173 /* crush */ 1481 1174 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1482 1175 pool->type, pool->size); 1483 1176 if (ruleno < 0) { 1484 - pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", 1177 + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1485 1178 pgid.pool, pool->crush_ruleset, pool->type, 1486 1179 pool->size); 1487 - return NULL; 1180 + return -ENOENT; 1181 + } 1182 + 1183 + len = do_crush(osdmap, ruleno, pps, osds, 1184 + min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1185 + osdmap->osd_weight, osdmap->max_osd); 1186 + if (len < 0) { 1187 + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1188 + len, ruleno, pgid.pool, pool->crush_ruleset, 1189 + pool->type, pool->size); 1190 + return len; 1191 + } 1192 + 1193 + return len; 1194 + } 1195 + 1196 + /* 1197 + * Given raw set, calculate up set and up primary. 1198 + * 1199 + * Return up set length. *primary is set to up primary osd id, or -1 1200 + * if up set is empty. 1201 + */ 1202 + static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1203 + struct ceph_pg_pool_info *pool, 1204 + int *osds, int len, int *primary) 1205 + { 1206 + int up_primary = -1; 1207 + int i; 1208 + 1209 + if (ceph_can_shift_osds(pool)) { 1210 + int removed = 0; 1211 + 1212 + for (i = 0; i < len; i++) { 1213 + if (ceph_osd_is_down(osdmap, osds[i])) { 1214 + removed++; 1215 + continue; 1216 + } 1217 + if (removed) 1218 + osds[i - removed] = osds[i]; 1219 + } 1220 + 1221 + len -= removed; 1222 + if (len > 0) 1223 + up_primary = osds[0]; 1224 + } else { 1225 + for (i = len - 1; i >= 0; i--) { 1226 + if (ceph_osd_is_down(osdmap, osds[i])) 1227 + osds[i] = CRUSH_ITEM_NONE; 1228 + else 1229 + up_primary = osds[i]; 1230 + } 1231 + } 1232 + 1233 + *primary = up_primary; 1234 + return len; 1235 + } 1236 + 1237 + static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1238 + struct ceph_pg_pool_info *pool, 1239 + int *osds, int len, int *primary) 1240 + { 1241 + int i; 1242 + int pos = -1; 1243 + 1244 + /* 1245 + * Do we have any non-default primary_affinity values for these 1246 + * osds? 1247 + */ 1248 + if (!osdmap->osd_primary_affinity) 1249 + return; 1250 + 1251 + for (i = 0; i < len; i++) { 1252 + if (osds[i] != CRUSH_ITEM_NONE && 1253 + osdmap->osd_primary_affinity[i] != 1254 + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { 1255 + break; 1256 + } 1257 + } 1258 + if (i == len) 1259 + return; 1260 + 1261 + /* 1262 + * Pick the primary. Feed both the seed (for the pg) and the 1263 + * osd into the hash/rng so that a proportional fraction of an 1264 + * osd's pgs get rejected as primary. 
1265 + */ 1266 + for (i = 0; i < len; i++) { 1267 + int osd; 1268 + u32 aff; 1269 + 1270 + osd = osds[i]; 1271 + if (osd == CRUSH_ITEM_NONE) 1272 + continue; 1273 + 1274 + aff = osdmap->osd_primary_affinity[osd]; 1275 + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && 1276 + (crush_hash32_2(CRUSH_HASH_RJENKINS1, 1277 + pps, osd) >> 16) >= aff) { 1278 + /* 1279 + * We chose not to use this primary. Note it 1280 + * anyway as a fallback in case we don't pick 1281 + * anyone else, but keep looking. 1282 + */ 1283 + if (pos < 0) 1284 + pos = i; 1285 + } else { 1286 + pos = i; 1287 + break; 1288 + } 1289 + } 1290 + if (pos < 0) 1291 + return; 1292 + 1293 + *primary = osds[pos]; 1294 + 1295 + if (ceph_can_shift_osds(pool) && pos > 0) { 1296 + /* move the new primary to the front */ 1297 + for (i = pos; i > 0; i--) 1298 + osds[i] = osds[i - 1]; 1299 + osds[0] = *primary; 1300 + } 1301 + } 1302 + 1303 + /* 1304 + * Given up set, apply pg_temp and primary_temp mappings. 1305 + * 1306 + * Return acting set length. *primary is set to acting primary osd id, 1307 + * or -1 if acting set is empty. 1308 + */ 1309 + static int apply_temps(struct ceph_osdmap *osdmap, 1310 + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, 1311 + int *osds, int len, int *primary) 1312 + { 1313 + struct ceph_pg_mapping *pg; 1314 + int temp_len; 1315 + int temp_primary; 1316 + int i; 1317 + 1318 + /* raw_pg -> pg */ 1319 + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1320 + pool->pg_num_mask); 1321 + 1322 + /* pg_temp? */ 1323 + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1324 + if (pg) { 1325 + temp_len = 0; 1326 + temp_primary = -1; 1327 + 1328 + for (i = 0; i < pg->pg_temp.len; i++) { 1329 + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 1330 + if (ceph_can_shift_osds(pool)) 1331 + continue; 1332 + else 1333 + osds[temp_len++] = CRUSH_ITEM_NONE; 1334 + } else { 1335 + osds[temp_len++] = pg->pg_temp.osds[i]; 1336 + } 1337 + } 1338 + 1339 + /* apply pg_temp's primary */ 1340 + for (i = 0; i < temp_len; i++) { 1341 + if (osds[i] != CRUSH_ITEM_NONE) { 1342 + temp_primary = osds[i]; 1343 + break; 1344 + } 1345 + } 1346 + } else { 1347 + temp_len = len; 1348 + temp_primary = *primary; 1349 + } 1350 + 1351 + /* primary_temp? */ 1352 + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); 1353 + if (pg) 1354 + temp_primary = pg->primary_temp.osd; 1355 + 1356 + *primary = temp_primary; 1357 + return temp_len; 1358 + } 1359 + 1360 + /* 1361 + * Calculate acting set for given pgid. 1362 + * 1363 + * Return acting set length, or error. *primary is set to acting 1364 + * primary osd id, or -1 if acting set is empty or on error. 1365 + */ 1366 + int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1367 + int *osds, int *primary) 1368 + { 1369 + struct ceph_pg_pool_info *pool; 1370 + u32 pps; 1371 + int len; 1372 + 1373 + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1374 + if (!pool) { 1375 + *primary = -1; 1376 + return -ENOENT; 1488 1377 } 1489 1378 1490 1379 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 1491 - /* hash pool id and seed sothat pool PGs do not overlap */ 1380 + /* hash pool id and seed so that pool PGs do not overlap */ 1492 1381 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 1493 1382 ceph_stable_mod(pgid.seed, pool->pgp_num, 1494 1383 pool->pgp_num_mask), 1495 1384 pgid.pool); 1496 1385 } else { 1497 1386 /* 1498 - * legacy ehavior: add ps and pool together. this is 1387 + * legacy behavior: add ps and pool together. 
this is 1499 1388 * not a great approach because the PGs from each pool 1500 1389 * will overlap on top of each other: 0.5 == 1.4 == 1501 1390 * 2.3 == ... ··· 1692 1205 pool->pgp_num_mask) + 1693 1206 (unsigned)pgid.pool; 1694 1207 } 1695 - r = crush_do_rule_ary(osdmap->crush, ruleno, pps, 1696 - osds, min_t(int, pool->size, *num), 1697 - osdmap->osd_weight, osdmap->max_osd); 1698 - if (r < 0) { 1699 - pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1700 - " size %d\n", r, pgid.pool, pool->crush_ruleset, 1701 - pool->type, pool->size); 1702 - return NULL; 1208 + 1209 + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); 1210 + if (len < 0) { 1211 + *primary = -1; 1212 + return len; 1703 1213 } 1704 - *num = r; 1705 - return osds; 1706 - } 1707 1214 1708 - /* 1709 - * Return acting set for given pgid. 1710 - */ 1711 - int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1712 - int *acting) 1713 - { 1714 - int rawosds[CEPH_PG_MAX_SIZE], *osds; 1715 - int i, o, num = CEPH_PG_MAX_SIZE; 1215 + len = raw_to_up_osds(osdmap, pool, osds, len, primary); 1716 1216 1717 - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1718 - if (!osds) 1719 - return -1; 1217 + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); 1720 1218 1721 - /* primary is first up osd */ 1722 - o = 0; 1723 - for (i = 0; i < num; i++) 1724 - if (ceph_osd_is_up(osdmap, osds[i])) 1725 - acting[o++] = osds[i]; 1726 - return o; 1219 + len = apply_temps(osdmap, pool, pgid, osds, len, primary); 1220 + 1221 + return len; 1727 1222 } 1728 1223 1729 1224 /* ··· 1713 1244 */ 1714 1245 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1715 1246 { 1716 - int rawosds[CEPH_PG_MAX_SIZE], *osds; 1717 - int i, num = CEPH_PG_MAX_SIZE; 1247 + int osds[CEPH_PG_MAX_SIZE]; 1248 + int primary; 1718 1249 1719 - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1720 - if (!osds) 1721 - return -1; 1250 + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); 1722 1251 1723 - /* primary is first up osd */ 1724 - for (i = 0; i < num; i++) 1725 - if (ceph_osd_is_up(osdmap, osds[i])) 1726 - return osds[i]; 1727 - return -1; 1252 + return primary; 1728 1253 } 1729 1254 EXPORT_SYMBOL(ceph_calc_pg_primary);
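The rewritten acting-set path runs pg_to_raw_osds(), raw_to_up_osds(), apply_primary_affinity() and apply_temps() in sequence. The affinity step keeps CRUSH's ordering but probabilistically passes over candidates as primary: an osd with affinity aff (16.16 fixed point) is skipped when a 16-bit hash of (pps, osd) is >= aff, and skipped candidates are only kept as a fallback. A standalone sketch of just that test, with a simple mixing function standing in for crush_hash32_2():

    /* Sketch of the primary-affinity acceptance test from
     * apply_primary_affinity().  The hash below is a stand-in for
     * crush_hash32_2(CRUSH_HASH_RJENKINS1, pps, osd); only the
     * ">> 16 >= aff" comparison is the point. */
    #include <stdio.h>
    #include <stdint.h>

    #define MAX_PRIMARY_AFFINITY 0x10000u   /* 1.0 in 16.16 fixed point */

    /* stand-in mixer, NOT the rjenkins hash the kernel uses */
    static uint32_t mix(uint32_t a, uint32_t b)
    {
    	uint32_t h = a * 2654435761u ^ b * 2246822519u;

    	h ^= h >> 15;
    	h *= 2654435761u;
    	h ^= h >> 13;
    	return h;
    }

    static int reject_as_primary(uint32_t pps, int osd, uint32_t aff)
    {
    	if (aff >= MAX_PRIMARY_AFFINITY)
    		return 0;               /* full affinity: never rejected */
    	return (mix(pps, (uint32_t)osd) >> 16) >= aff;
    }

    int main(void)
    {
    	uint32_t aff = 0x4000;          /* 0.25: keep roughly 25% of pgs */
    	int osd = 3, kept = 0, pps;

    	for (pps = 0; pps < 10000; pps++)
    		if (!reject_as_primary((uint32_t)pps, osd, aff))
    			kept++;

    	printf("osd%d with affinity 0x%x stayed primary-eligible for %d/10000 pgs\n",
    	       osd, (unsigned)aff, kept);
    	return 0;
    }

For replicated pools the chosen primary is then shifted to the front of the set; for erasure-coded pools only *primary changes, since positions cannot move.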