Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull ceph updates from Sage Weil:
"This is a big batch. From Ilya we have:

- rbd support for more than ~250 mapped devices (now uses same scheme
that SCSI does for device major/minor numbering)
- crush updates for new mapping behaviors (will be needed for coming
erasure coding support, among other things)
- preliminary support for tiered storage pools

There is also a big series fixing a pile of cephfs bugs with clustered
MDSs from Yan Zheng, ACL support for cephfs from Guangliang Zhao, ceph
fscache improvements from Li Wang, improved behavior when we get
ENOSPC from Josh Durgin, some readv/writev improvements from
Majianpeng, and the usual mix of small cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (76 commits)
ceph: cast PAGE_SIZE to size_t in ceph_sync_write()
ceph: fix dout() compile warnings in ceph_filemap_fault()
libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature
libceph: follow redirect replies from osds
libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid}
libceph: follow {read,write}_tier fields on osd request submission
libceph: add ceph_pg_pool_by_id()
libceph: CEPH_OSD_FLAG_* enum update
libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg()
libceph: introduce and start using oid abstraction
libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN
libceph: move ceph_file_layout helpers to ceph_fs.h
libceph: start using oloc abstraction
libceph: dout() is missing a newline
libceph: add ceph_kv{malloc,free}() and switch to them
libceph: support CEPH_FEATURE_EXPORT_PEER
ceph: add imported caps when handling cap export message
ceph: add open export target session helper
ceph: remove exported caps when handling cap import message
ceph: handle session flush message
...

+2264 -682
+26
Documentation/ABI/testing/sysfs-bus-rbd
··· 18 18 19 19 $ echo <dev-id> > /sys/bus/rbd/remove 20 20 21 + What: /sys/bus/rbd/add_single_major 22 + Date: December 2013 23 + KernelVersion: 3.14 24 + Contact: Sage Weil <sage@inktank.com> 25 + Description: Available only if rbd module is inserted with single_major 26 + parameter set to true. 27 + Usage is the same as for /sys/bus/rbd/add. If present, 28 + should be used instead of the latter: any attempts to use 29 + /sys/bus/rbd/add if /sys/bus/rbd/add_single_major is 30 + available will fail for backwards compatibility reasons. 31 + 32 + What: /sys/bus/rbd/remove_single_major 33 + Date: December 2013 34 + KernelVersion: 3.14 35 + Contact: Sage Weil <sage@inktank.com> 36 + Description: Available only if rbd module is inserted with single_major 37 + parameter set to true. 38 + Usage is the same as for /sys/bus/rbd/remove. If present, 39 + should be used instead of the latter: any attempts to use 40 + /sys/bus/rbd/remove if /sys/bus/rbd/remove_single_major is 41 + available will fail for backwards compatibility reasons. 42 + 21 43 Entries under /sys/bus/rbd/devices/<dev-id>/ 22 44 -------------------------------------------- 23 45 ··· 54 32 major 55 33 56 34 The block device major number. 35 + 36 + minor 37 + 38 + The block device minor number. (December 2013, since 3.14.) 57 39 58 40 name 59 41
+1 -1
MAINTAINERS
··· 7075 7075 RADOS BLOCK DEVICE (RBD) 7076 7076 M: Yehuda Sadeh <yehuda@inktank.com> 7077 7077 M: Sage Weil <sage@inktank.com> 7078 - M: Alex Elder <elder@inktank.com> 7078 + M: Alex Elder <elder@kernel.org> 7079 7079 M: ceph-devel@vger.kernel.org 7080 7080 W: http://ceph.com/ 7081 7081 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+205 -100
drivers/block/rbd.c
··· 41 41 #include <linux/fs.h> 42 42 #include <linux/blkdev.h> 43 43 #include <linux/slab.h> 44 + #include <linux/idr.h> 44 45 45 46 #include "rbd_types.h" 46 47 ··· 90 89 } 91 90 92 91 #define RBD_DRV_NAME "rbd" 93 - #define RBD_DRV_NAME_LONG "rbd (rados block device)" 94 92 95 - #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 93 + #define RBD_MINORS_PER_MAJOR 256 94 + #define RBD_SINGLE_MAJOR_PART_SHIFT 4 96 95 97 96 #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 98 97 #define RBD_MAX_SNAP_NAME_LEN \ ··· 324 323 int dev_id; /* blkdev unique id */ 325 324 326 325 int major; /* blkdev assigned major */ 326 + int minor; 327 327 struct gendisk *disk; /* blkdev's gendisk and rq */ 328 328 329 329 u32 image_format; /* Either 1 or 2 */ ··· 388 386 static struct kmem_cache *rbd_obj_request_cache; 389 387 static struct kmem_cache *rbd_segment_name_cache; 390 388 389 + static int rbd_major; 390 + static DEFINE_IDA(rbd_dev_id_ida); 391 + 392 + /* 393 + * Default to false for now, as single-major requires >= 0.75 version of 394 + * userspace rbd utility. 
395 + */ 396 + static bool single_major = false; 397 + module_param(single_major, bool, S_IRUGO); 398 + MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 399 + 391 400 static int rbd_img_request_submit(struct rbd_img_request *img_request); 392 401 393 402 static void rbd_dev_device_release(struct device *dev); ··· 407 394 size_t count); 408 395 static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 409 396 size_t count); 397 + static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 398 + size_t count); 399 + static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 400 + size_t count); 410 401 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 411 402 static void rbd_spec_put(struct rbd_spec *spec); 412 403 404 + static int rbd_dev_id_to_minor(int dev_id) 405 + { 406 + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 407 + } 408 + 409 + static int minor_to_rbd_dev_id(int minor) 410 + { 411 + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 412 + } 413 + 413 414 static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 414 415 static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 416 + static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 417 + static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 415 418 416 419 static struct attribute *rbd_bus_attrs[] = { 417 420 &bus_attr_add.attr, 418 421 &bus_attr_remove.attr, 422 + &bus_attr_add_single_major.attr, 423 + &bus_attr_remove_single_major.attr, 419 424 NULL, 420 425 }; 421 - ATTRIBUTE_GROUPS(rbd_bus); 426 + 427 + static umode_t rbd_bus_is_visible(struct kobject *kobj, 428 + struct attribute *attr, int index) 429 + { 430 + if (!single_major && 431 + (attr == &bus_attr_add_single_major.attr || 432 + attr == &bus_attr_remove_single_major.attr)) 433 + return 0; 434 + 435 + return attr->mode; 436 + } 437 + 438 + static const struct attribute_group rbd_bus_group = { 439 + .attrs = 
rbd_bus_attrs, 440 + .is_visible = rbd_bus_is_visible, 441 + }; 442 + __ATTRIBUTE_GROUPS(rbd_bus); 422 443 423 444 static struct bus_type rbd_bus_type = { 424 445 .name = "rbd", ··· 1088 1041 name_format = "%s.%012llx"; 1089 1042 if (rbd_dev->image_format == 2) 1090 1043 name_format = "%s.%016llx"; 1091 - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, 1044 + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 1092 1045 rbd_dev->header.object_prefix, segment); 1093 - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 1046 + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 1094 1047 pr_err("error formatting segment name for #%llu (%d)\n", 1095 1048 segment, ret); 1096 1049 kfree(name); ··· 1808 1761 osd_req->r_callback = rbd_osd_req_callback; 1809 1762 osd_req->r_priv = obj_request; 1810 1763 1811 - osd_req->r_oid_len = strlen(obj_request->object_name); 1812 - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1813 - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1814 - 1815 - osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1764 + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 1765 + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1816 1766 1817 1767 return osd_req; 1818 1768 } ··· 1846 1802 osd_req->r_callback = rbd_osd_req_callback; 1847 1803 osd_req->r_priv = obj_request; 1848 1804 1849 - osd_req->r_oid_len = strlen(obj_request->object_name); 1850 - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1851 - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1852 - 1853 - osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1805 + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 1806 + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1854 1807 1855 1808 return osd_req; 1856 1809 } ··· 2907 2866 * Request sync osd watch/unwatch. 
The value of "start" determines 2908 2867 * whether a watch request is being initiated or torn down. 2909 2868 */ 2910 - static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2869 + static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2911 2870 { 2912 2871 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2913 2872 struct rbd_obj_request *obj_request; ··· 2980 2939 rbd_obj_request_put(obj_request); 2981 2940 2982 2941 return ret; 2942 + } 2943 + 2944 + static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 2945 + { 2946 + return __rbd_dev_header_watch_sync(rbd_dev, true); 2947 + } 2948 + 2949 + static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 2950 + { 2951 + int ret; 2952 + 2953 + ret = __rbd_dev_header_watch_sync(rbd_dev, false); 2954 + if (ret) { 2955 + rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 2956 + ret); 2957 + } 2983 2958 } 2984 2959 2985 2960 /* ··· 3445 3388 u64 segment_size; 3446 3389 3447 3390 /* create gendisk info */ 3448 - disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3391 + disk = alloc_disk(single_major ? 
3392 + (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 3393 + RBD_MINORS_PER_MAJOR); 3449 3394 if (!disk) 3450 3395 return -ENOMEM; 3451 3396 3452 3397 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3453 3398 rbd_dev->dev_id); 3454 3399 disk->major = rbd_dev->major; 3455 - disk->first_minor = 0; 3400 + disk->first_minor = rbd_dev->minor; 3401 + if (single_major) 3402 + disk->flags |= GENHD_FL_EXT_DEVT; 3456 3403 disk->fops = &rbd_bd_ops; 3457 3404 disk->private_data = rbd_dev; 3458 3405 ··· 3528 3467 return sprintf(buf, "%d\n", rbd_dev->major); 3529 3468 3530 3469 return sprintf(buf, "(none)\n"); 3470 + } 3531 3471 3472 + static ssize_t rbd_minor_show(struct device *dev, 3473 + struct device_attribute *attr, char *buf) 3474 + { 3475 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3476 + 3477 + return sprintf(buf, "%d\n", rbd_dev->minor); 3532 3478 } 3533 3479 3534 3480 static ssize_t rbd_client_id_show(struct device *dev, ··· 3657 3589 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 3658 3590 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3659 3591 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3592 + static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3660 3593 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3661 3594 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 3662 3595 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); ··· 3671 3602 &dev_attr_size.attr, 3672 3603 &dev_attr_features.attr, 3673 3604 &dev_attr_major.attr, 3605 + &dev_attr_minor.attr, 3674 3606 &dev_attr_client_id.attr, 3675 3607 &dev_attr_pool.attr, 3676 3608 &dev_attr_pool_id.attr, ··· 4442 4372 device_unregister(&rbd_dev->dev); 4443 4373 } 4444 4374 4445 - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 4446 - 4447 4375 /* 4448 4376 * Get a unique rbd identifier for the given new rbd_dev, and add 4449 - * the rbd_dev to the global list. The minimum rbd id is 1. 
4377 + * the rbd_dev to the global list. 4450 4378 */ 4451 - static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4379 + static int rbd_dev_id_get(struct rbd_device *rbd_dev) 4452 4380 { 4453 - rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4381 + int new_dev_id; 4382 + 4383 + new_dev_id = ida_simple_get(&rbd_dev_id_ida, 4384 + 0, minor_to_rbd_dev_id(1 << MINORBITS), 4385 + GFP_KERNEL); 4386 + if (new_dev_id < 0) 4387 + return new_dev_id; 4388 + 4389 + rbd_dev->dev_id = new_dev_id; 4454 4390 4455 4391 spin_lock(&rbd_dev_list_lock); 4456 4392 list_add_tail(&rbd_dev->node, &rbd_dev_list); 4457 4393 spin_unlock(&rbd_dev_list_lock); 4458 - dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4459 - (unsigned long long) rbd_dev->dev_id); 4394 + 4395 + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); 4396 + 4397 + return 0; 4460 4398 } 4461 4399 4462 4400 /* ··· 4473 4395 */ 4474 4396 static void rbd_dev_id_put(struct rbd_device *rbd_dev) 4475 4397 { 4476 - struct list_head *tmp; 4477 - int rbd_id = rbd_dev->dev_id; 4478 - int max_id; 4479 - 4480 - rbd_assert(rbd_id > 0); 4481 - 4482 - dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4483 - (unsigned long long) rbd_dev->dev_id); 4484 4398 spin_lock(&rbd_dev_list_lock); 4485 4399 list_del_init(&rbd_dev->node); 4486 - 4487 - /* 4488 - * If the id being "put" is not the current maximum, there 4489 - * is nothing special we need to do. 4490 - */ 4491 - if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4492 - spin_unlock(&rbd_dev_list_lock); 4493 - return; 4494 - } 4495 - 4496 - /* 4497 - * We need to update the current maximum id. Search the 4498 - * list to find out what it is. We're more likely to find 4499 - * the maximum at the end, so search the list backward. 
4500 - */ 4501 - max_id = 0; 4502 - list_for_each_prev(tmp, &rbd_dev_list) { 4503 - struct rbd_device *rbd_dev; 4504 - 4505 - rbd_dev = list_entry(tmp, struct rbd_device, node); 4506 - if (rbd_dev->dev_id > max_id) 4507 - max_id = rbd_dev->dev_id; 4508 - } 4509 4400 spin_unlock(&rbd_dev_list_lock); 4510 4401 4511 - /* 4512 - * The max id could have been updated by rbd_dev_id_get(), in 4513 - * which case it now accurately reflects the new maximum. 4514 - * Be careful not to overwrite the maximum value in that 4515 - * case. 4516 - */ 4517 - atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4518 - dout(" max dev id has been reset\n"); 4402 + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4403 + 4404 + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4519 4405 } 4520 4406 4521 4407 /* ··· 4902 4860 { 4903 4861 int ret; 4904 4862 4905 - /* generate unique id: find highest unique id, add one */ 4906 - rbd_dev_id_get(rbd_dev); 4863 + /* Get an id and fill in device name. */ 4907 4864 4908 - /* Fill in the device name, now that we have its id. */ 4865 + ret = rbd_dev_id_get(rbd_dev); 4866 + if (ret) 4867 + return ret; 4868 + 4909 4869 BUILD_BUG_ON(DEV_NAME_LEN 4910 4870 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 4911 4871 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 4912 4872 4913 - /* Get our block major device number. */ 4873 + /* Record our major and minor device numbers. */ 4914 4874 4915 - ret = register_blkdev(0, rbd_dev->name); 4916 - if (ret < 0) 4917 - goto err_out_id; 4918 - rbd_dev->major = ret; 4875 + if (!single_major) { 4876 + ret = register_blkdev(0, rbd_dev->name); 4877 + if (ret < 0) 4878 + goto err_out_id; 4879 + 4880 + rbd_dev->major = ret; 4881 + rbd_dev->minor = 0; 4882 + } else { 4883 + rbd_dev->major = rbd_major; 4884 + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 4885 + } 4919 4886 4920 4887 /* Set up the blkdev mapping. 
*/ 4921 4888 ··· 4956 4905 err_out_disk: 4957 4906 rbd_free_disk(rbd_dev); 4958 4907 err_out_blkdev: 4959 - unregister_blkdev(rbd_dev->major, rbd_dev->name); 4908 + if (!single_major) 4909 + unregister_blkdev(rbd_dev->major, rbd_dev->name); 4960 4910 err_out_id: 4961 4911 rbd_dev_id_put(rbd_dev); 4962 4912 rbd_dev_mapping_clear(rbd_dev); ··· 5013 4961 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 5014 4962 { 5015 4963 int ret; 5016 - int tmp; 5017 4964 5018 4965 /* 5019 4966 * Get the id from the image id object. Unless there's an ··· 5031 4980 goto err_out_format; 5032 4981 5033 4982 if (mapping) { 5034 - ret = rbd_dev_header_watch_sync(rbd_dev, true); 4983 + ret = rbd_dev_header_watch_sync(rbd_dev); 5035 4984 if (ret) 5036 4985 goto out_header_name; 5037 4986 } ··· 5058 5007 err_out_probe: 5059 5008 rbd_dev_unprobe(rbd_dev); 5060 5009 err_out_watch: 5061 - if (mapping) { 5062 - tmp = rbd_dev_header_watch_sync(rbd_dev, false); 5063 - if (tmp) 5064 - rbd_warn(rbd_dev, "unable to tear down " 5065 - "watch request (%d)\n", tmp); 5066 - } 5010 + if (mapping) 5011 + rbd_dev_header_unwatch_sync(rbd_dev); 5067 5012 out_header_name: 5068 5013 kfree(rbd_dev->header_name); 5069 5014 rbd_dev->header_name = NULL; ··· 5073 5026 return ret; 5074 5027 } 5075 5028 5076 - static ssize_t rbd_add(struct bus_type *bus, 5077 - const char *buf, 5078 - size_t count) 5029 + static ssize_t do_rbd_add(struct bus_type *bus, 5030 + const char *buf, 5031 + size_t count) 5079 5032 { 5080 5033 struct rbd_device *rbd_dev = NULL; 5081 5034 struct ceph_options *ceph_opts = NULL; ··· 5137 5090 5138 5091 rc = rbd_dev_device_setup(rbd_dev); 5139 5092 if (rc) { 5093 + /* 5094 + * rbd_dev_header_unwatch_sync() can't be moved into 5095 + * rbd_dev_image_release() without refactoring, see 5096 + * commit 1f3ef78861ac. 
5097 + */ 5098 + rbd_dev_header_unwatch_sync(rbd_dev); 5140 5099 rbd_dev_image_release(rbd_dev); 5141 5100 goto err_out_module; 5142 5101 } ··· 5163 5110 return (ssize_t)rc; 5164 5111 } 5165 5112 5113 + static ssize_t rbd_add(struct bus_type *bus, 5114 + const char *buf, 5115 + size_t count) 5116 + { 5117 + if (single_major) 5118 + return -EINVAL; 5119 + 5120 + return do_rbd_add(bus, buf, count); 5121 + } 5122 + 5123 + static ssize_t rbd_add_single_major(struct bus_type *bus, 5124 + const char *buf, 5125 + size_t count) 5126 + { 5127 + return do_rbd_add(bus, buf, count); 5128 + } 5129 + 5166 5130 static void rbd_dev_device_release(struct device *dev) 5167 5131 { 5168 5132 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); ··· 5187 5117 rbd_free_disk(rbd_dev); 5188 5118 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5189 5119 rbd_dev_mapping_clear(rbd_dev); 5190 - unregister_blkdev(rbd_dev->major, rbd_dev->name); 5191 - rbd_dev->major = 0; 5120 + if (!single_major) 5121 + unregister_blkdev(rbd_dev->major, rbd_dev->name); 5192 5122 rbd_dev_id_put(rbd_dev); 5193 5123 rbd_dev_mapping_clear(rbd_dev); 5194 5124 } ··· 5219 5149 } 5220 5150 } 5221 5151 5222 - static ssize_t rbd_remove(struct bus_type *bus, 5223 - const char *buf, 5224 - size_t count) 5152 + static ssize_t do_rbd_remove(struct bus_type *bus, 5153 + const char *buf, 5154 + size_t count) 5225 5155 { 5226 5156 struct rbd_device *rbd_dev = NULL; 5227 5157 struct list_head *tmp; ··· 5261 5191 if (ret < 0 || already) 5262 5192 return ret; 5263 5193 5264 - ret = rbd_dev_header_watch_sync(rbd_dev, false); 5265 - if (ret) 5266 - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 5267 - 5194 + rbd_dev_header_unwatch_sync(rbd_dev); 5268 5195 /* 5269 5196 * flush remaining watch callbacks - these must be complete 5270 5197 * before the osd_client is shutdown 5271 5198 */ 5272 5199 dout("%s: flushing notifies", __func__); 5273 5200 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5201 + 5274 
5202 /* 5275 5203 * Don't free anything from rbd_dev->disk until after all 5276 5204 * notifies are completely processed. Otherwise ··· 5280 5212 module_put(THIS_MODULE); 5281 5213 5282 5214 return count; 5215 + } 5216 + 5217 + static ssize_t rbd_remove(struct bus_type *bus, 5218 + const char *buf, 5219 + size_t count) 5220 + { 5221 + if (single_major) 5222 + return -EINVAL; 5223 + 5224 + return do_rbd_remove(bus, buf, count); 5225 + } 5226 + 5227 + static ssize_t rbd_remove_single_major(struct bus_type *bus, 5228 + const char *buf, 5229 + size_t count) 5230 + { 5231 + return do_rbd_remove(bus, buf, count); 5283 5232 } 5284 5233 5285 5234 /* ··· 5344 5259 5345 5260 rbd_assert(!rbd_segment_name_cache); 5346 5261 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 5347 - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); 5262 + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 5348 5263 if (rbd_segment_name_cache) 5349 5264 return 0; 5350 5265 out_err: ··· 5380 5295 5381 5296 if (!libceph_compatible(NULL)) { 5382 5297 rbd_warn(NULL, "libceph incompatibility (quitting)"); 5383 - 5384 5298 return -EINVAL; 5385 5299 } 5300 + 5386 5301 rc = rbd_slab_init(); 5387 5302 if (rc) 5388 5303 return rc; 5304 + 5305 + if (single_major) { 5306 + rbd_major = register_blkdev(0, RBD_DRV_NAME); 5307 + if (rbd_major < 0) { 5308 + rc = rbd_major; 5309 + goto err_out_slab; 5310 + } 5311 + } 5312 + 5389 5313 rc = rbd_sysfs_init(); 5390 5314 if (rc) 5391 - rbd_slab_exit(); 5392 - else 5393 - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 5315 + goto err_out_blkdev; 5394 5316 5317 + if (single_major) 5318 + pr_info("loaded (major %d)\n", rbd_major); 5319 + else 5320 + pr_info("loaded\n"); 5321 + 5322 + return 0; 5323 + 5324 + err_out_blkdev: 5325 + if (single_major) 5326 + unregister_blkdev(rbd_major, RBD_DRV_NAME); 5327 + err_out_slab: 5328 + rbd_slab_exit(); 5395 5329 return rc; 5396 5330 } 5397 5331 5398 5332 static void __exit rbd_exit(void) 5399 5333 { 5400 5334 rbd_sysfs_cleanup(); 5335 + if 
(single_major) 5336 + unregister_blkdev(rbd_major, RBD_DRV_NAME); 5401 5337 rbd_slab_exit(); 5402 5338 } 5403 5339 ··· 5428 5322 MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5429 5323 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5430 5324 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5431 - MODULE_DESCRIPTION("rados block device"); 5432 - 5433 5325 /* following authorship retained from original osdblk.c */ 5434 5326 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5435 5327 5328 + MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 5436 5329 MODULE_LICENSE("GPL");
+13
fs/ceph/Kconfig
··· 25 25 caching support for Ceph clients using FS-Cache 26 26 27 27 endif 28 + 29 + config CEPH_FS_POSIX_ACL 30 + bool "Ceph POSIX Access Control Lists" 31 + depends on CEPH_FS 32 + select FS_POSIX_ACL 33 + help 34 + POSIX Access Control Lists (ACLs) support permissions for users and 35 + groups beyond the owner/group/world scheme. 36 + 37 + To learn more about Access Control Lists, visit the POSIX ACLs for 38 + Linux website <http://acl.bestbits.at/>. 39 + 40 + If you don't know what Access Control Lists are, say N
+1
fs/ceph/Makefile
··· 10 10 debugfs.o 11 11 12 12 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 13 + ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+332
fs/ceph/acl.c
··· 1 + /* 2 + * linux/fs/ceph/acl.c 3 + * 4 + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public 8 + * License v2 as published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 + * General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public 16 + * License along with this program; if not, write to the 17 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 + * Boston, MA 021110-1307, USA. 19 + */ 20 + 21 + #include <linux/ceph/ceph_debug.h> 22 + #include <linux/fs.h> 23 + #include <linux/string.h> 24 + #include <linux/xattr.h> 25 + #include <linux/posix_acl_xattr.h> 26 + #include <linux/posix_acl.h> 27 + #include <linux/sched.h> 28 + #include <linux/slab.h> 29 + 30 + #include "super.h" 31 + 32 + static inline void ceph_set_cached_acl(struct inode *inode, 33 + int type, struct posix_acl *acl) 34 + { 35 + struct ceph_inode_info *ci = ceph_inode(inode); 36 + 37 + spin_lock(&ci->i_ceph_lock); 38 + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 39 + set_cached_acl(inode, type, acl); 40 + spin_unlock(&ci->i_ceph_lock); 41 + } 42 + 43 + static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, 44 + int type) 45 + { 46 + struct ceph_inode_info *ci = ceph_inode(inode); 47 + struct posix_acl *acl = ACL_NOT_CACHED; 48 + 49 + spin_lock(&ci->i_ceph_lock); 50 + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 51 + acl = get_cached_acl(inode, type); 52 + spin_unlock(&ci->i_ceph_lock); 53 + 54 + return acl; 55 + } 56 + 57 + void ceph_forget_all_cached_acls(struct inode *inode) 58 + { 59 + forget_all_cached_acls(inode); 60 + } 61 + 62 + 
struct posix_acl *ceph_get_acl(struct inode *inode, int type) 63 + { 64 + int size; 65 + const char *name; 66 + char *value = NULL; 67 + struct posix_acl *acl; 68 + 69 + if (!IS_POSIXACL(inode)) 70 + return NULL; 71 + 72 + acl = ceph_get_cached_acl(inode, type); 73 + if (acl != ACL_NOT_CACHED) 74 + return acl; 75 + 76 + switch (type) { 77 + case ACL_TYPE_ACCESS: 78 + name = POSIX_ACL_XATTR_ACCESS; 79 + break; 80 + case ACL_TYPE_DEFAULT: 81 + name = POSIX_ACL_XATTR_DEFAULT; 82 + break; 83 + default: 84 + BUG(); 85 + } 86 + 87 + size = __ceph_getxattr(inode, name, "", 0); 88 + if (size > 0) { 89 + value = kzalloc(size, GFP_NOFS); 90 + if (!value) 91 + return ERR_PTR(-ENOMEM); 92 + size = __ceph_getxattr(inode, name, value, size); 93 + } 94 + 95 + if (size > 0) 96 + acl = posix_acl_from_xattr(&init_user_ns, value, size); 97 + else if (size == -ERANGE || size == -ENODATA || size == 0) 98 + acl = NULL; 99 + else 100 + acl = ERR_PTR(-EIO); 101 + 102 + kfree(value); 103 + 104 + if (!IS_ERR(acl)) 105 + ceph_set_cached_acl(inode, type, acl); 106 + 107 + return acl; 108 + } 109 + 110 + static int ceph_set_acl(struct dentry *dentry, struct inode *inode, 111 + struct posix_acl *acl, int type) 112 + { 113 + int ret = 0, size = 0; 114 + const char *name = NULL; 115 + char *value = NULL; 116 + struct iattr newattrs; 117 + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; 118 + 119 + if (acl) { 120 + ret = posix_acl_valid(acl); 121 + if (ret < 0) 122 + goto out; 123 + } 124 + 125 + switch (type) { 126 + case ACL_TYPE_ACCESS: 127 + name = POSIX_ACL_XATTR_ACCESS; 128 + if (acl) { 129 + ret = posix_acl_equiv_mode(acl, &new_mode); 130 + if (ret < 0) 131 + goto out; 132 + if (ret == 0) 133 + acl = NULL; 134 + } 135 + break; 136 + case ACL_TYPE_DEFAULT: 137 + if (!S_ISDIR(inode->i_mode)) { 138 + ret = acl ? 
-EINVAL : 0; 139 + goto out; 140 + } 141 + name = POSIX_ACL_XATTR_DEFAULT; 142 + break; 143 + default: 144 + ret = -EINVAL; 145 + goto out; 146 + } 147 + 148 + if (acl) { 149 + size = posix_acl_xattr_size(acl->a_count); 150 + value = kmalloc(size, GFP_NOFS); 151 + if (!value) { 152 + ret = -ENOMEM; 153 + goto out; 154 + } 155 + 156 + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); 157 + if (ret < 0) 158 + goto out_free; 159 + } 160 + 161 + if (new_mode != old_mode) { 162 + newattrs.ia_mode = new_mode; 163 + newattrs.ia_valid = ATTR_MODE; 164 + ret = ceph_setattr(dentry, &newattrs); 165 + if (ret) 166 + goto out_free; 167 + } 168 + 169 + if (value) 170 + ret = __ceph_setxattr(dentry, name, value, size, 0); 171 + else 172 + ret = __ceph_removexattr(dentry, name); 173 + 174 + if (ret) { 175 + if (new_mode != old_mode) { 176 + newattrs.ia_mode = old_mode; 177 + newattrs.ia_valid = ATTR_MODE; 178 + ceph_setattr(dentry, &newattrs); 179 + } 180 + goto out_free; 181 + } 182 + 183 + ceph_set_cached_acl(inode, type, acl); 184 + 185 + out_free: 186 + kfree(value); 187 + out: 188 + return ret; 189 + } 190 + 191 + int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 192 + { 193 + struct posix_acl *acl = NULL; 194 + int ret = 0; 195 + 196 + if (!S_ISLNK(inode->i_mode)) { 197 + if (IS_POSIXACL(dir)) { 198 + acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); 199 + if (IS_ERR(acl)) { 200 + ret = PTR_ERR(acl); 201 + goto out; 202 + } 203 + } 204 + 205 + if (!acl) 206 + inode->i_mode &= ~current_umask(); 207 + } 208 + 209 + if (IS_POSIXACL(dir) && acl) { 210 + if (S_ISDIR(inode->i_mode)) { 211 + ret = ceph_set_acl(dentry, inode, acl, 212 + ACL_TYPE_DEFAULT); 213 + if (ret) 214 + goto out_release; 215 + } 216 + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); 217 + if (ret < 0) 218 + goto out; 219 + else if (ret > 0) 220 + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); 221 + else 222 + cache_no_acl(inode); 223 + } else { 224 + 
cache_no_acl(inode); 225 + } 226 + 227 + out_release: 228 + posix_acl_release(acl); 229 + out: 230 + return ret; 231 + } 232 + 233 + int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 234 + { 235 + struct posix_acl *acl; 236 + int ret = 0; 237 + 238 + if (S_ISLNK(inode->i_mode)) { 239 + ret = -EOPNOTSUPP; 240 + goto out; 241 + } 242 + 243 + if (!IS_POSIXACL(inode)) 244 + goto out; 245 + 246 + acl = ceph_get_acl(inode, ACL_TYPE_ACCESS); 247 + if (IS_ERR_OR_NULL(acl)) { 248 + ret = PTR_ERR(acl); 249 + goto out; 250 + } 251 + 252 + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); 253 + if (ret) 254 + goto out; 255 + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); 256 + posix_acl_release(acl); 257 + out: 258 + return ret; 259 + } 260 + 261 + static int ceph_xattr_acl_get(struct dentry *dentry, const char *name, 262 + void *value, size_t size, int type) 263 + { 264 + struct posix_acl *acl; 265 + int ret = 0; 266 + 267 + if (!IS_POSIXACL(dentry->d_inode)) 268 + return -EOPNOTSUPP; 269 + 270 + acl = ceph_get_acl(dentry->d_inode, type); 271 + if (IS_ERR(acl)) 272 + return PTR_ERR(acl); 273 + if (acl == NULL) 274 + return -ENODATA; 275 + 276 + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); 277 + posix_acl_release(acl); 278 + 279 + return ret; 280 + } 281 + 282 + static int ceph_xattr_acl_set(struct dentry *dentry, const char *name, 283 + const void *value, size_t size, int flags, int type) 284 + { 285 + int ret = 0; 286 + struct posix_acl *acl = NULL; 287 + 288 + if (!inode_owner_or_capable(dentry->d_inode)) { 289 + ret = -EPERM; 290 + goto out; 291 + } 292 + 293 + if (!IS_POSIXACL(dentry->d_inode)) { 294 + ret = -EOPNOTSUPP; 295 + goto out; 296 + } 297 + 298 + if (value) { 299 + acl = posix_acl_from_xattr(&init_user_ns, value, size); 300 + if (IS_ERR(acl)) { 301 + ret = PTR_ERR(acl); 302 + goto out; 303 + } 304 + 305 + if (acl) { 306 + ret = posix_acl_valid(acl); 307 + if (ret) 308 + goto out_release; 309 + } 310 + } 311 + 312 + ret 
= ceph_set_acl(dentry, dentry->d_inode, acl, type); 313 + 314 + out_release: 315 + posix_acl_release(acl); 316 + out: 317 + return ret; 318 + } 319 + 320 + const struct xattr_handler ceph_xattr_acl_default_handler = { 321 + .prefix = POSIX_ACL_XATTR_DEFAULT, 322 + .flags = ACL_TYPE_DEFAULT, 323 + .get = ceph_xattr_acl_get, 324 + .set = ceph_xattr_acl_set, 325 + }; 326 + 327 + const struct xattr_handler ceph_xattr_acl_access_handler = { 328 + .prefix = POSIX_ACL_XATTR_ACCESS, 329 + .flags = ACL_TYPE_ACCESS, 330 + .get = ceph_xattr_acl_get, 331 + .set = ceph_xattr_acl_set, 332 + };
+81 -12
fs/ceph/addr.c
··· 209 209 err = 0; 210 210 if (err < 0) { 211 211 SetPageError(page); 212 + ceph_fscache_readpage_cancel(inode, page); 212 213 goto out; 213 214 } else { 214 215 if (err < PAGE_CACHE_SIZE) { ··· 257 256 for (i = 0; i < num_pages; i++) { 258 257 struct page *page = osd_data->pages[i]; 259 258 259 + if (rc < 0) 260 + goto unlock; 260 261 if (bytes < (int)PAGE_CACHE_SIZE) { 261 262 /* zero (remainder of) page */ 262 263 int s = bytes < 0 ? 0 : bytes; ··· 269 266 flush_dcache_page(page); 270 267 SetPageUptodate(page); 271 268 ceph_readpage_to_fscache(inode, page); 269 + unlock: 272 270 unlock_page(page); 273 271 page_cache_release(page); 274 272 bytes -= PAGE_CACHE_SIZE; ··· 1211 1207 /* 1212 1208 * vm ops 1213 1209 */ 1210 + static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1211 + { 1212 + struct inode *inode = file_inode(vma->vm_file); 1213 + struct ceph_inode_info *ci = ceph_inode(inode); 1214 + struct ceph_file_info *fi = vma->vm_file->private_data; 1215 + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; 1216 + int want, got, ret; 1217 + 1218 + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", 1219 + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); 1220 + if (fi->fmode & CEPH_FILE_MODE_LAZY) 1221 + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1222 + else 1223 + want = CEPH_CAP_FILE_CACHE; 1224 + while (1) { 1225 + got = 0; 1226 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 1227 + if (ret == 0) 1228 + break; 1229 + if (ret != -ERESTARTSYS) { 1230 + WARN_ON(1); 1231 + return VM_FAULT_SIGBUS; 1232 + } 1233 + } 1234 + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1235 + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1236 + 1237 + ret = filemap_fault(vma, vmf); 1238 + 1239 + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1240 + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1241 + ceph_put_cap_refs(ci, got); 1242 + 1243 + return ret; 1244 + 
} 1214 1245 1215 1246 /* 1216 1247 * Reuse write_begin here for simplicity. ··· 1253 1214 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1254 1215 { 1255 1216 struct inode *inode = file_inode(vma->vm_file); 1256 - struct page *page = vmf->page; 1217 + struct ceph_inode_info *ci = ceph_inode(inode); 1218 + struct ceph_file_info *fi = vma->vm_file->private_data; 1257 1219 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1220 + struct page *page = vmf->page; 1258 1221 loff_t off = page_offset(page); 1259 - loff_t size, len; 1260 - int ret; 1222 + loff_t size = i_size_read(inode); 1223 + size_t len; 1224 + int want, got, ret; 1261 1225 1262 - /* Update time before taking page lock */ 1263 - file_update_time(vma->vm_file); 1264 - 1265 - size = i_size_read(inode); 1266 1226 if (off + PAGE_CACHE_SIZE <= size) 1267 1227 len = PAGE_CACHE_SIZE; 1268 1228 else 1269 1229 len = size & ~PAGE_CACHE_MASK; 1270 1230 1271 - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, 1272 - off, len, page, page->index); 1231 + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", 1232 + inode, ceph_vinop(inode), off, len, size); 1233 + if (fi->fmode & CEPH_FILE_MODE_LAZY) 1234 + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 1235 + else 1236 + want = CEPH_CAP_FILE_BUFFER; 1237 + while (1) { 1238 + got = 0; 1239 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); 1240 + if (ret == 0) 1241 + break; 1242 + if (ret != -ERESTARTSYS) { 1243 + WARN_ON(1); 1244 + return VM_FAULT_SIGBUS; 1245 + } 1246 + } 1247 + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", 1248 + inode, off, len, ceph_cap_string(got)); 1249 + 1250 + /* Update time before taking page lock */ 1251 + file_update_time(vma->vm_file); 1273 1252 1274 1253 lock_page(page); 1275 1254 ··· 1309 1252 ret = VM_FAULT_SIGBUS; 1310 1253 } 1311 1254 out: 1312 - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); 1313 - if (ret != 
VM_FAULT_LOCKED) 1255 + if (ret != VM_FAULT_LOCKED) { 1314 1256 unlock_page(page); 1257 + } else { 1258 + int dirty; 1259 + spin_lock(&ci->i_ceph_lock); 1260 + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1261 + spin_unlock(&ci->i_ceph_lock); 1262 + if (dirty) 1263 + __mark_inode_dirty(inode, dirty); 1264 + } 1265 + 1266 + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", 1267 + inode, off, len, ceph_cap_string(got), ret); 1268 + ceph_put_cap_refs(ci, got); 1269 + 1315 1270 return ret; 1316 1271 } 1317 1272 1318 1273 static struct vm_operations_struct ceph_vmops = { 1319 - .fault = filemap_fault, 1274 + .fault = ceph_filemap_fault, 1320 1275 .page_mkwrite = ceph_page_mkwrite, 1321 1276 .remap_pages = generic_file_remap_pages, 1322 1277 };
+13
fs/ceph/cache.h
··· 67 67 return fscache_maybe_release_page(ci->fscache, page, gfp); 68 68 } 69 69 70 + static inline void ceph_fscache_readpage_cancel(struct inode *inode, 71 + struct page *page) 72 + { 73 + struct ceph_inode_info *ci = ceph_inode(inode); 74 + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) 75 + __fscache_uncache_page(ci->fscache, page); 76 + } 77 + 70 78 static inline void ceph_fscache_readpages_cancel(struct inode *inode, 71 79 struct list_head *pages) 72 80 { ··· 151 143 static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) 152 144 { 153 145 return 1; 146 + } 147 + 148 + static inline void ceph_fscache_readpage_cancel(struct inode *inode, 149 + struct page *page) 150 + { 154 151 } 155 152 156 153 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+225 -119
fs/ceph/caps.c
··· 555 555 cap->ci = ci; 556 556 __insert_cap_node(ci, cap); 557 557 558 - /* clear out old exporting info? (i.e. on cap import) */ 559 - if (ci->i_cap_exporting_mds == mds) { 560 - ci->i_cap_exporting_issued = 0; 561 - ci->i_cap_exporting_mseq = 0; 562 - ci->i_cap_exporting_mds = -1; 563 - } 564 - 565 558 /* add to session cap list */ 566 559 cap->session = session; 567 560 spin_lock(&session->s_cap_lock); 568 561 list_add_tail(&cap->session_caps, &session->s_caps); 569 562 session->s_nr_caps++; 570 563 spin_unlock(&session->s_cap_lock); 571 - } else if (new_cap) 572 - ceph_put_cap(mdsc, new_cap); 564 + } else { 565 + if (new_cap) 566 + ceph_put_cap(mdsc, new_cap); 567 + 568 + /* 569 + * auth mds of the inode changed. we received the cap export 570 + * message, but still haven't received the cap import message. 571 + * handle_cap_export() updated the new auth MDS' cap. 572 + * 573 + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing 574 + * a message that was send before the cap import message. So 575 + * don't remove caps. 
576 + */ 577 + if (ceph_seq_cmp(seq, cap->seq) <= 0) { 578 + WARN_ON(cap != ci->i_auth_cap); 579 + WARN_ON(cap->cap_id != cap_id); 580 + seq = cap->seq; 581 + mseq = cap->mseq; 582 + issued |= cap->issued; 583 + flags |= CEPH_CAP_FLAG_AUTH; 584 + } 585 + } 573 586 574 587 if (!ci->i_snap_realm) { 575 588 /* ··· 624 611 if (ci->i_auth_cap == NULL || 625 612 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 626 613 ci->i_auth_cap = cap; 627 - } else if (ci->i_auth_cap == cap) { 628 - ci->i_auth_cap = NULL; 629 - spin_lock(&mdsc->cap_dirty_lock); 630 - if (!list_empty(&ci->i_dirty_item)) { 631 - dout(" moving %p to cap_dirty_migrating\n", inode); 632 - list_move(&ci->i_dirty_item, 633 - &mdsc->cap_dirty_migrating); 634 - } 635 - spin_unlock(&mdsc->cap_dirty_lock); 614 + ci->i_cap_exporting_issued = 0; 615 + } else { 616 + WARN_ON(ci->i_auth_cap == cap); 636 617 } 637 618 638 619 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", ··· 635 628 cap->cap_id = cap_id; 636 629 cap->issued = issued; 637 630 cap->implemented |= issued; 638 - if (mseq > cap->mseq) 631 + if (ceph_seq_cmp(mseq, cap->mseq) > 0) 639 632 cap->mds_wanted = wanted; 640 633 else 641 634 cap->mds_wanted |= wanted; ··· 823 816 824 817 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 825 818 cap = rb_entry(p, struct ceph_cap, ci_node); 826 - if (cap != ocap && __cap_is_valid(cap) && 819 + if (cap != ocap && 827 820 (cap->implemented & ~cap->issued & mask)) 828 821 return 1; 829 822 } ··· 895 888 */ 896 889 static int __ceph_is_any_caps(struct ceph_inode_info *ci) 897 890 { 898 - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 891 + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; 892 + } 893 + 894 + int ceph_is_any_caps(struct inode *inode) 895 + { 896 + struct ceph_inode_info *ci = ceph_inode(inode); 897 + int ret; 898 + 899 + spin_lock(&ci->i_ceph_lock); 900 + ret = __ceph_is_any_caps(ci); 901 + spin_unlock(&ci->i_ceph_lock); 902 + 903 + return ret; 
899 904 } 900 905 901 906 /* ··· 1402 1383 ci->i_snap_realm->cached_context); 1403 1384 dout(" inode %p now dirty snapc %p auth cap %p\n", 1404 1385 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1386 + WARN_ON(!ci->i_auth_cap); 1405 1387 BUG_ON(!list_empty(&ci->i_dirty_item)); 1406 1388 spin_lock(&mdsc->cap_dirty_lock); 1407 - if (ci->i_auth_cap) 1408 - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1409 - else 1410 - list_add(&ci->i_dirty_item, 1411 - &mdsc->cap_dirty_migrating); 1389 + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1412 1390 spin_unlock(&mdsc->cap_dirty_lock); 1413 1391 if (ci->i_flushing_caps == 0) { 1414 1392 ihold(inode); ··· 1751 1735 /* 1752 1736 * Try to flush dirty caps back to the auth mds. 1753 1737 */ 1754 - static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1755 - unsigned *flush_tid) 1738 + static int try_flush_caps(struct inode *inode, unsigned *flush_tid) 1756 1739 { 1757 1740 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1758 1741 struct ceph_inode_info *ci = ceph_inode(inode); 1759 - int unlock_session = session ? 
0 : 1; 1760 1742 int flushing = 0; 1743 + struct ceph_mds_session *session = NULL; 1761 1744 1762 1745 retry: 1763 1746 spin_lock(&ci->i_ceph_lock); ··· 1770 1755 int want = __ceph_caps_wanted(ci); 1771 1756 int delayed; 1772 1757 1773 - if (!session) { 1758 + if (!session || session != cap->session) { 1774 1759 spin_unlock(&ci->i_ceph_lock); 1760 + if (session) 1761 + mutex_unlock(&session->s_mutex); 1775 1762 session = cap->session; 1776 1763 mutex_lock(&session->s_mutex); 1777 1764 goto retry; 1778 1765 } 1779 - BUG_ON(session != cap->session); 1780 1766 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1781 1767 goto out; 1782 1768 ··· 1796 1780 out: 1797 1781 spin_unlock(&ci->i_ceph_lock); 1798 1782 out_unlocked: 1799 - if (session && unlock_session) 1783 + if (session) 1800 1784 mutex_unlock(&session->s_mutex); 1801 1785 return flushing; 1802 1786 } ··· 1881 1865 return ret; 1882 1866 mutex_lock(&inode->i_mutex); 1883 1867 1884 - dirty = try_flush_caps(inode, NULL, &flush_tid); 1868 + dirty = try_flush_caps(inode, &flush_tid); 1885 1869 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1886 1870 1887 1871 /* ··· 1916 1900 1917 1901 dout("write_inode %p wait=%d\n", inode, wait); 1918 1902 if (wait) { 1919 - dirty = try_flush_caps(inode, NULL, &flush_tid); 1903 + dirty = try_flush_caps(inode, &flush_tid); 1920 1904 if (dirty) 1921 1905 err = wait_event_interruptible(ci->i_cap_wq, 1922 1906 caps_are_flushed(inode, flush_tid)); ··· 2366 2350 d_prune_aliases(inode); 2367 2351 /* 2368 2352 * For non-directory inode, d_find_alias() only returns 2369 - * connected dentry. After calling d_invalidate(), the 2370 - * dentry become disconnected. 2353 + * hashed dentry. After calling d_invalidate(), the 2354 + * dentry becomes unhashed. 2371 2355 * 2372 2356 * For directory inode, d_find_alias() can return 2373 - * disconnected dentry. But directory inode should have 2357 + * unhashed dentry. But directory inode should have 2374 2358 * one alias at most. 
2375 2359 */ 2376 2360 while ((dn = d_find_alias(inode))) { ··· 2424 2408 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2425 2409 inode->i_size); 2426 2410 2411 + 2412 + /* 2413 + * auth mds of the inode changed. we received the cap export message, 2414 + * but still haven't received the cap import message. handle_cap_export 2415 + * updated the new auth MDS' cap. 2416 + * 2417 + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message 2418 + * that was sent before the cap import message. So don't remove caps. 2419 + */ 2420 + if (ceph_seq_cmp(seq, cap->seq) <= 0) { 2421 + WARN_ON(cap != ci->i_auth_cap); 2422 + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); 2423 + seq = cap->seq; 2424 + newcaps |= cap->issued; 2425 + } 2426 + 2427 2427 /* 2428 2428 * If CACHE is being revoked, and we have no dirty buffers, 2429 2429 * try to invalidate (once). (If there are dirty buffers, we ··· 2466 2434 issued |= implemented | __ceph_caps_dirty(ci); 2467 2435 2468 2436 cap->cap_gen = session->s_cap_gen; 2437 + cap->seq = seq; 2469 2438 2470 2439 __check_cap_issue(ci, cap, newcaps); 2471 2440 ··· 2497 2464 ceph_buffer_put(ci->i_xattrs.blob); 2498 2465 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2499 2466 ci->i_xattrs.version = version; 2467 + ceph_forget_all_cached_acls(inode); 2500 2468 } 2501 2469 } 2502 2470 ··· 2516 2482 ceph_fill_file_time(inode, issued, 2517 2483 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2518 2484 &atime); 2485 + 2486 + 2487 + /* file layout may have changed */ 2488 + ci->i_layout = grant->layout; 2519 2489 2520 2490 /* max size increase? */ 2521 2491 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { ··· 2548 2510 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) 2549 2511 check_caps = 1; 2550 2512 } 2551 - 2552 - cap->seq = seq; 2553 - 2554 - /* file layout may have changed */ 2555 - ci->i_layout = grant->layout; 2556 2513 2557 2514 /* revocation, grant, or no-op? 
*/ 2558 2515 if (cap->issued & ~newcaps) { ··· 2774 2741 * caller holds s_mutex 2775 2742 */ 2776 2743 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2777 - struct ceph_mds_session *session, 2778 - int *open_target_sessions) 2744 + struct ceph_mds_cap_peer *ph, 2745 + struct ceph_mds_session *session) 2779 2746 { 2780 2747 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2748 + struct ceph_mds_session *tsession = NULL; 2749 + struct ceph_cap *cap, *tcap; 2781 2750 struct ceph_inode_info *ci = ceph_inode(inode); 2782 - int mds = session->s_mds; 2751 + u64 t_cap_id; 2783 2752 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2784 - struct ceph_cap *cap = NULL, *t; 2785 - struct rb_node *p; 2786 - int remember = 1; 2753 + unsigned t_seq, t_mseq; 2754 + int target, issued; 2755 + int mds = session->s_mds; 2787 2756 2788 - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2789 - inode, ci, mds, mseq); 2790 - 2791 - spin_lock(&ci->i_ceph_lock); 2792 - 2793 - /* make sure we haven't seen a higher mseq */ 2794 - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2795 - t = rb_entry(p, struct ceph_cap, ci_node); 2796 - if (ceph_seq_cmp(t->mseq, mseq) > 0) { 2797 - dout(" higher mseq on cap from mds%d\n", 2798 - t->session->s_mds); 2799 - remember = 0; 2800 - } 2801 - if (t->session->s_mds == mds) 2802 - cap = t; 2757 + if (ph) { 2758 + t_cap_id = le64_to_cpu(ph->cap_id); 2759 + t_seq = le32_to_cpu(ph->seq); 2760 + t_mseq = le32_to_cpu(ph->mseq); 2761 + target = le32_to_cpu(ph->mds); 2762 + } else { 2763 + t_cap_id = t_seq = t_mseq = 0; 2764 + target = -1; 2803 2765 } 2804 2766 2805 - if (cap) { 2806 - if (remember) { 2807 - /* make note */ 2808 - ci->i_cap_exporting_mds = mds; 2809 - ci->i_cap_exporting_mseq = mseq; 2810 - ci->i_cap_exporting_issued = cap->issued; 2767 + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", 2768 + inode, ci, mds, mseq, target); 2769 + retry: 2770 + spin_lock(&ci->i_ceph_lock); 
2771 + cap = __get_cap_for_mds(ci, mds); 2772 + if (!cap) 2773 + goto out_unlock; 2811 2774 2812 - /* 2813 - * make sure we have open sessions with all possible 2814 - * export targets, so that we get the matching IMPORT 2815 - */ 2816 - *open_target_sessions = 1; 2775 + if (target < 0) { 2776 + __ceph_remove_cap(cap, false); 2777 + goto out_unlock; 2778 + } 2817 2779 2818 - /* 2819 - * we can't flush dirty caps that we've seen the 2820 - * EXPORT but no IMPORT for 2821 - */ 2822 - spin_lock(&mdsc->cap_dirty_lock); 2823 - if (!list_empty(&ci->i_dirty_item)) { 2824 - dout(" moving %p to cap_dirty_migrating\n", 2825 - inode); 2826 - list_move(&ci->i_dirty_item, 2827 - &mdsc->cap_dirty_migrating); 2780 + /* 2781 + * now we know we haven't received the cap import message yet 2782 + * because the exported cap still exist. 2783 + */ 2784 + 2785 + issued = cap->issued; 2786 + WARN_ON(issued != cap->implemented); 2787 + 2788 + tcap = __get_cap_for_mds(ci, target); 2789 + if (tcap) { 2790 + /* already have caps from the target */ 2791 + if (tcap->cap_id != t_cap_id || 2792 + ceph_seq_cmp(tcap->seq, t_seq) < 0) { 2793 + dout(" updating import cap %p mds%d\n", tcap, target); 2794 + tcap->cap_id = t_cap_id; 2795 + tcap->seq = t_seq - 1; 2796 + tcap->issue_seq = t_seq - 1; 2797 + tcap->mseq = t_mseq; 2798 + tcap->issued |= issued; 2799 + tcap->implemented |= issued; 2800 + if (cap == ci->i_auth_cap) 2801 + ci->i_auth_cap = tcap; 2802 + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { 2803 + spin_lock(&mdsc->cap_dirty_lock); 2804 + list_move_tail(&ci->i_flushing_item, 2805 + &tcap->session->s_cap_flushing); 2806 + spin_unlock(&mdsc->cap_dirty_lock); 2828 2807 } 2829 - spin_unlock(&mdsc->cap_dirty_lock); 2830 2808 } 2831 2809 __ceph_remove_cap(cap, false); 2810 + goto out_unlock; 2832 2811 } 2833 - /* else, we already released it */ 2812 + 2813 + if (tsession) { 2814 + int flag = (cap == ci->i_auth_cap) ? 
CEPH_CAP_FLAG_AUTH : 0; 2815 + spin_unlock(&ci->i_ceph_lock); 2816 + /* add placeholder for the export tagert */ 2817 + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, 2818 + t_seq - 1, t_mseq, (u64)-1, flag, NULL); 2819 + goto retry; 2820 + } 2834 2821 2835 2822 spin_unlock(&ci->i_ceph_lock); 2823 + mutex_unlock(&session->s_mutex); 2824 + 2825 + /* open target session */ 2826 + tsession = ceph_mdsc_open_export_target_session(mdsc, target); 2827 + if (!IS_ERR(tsession)) { 2828 + if (mds > target) { 2829 + mutex_lock(&session->s_mutex); 2830 + mutex_lock_nested(&tsession->s_mutex, 2831 + SINGLE_DEPTH_NESTING); 2832 + } else { 2833 + mutex_lock(&tsession->s_mutex); 2834 + mutex_lock_nested(&session->s_mutex, 2835 + SINGLE_DEPTH_NESTING); 2836 + } 2837 + ceph_add_cap_releases(mdsc, tsession); 2838 + } else { 2839 + WARN_ON(1); 2840 + tsession = NULL; 2841 + target = -1; 2842 + } 2843 + goto retry; 2844 + 2845 + out_unlock: 2846 + spin_unlock(&ci->i_ceph_lock); 2847 + mutex_unlock(&session->s_mutex); 2848 + if (tsession) { 2849 + mutex_unlock(&tsession->s_mutex); 2850 + ceph_put_mds_session(tsession); 2851 + } 2836 2852 } 2837 2853 2838 2854 /* ··· 2892 2810 */ 2893 2811 static void handle_cap_import(struct ceph_mds_client *mdsc, 2894 2812 struct inode *inode, struct ceph_mds_caps *im, 2813 + struct ceph_mds_cap_peer *ph, 2895 2814 struct ceph_mds_session *session, 2896 2815 void *snaptrace, int snaptrace_len) 2897 2816 { 2898 2817 struct ceph_inode_info *ci = ceph_inode(inode); 2818 + struct ceph_cap *cap; 2899 2819 int mds = session->s_mds; 2900 2820 unsigned issued = le32_to_cpu(im->caps); 2901 2821 unsigned wanted = le32_to_cpu(im->wanted); ··· 2905 2821 unsigned mseq = le32_to_cpu(im->migrate_seq); 2906 2822 u64 realmino = le64_to_cpu(im->realm); 2907 2823 u64 cap_id = le64_to_cpu(im->cap_id); 2824 + u64 p_cap_id; 2825 + int peer; 2908 2826 2909 - if (ci->i_cap_exporting_mds >= 0 && 2910 - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2911 - 
dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2912 - " - cleared exporting from mds%d\n", 2913 - inode, ci, mds, mseq, 2914 - ci->i_cap_exporting_mds); 2915 - ci->i_cap_exporting_issued = 0; 2916 - ci->i_cap_exporting_mseq = 0; 2917 - ci->i_cap_exporting_mds = -1; 2918 - 2919 - spin_lock(&mdsc->cap_dirty_lock); 2920 - if (!list_empty(&ci->i_dirty_item)) { 2921 - dout(" moving %p back to cap_dirty\n", inode); 2922 - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2923 - } 2924 - spin_unlock(&mdsc->cap_dirty_lock); 2827 + if (ph) { 2828 + p_cap_id = le64_to_cpu(ph->cap_id); 2829 + peer = le32_to_cpu(ph->mds); 2925 2830 } else { 2926 - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", 2927 - inode, ci, mds, mseq); 2831 + p_cap_id = 0; 2832 + peer = -1; 2928 2833 } 2834 + 2835 + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", 2836 + inode, ci, mds, mseq, peer); 2837 + 2838 + spin_lock(&ci->i_ceph_lock); 2839 + cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; 2840 + if (cap && cap->cap_id == p_cap_id) { 2841 + dout(" remove export cap %p mds%d flags %d\n", 2842 + cap, peer, ph->flags); 2843 + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && 2844 + (cap->seq != le32_to_cpu(ph->seq) || 2845 + cap->mseq != le32_to_cpu(ph->mseq))) { 2846 + pr_err("handle_cap_import: mismatched seq/mseq: " 2847 + "ino (%llx.%llx) mds%d seq %d mseq %d " 2848 + "importer mds%d has peer seq %d mseq %d\n", 2849 + ceph_vinop(inode), peer, cap->seq, 2850 + cap->mseq, mds, le32_to_cpu(ph->seq), 2851 + le32_to_cpu(ph->mseq)); 2852 + } 2853 + ci->i_cap_exporting_issued = cap->issued; 2854 + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); 2855 + } 2856 + 2857 + /* make sure we re-request max_size, if necessary */ 2858 + ci->i_wanted_max_size = 0; 2859 + ci->i_requested_max_size = 0; 2860 + spin_unlock(&ci->i_ceph_lock); 2929 2861 2930 2862 down_write(&mdsc->snap_rwsem); 2931 2863 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, ··· 2953 2853 
kick_flushing_inode_caps(mdsc, session, inode); 2954 2854 up_read(&mdsc->snap_rwsem); 2955 2855 2956 - /* make sure we re-request max_size, if necessary */ 2957 - spin_lock(&ci->i_ceph_lock); 2958 - ci->i_wanted_max_size = 0; /* reset */ 2959 - ci->i_requested_max_size = 0; 2960 - spin_unlock(&ci->i_ceph_lock); 2961 2856 } 2962 2857 2963 2858 /* ··· 2970 2875 struct ceph_inode_info *ci; 2971 2876 struct ceph_cap *cap; 2972 2877 struct ceph_mds_caps *h; 2878 + struct ceph_mds_cap_peer *peer = NULL; 2973 2879 int mds = session->s_mds; 2974 2880 int op; 2975 2881 u32 seq, mseq; ··· 2981 2885 void *snaptrace; 2982 2886 size_t snaptrace_len; 2983 2887 void *flock; 2888 + void *end; 2984 2889 u32 flock_len; 2985 - int open_target_sessions = 0; 2986 2890 2987 2891 dout("handle_caps from mds%d\n", mds); 2988 2892 2989 2893 /* decode */ 2894 + end = msg->front.iov_base + msg->front.iov_len; 2990 2895 tid = le64_to_cpu(msg->hdr.tid); 2991 2896 if (msg->front.iov_len < sizeof(*h)) 2992 2897 goto bad; ··· 3005 2908 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3006 2909 3007 2910 if (le16_to_cpu(msg->hdr.version) >= 2) { 3008 - void *p, *end; 3009 - 3010 - p = snaptrace + snaptrace_len; 3011 - end = msg->front.iov_base + msg->front.iov_len; 2911 + void *p = snaptrace + snaptrace_len; 3012 2912 ceph_decode_32_safe(&p, end, flock_len, bad); 2913 + if (p + flock_len > end) 2914 + goto bad; 3013 2915 flock = p; 3014 2916 } else { 3015 2917 flock = NULL; 3016 2918 flock_len = 0; 2919 + } 2920 + 2921 + if (le16_to_cpu(msg->hdr.version) >= 3) { 2922 + if (op == CEPH_CAP_OP_IMPORT) { 2923 + void *p = flock + flock_len; 2924 + if (p + sizeof(*peer) > end) 2925 + goto bad; 2926 + peer = p; 2927 + } else if (op == CEPH_CAP_OP_EXPORT) { 2928 + /* recorded in unused fields */ 2929 + peer = (void *)&h->size; 2930 + } 3017 2931 } 3018 2932 3019 2933 mutex_lock(&session->s_mutex); ··· 3059 2951 goto done; 3060 2952 3061 2953 case CEPH_CAP_OP_EXPORT: 3062 - handle_cap_export(inode, h, 
session, &open_target_sessions); 3063 - goto done; 2954 + handle_cap_export(inode, h, peer, session); 2955 + goto done_unlocked; 3064 2956 3065 2957 case CEPH_CAP_OP_IMPORT: 3066 - handle_cap_import(mdsc, inode, h, session, 2958 + handle_cap_import(mdsc, inode, h, peer, session, 3067 2959 snaptrace, snaptrace_len); 3068 2960 } 3069 2961 ··· 3115 3007 done_unlocked: 3116 3008 if (inode) 3117 3009 iput(inode); 3118 - if (open_target_sessions) 3119 - ceph_mdsc_open_export_target_sessions(mdsc, session); 3120 3010 return; 3121 3011 3122 3012 bad:
+13 -3
fs/ceph/dir.c
··· 693 693 if (!err && !req->r_reply_info.head->is_dentry) 694 694 err = ceph_handle_notrace_create(dir, dentry); 695 695 ceph_mdsc_put_request(req); 696 + 697 + if (!err) 698 + err = ceph_init_acl(dentry, dentry->d_inode, dir); 699 + 696 700 if (err) 697 701 d_drop(dentry); 698 702 return err; ··· 1041 1037 valid = 1; 1042 1038 } else if (dentry_lease_is_valid(dentry) || 1043 1039 dir_lease_is_valid(dir, dentry)) { 1044 - valid = 1; 1040 + if (dentry->d_inode) 1041 + valid = ceph_is_any_caps(dentry->d_inode); 1042 + else 1043 + valid = 1; 1045 1044 } 1046 1045 1047 1046 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1048 - if (valid) 1047 + if (valid) { 1049 1048 ceph_dentry_lru_touch(dentry); 1050 - else 1049 + } else { 1050 + ceph_dir_clear_complete(dir); 1051 1051 d_drop(dentry); 1052 + } 1052 1053 iput(dir); 1053 1054 return valid; 1054 1055 } ··· 1302 1293 .getxattr = ceph_getxattr, 1303 1294 .listxattr = ceph_listxattr, 1304 1295 .removexattr = ceph_removexattr, 1296 + .get_acl = ceph_get_acl, 1305 1297 .mknod = ceph_mknod, 1306 1298 .symlink = ceph_symlink, 1307 1299 .mkdir = ceph_mkdir,
+309 -126
fs/ceph/file.c
··· 408 408 * 409 409 * If the read spans object boundary, just do multiple reads. 410 410 */ 411 - static ssize_t ceph_sync_read(struct file *file, char __user *data, 412 - unsigned len, loff_t *poff, int *checkeof) 411 + static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, 412 + int *checkeof) 413 413 { 414 + struct file *file = iocb->ki_filp; 414 415 struct inode *inode = file_inode(file); 415 416 struct page **pages; 416 - u64 off = *poff; 417 + u64 off = iocb->ki_pos; 417 418 int num_pages, ret; 419 + size_t len = i->count; 418 420 419 - dout("sync_read on file %p %llu~%u %s\n", file, off, len, 421 + dout("sync_read on file %p %llu~%u %s\n", file, off, 422 + (unsigned)len, 420 423 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 421 - 422 - if (file->f_flags & O_DIRECT) { 423 - num_pages = calc_pages_for((unsigned long)data, len); 424 - pages = ceph_get_direct_page_vector(data, num_pages, true); 425 - } else { 426 - num_pages = calc_pages_for(off, len); 427 - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 428 - } 429 - if (IS_ERR(pages)) 430 - return PTR_ERR(pages); 431 - 432 424 /* 433 425 * flush any page cache pages in this range. this 434 426 * will make concurrent normal and sync io slow, 435 427 * but it will at least behave sensibly when they are 436 428 * in sequence. 
437 429 */ 438 - ret = filemap_write_and_wait(inode->i_mapping); 430 + ret = filemap_write_and_wait_range(inode->i_mapping, off, 431 + off + len); 439 432 if (ret < 0) 440 - goto done; 433 + return ret; 441 434 442 - ret = striped_read(inode, off, len, pages, num_pages, checkeof, 443 - file->f_flags & O_DIRECT, 444 - (unsigned long)data & ~PAGE_MASK); 435 + if (file->f_flags & O_DIRECT) { 436 + while (iov_iter_count(i)) { 437 + void __user *data = i->iov[0].iov_base + i->iov_offset; 438 + size_t len = i->iov[0].iov_len - i->iov_offset; 445 439 446 - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 447 - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 448 - if (ret >= 0) 449 - *poff = off + ret; 440 + num_pages = calc_pages_for((unsigned long)data, len); 441 + pages = ceph_get_direct_page_vector(data, 442 + num_pages, true); 443 + if (IS_ERR(pages)) 444 + return PTR_ERR(pages); 450 445 451 - done: 452 - if (file->f_flags & O_DIRECT) 453 - ceph_put_page_vector(pages, num_pages, true); 454 - else 446 + ret = striped_read(inode, off, len, 447 + pages, num_pages, checkeof, 448 + 1, (unsigned long)data & ~PAGE_MASK); 449 + ceph_put_page_vector(pages, num_pages, true); 450 + 451 + if (ret <= 0) 452 + break; 453 + off += ret; 454 + iov_iter_advance(i, ret); 455 + if (ret < len) 456 + break; 457 + } 458 + } else { 459 + num_pages = calc_pages_for(off, len); 460 + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 461 + if (IS_ERR(pages)) 462 + return PTR_ERR(pages); 463 + ret = striped_read(inode, off, len, pages, 464 + num_pages, checkeof, 0, 0); 465 + if (ret > 0) { 466 + int l, k = 0; 467 + size_t left = len = ret; 468 + 469 + while (left) { 470 + void __user *data = i->iov[0].iov_base 471 + + i->iov_offset; 472 + l = min(i->iov[0].iov_len - i->iov_offset, 473 + left); 474 + 475 + ret = ceph_copy_page_vector_to_user(&pages[k], 476 + data, off, 477 + l); 478 + if (ret > 0) { 479 + iov_iter_advance(i, ret); 480 + left -= ret; 481 + off += ret; 482 + k = 
calc_pages_for(iocb->ki_pos, 483 + len - left + 1) - 1; 484 + BUG_ON(k >= num_pages && left); 485 + } else 486 + break; 487 + } 488 + } 455 489 ceph_release_page_vector(pages, num_pages); 490 + } 491 + 492 + if (off > iocb->ki_pos) { 493 + ret = off - iocb->ki_pos; 494 + iocb->ki_pos = off; 495 + } 496 + 456 497 dout("sync_read result %d\n", ret); 457 498 return ret; 458 499 } ··· 530 489 } 531 490 } 532 491 492 + 533 493 /* 534 - * Synchronous write, straight from __user pointer or user pages (if 535 - * O_DIRECT). 494 + * Synchronous write, straight from __user pointer or user pages. 536 495 * 537 496 * If write spans object boundary, just do multiple writes. (For a 538 497 * correct atomic write, we should e.g. take write locks on all 539 498 * objects, rollback on failure, etc.) 540 499 */ 541 - static ssize_t ceph_sync_write(struct file *file, const char __user *data, 542 - size_t left, loff_t pos, loff_t *ppos) 500 + static ssize_t 501 + ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, 502 + unsigned long nr_segs, size_t count) 543 503 { 504 + struct file *file = iocb->ki_filp; 544 505 struct inode *inode = file_inode(file); 545 506 struct ceph_inode_info *ci = ceph_inode(inode); 546 507 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 547 508 struct ceph_snap_context *snapc; 548 509 struct ceph_vino vino; 549 510 struct ceph_osd_request *req; 550 - int num_ops = 1; 551 511 struct page **pages; 552 512 int num_pages; 553 - u64 len; 554 513 int written = 0; 555 514 int flags; 556 515 int check_caps = 0; 557 - int page_align, io_align; 558 - unsigned long buf_align; 516 + int page_align; 559 517 int ret; 560 518 struct timespec mtime = CURRENT_TIME; 561 - bool own_pages = false; 519 + loff_t pos = iocb->ki_pos; 520 + struct iov_iter i; 562 521 563 522 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 564 523 return -EROFS; 565 524 566 - dout("sync_write on file %p %lld~%u %s\n", file, pos, 567 - (unsigned)left, (file->f_flags & 
O_DIRECT) ? "O_DIRECT" : ""); 525 + dout("sync_direct_write on file %p %lld~%u\n", file, pos, 526 + (unsigned)count); 568 527 569 - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 528 + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 570 529 if (ret < 0) 571 530 return ret; 572 531 573 532 ret = invalidate_inode_pages2_range(inode->i_mapping, 574 533 pos >> PAGE_CACHE_SHIFT, 575 - (pos + left) >> PAGE_CACHE_SHIFT); 534 + (pos + count) >> PAGE_CACHE_SHIFT); 576 535 if (ret < 0) 577 536 dout("invalidate_inode_pages2_range returned %d\n", ret); 578 537 579 538 flags = CEPH_OSD_FLAG_ORDERSNAP | 580 539 CEPH_OSD_FLAG_ONDISK | 581 540 CEPH_OSD_FLAG_WRITE; 582 - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) 583 - flags |= CEPH_OSD_FLAG_ACK; 584 - else 585 - num_ops++; /* Also include a 'startsync' command. */ 586 541 587 - /* 588 - * we may need to do multiple writes here if we span an object 589 - * boundary. this isn't atomic, unfortunately. :( 590 - */ 591 - more: 592 - io_align = pos & ~PAGE_MASK; 593 - buf_align = (unsigned long)data & ~PAGE_MASK; 594 - len = left; 542 + iov_iter_init(&i, iov, nr_segs, count, 0); 595 543 596 - snapc = ci->i_snap_realm->cached_context; 597 - vino = ceph_vino(inode); 598 - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 599 - vino, pos, &len, num_ops, 600 - CEPH_OSD_OP_WRITE, flags, snapc, 601 - ci->i_truncate_seq, ci->i_truncate_size, 602 - false); 603 - if (IS_ERR(req)) 604 - return PTR_ERR(req); 544 + while (iov_iter_count(&i) > 0) { 545 + void __user *data = i.iov->iov_base + i.iov_offset; 546 + u64 len = i.iov->iov_len - i.iov_offset; 605 547 606 - /* write from beginning of first page, regardless of io alignment */ 607 - page_align = file->f_flags & O_DIRECT ? 
buf_align : io_align; 608 - num_pages = calc_pages_for(page_align, len); 609 - if (file->f_flags & O_DIRECT) { 548 + page_align = (unsigned long)data & ~PAGE_MASK; 549 + 550 + snapc = ci->i_snap_realm->cached_context; 551 + vino = ceph_vino(inode); 552 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 553 + vino, pos, &len, 554 + 2,/*include a 'startsync' command*/ 555 + CEPH_OSD_OP_WRITE, flags, snapc, 556 + ci->i_truncate_seq, 557 + ci->i_truncate_size, 558 + false); 559 + if (IS_ERR(req)) { 560 + ret = PTR_ERR(req); 561 + goto out; 562 + } 563 + 564 + num_pages = calc_pages_for(page_align, len); 610 565 pages = ceph_get_direct_page_vector(data, num_pages, false); 611 566 if (IS_ERR(pages)) { 612 567 ret = PTR_ERR(pages); ··· 614 577 * may block. 615 578 */ 616 579 truncate_inode_pages_range(inode->i_mapping, pos, 617 - (pos+len) | (PAGE_CACHE_SIZE-1)); 618 - } else { 580 + (pos+len) | (PAGE_CACHE_SIZE-1)); 581 + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, 582 + false, false); 583 + 584 + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 585 + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 586 + 587 + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 588 + if (!ret) 589 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 590 + 591 + ceph_put_page_vector(pages, num_pages, false); 592 + 593 + out: 594 + ceph_osdc_put_request(req); 595 + if (ret == 0) { 596 + pos += len; 597 + written += len; 598 + iov_iter_advance(&i, (size_t)len); 599 + 600 + if (pos > i_size_read(inode)) { 601 + check_caps = ceph_inode_set_size(inode, pos); 602 + if (check_caps) 603 + ceph_check_caps(ceph_inode(inode), 604 + CHECK_CAPS_AUTHONLY, 605 + NULL); 606 + } 607 + } else 608 + break; 609 + } 610 + 611 + if (ret != -EOLDSNAPC && written > 0) { 612 + iocb->ki_pos = pos; 613 + ret = written; 614 + } 615 + return ret; 616 + } 617 + 618 + 619 + /* 620 + * Synchronous write, straight from __user pointer or user pages. 
621 + * 622 + * If write spans object boundary, just do multiple writes. (For a 623 + * correct atomic write, we should e.g. take write locks on all 624 + * objects, rollback on failure, etc.) 625 + */ 626 + static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, 627 + unsigned long nr_segs, size_t count) 628 + { 629 + struct file *file = iocb->ki_filp; 630 + struct inode *inode = file_inode(file); 631 + struct ceph_inode_info *ci = ceph_inode(inode); 632 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 633 + struct ceph_snap_context *snapc; 634 + struct ceph_vino vino; 635 + struct ceph_osd_request *req; 636 + struct page **pages; 637 + u64 len; 638 + int num_pages; 639 + int written = 0; 640 + int flags; 641 + int check_caps = 0; 642 + int ret; 643 + struct timespec mtime = CURRENT_TIME; 644 + loff_t pos = iocb->ki_pos; 645 + struct iov_iter i; 646 + 647 + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 648 + return -EROFS; 649 + 650 + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); 651 + 652 + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 653 + if (ret < 0) 654 + return ret; 655 + 656 + ret = invalidate_inode_pages2_range(inode->i_mapping, 657 + pos >> PAGE_CACHE_SHIFT, 658 + (pos + count) >> PAGE_CACHE_SHIFT); 659 + if (ret < 0) 660 + dout("invalidate_inode_pages2_range returned %d\n", ret); 661 + 662 + flags = CEPH_OSD_FLAG_ORDERSNAP | 663 + CEPH_OSD_FLAG_ONDISK | 664 + CEPH_OSD_FLAG_WRITE | 665 + CEPH_OSD_FLAG_ACK; 666 + 667 + iov_iter_init(&i, iov, nr_segs, count, 0); 668 + 669 + while ((len = iov_iter_count(&i)) > 0) { 670 + size_t left; 671 + int n; 672 + 673 + snapc = ci->i_snap_realm->cached_context; 674 + vino = ceph_vino(inode); 675 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 676 + vino, pos, &len, 1, 677 + CEPH_OSD_OP_WRITE, flags, snapc, 678 + ci->i_truncate_seq, 679 + ci->i_truncate_size, 680 + false); 681 + if (IS_ERR(req)) { 682 + ret = PTR_ERR(req); 
683 + goto out; 684 + } 685 + 686 + /* 687 + * write from beginning of first page, 688 + * regardless of io alignment 689 + */ 690 + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 691 + 619 692 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 620 693 if (IS_ERR(pages)) { 621 694 ret = PTR_ERR(pages); 622 695 goto out; 623 696 } 624 - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 697 + 698 + left = len; 699 + for (n = 0; n < num_pages; n++) { 700 + size_t plen = min_t(size_t, left, PAGE_SIZE); 701 + ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); 702 + if (ret != plen) { 703 + ret = -EFAULT; 704 + break; 705 + } 706 + left -= ret; 707 + iov_iter_advance(&i, ret); 708 + } 709 + 625 710 if (ret < 0) { 626 711 ceph_release_page_vector(pages, num_pages); 627 712 goto out; 628 713 } 629 714 630 - if ((file->f_flags & O_SYNC) == 0) { 631 - /* get a second commit callback */ 632 - req->r_unsafe_callback = ceph_sync_write_unsafe; 633 - req->r_inode = inode; 634 - own_pages = true; 635 - } 636 - } 637 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, 638 - false, own_pages); 715 + /* get a second commit callback */ 716 + req->r_unsafe_callback = ceph_sync_write_unsafe; 717 + req->r_inode = inode; 639 718 640 - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 641 - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 719 + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 720 + false, true); 642 721 643 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 644 - if (!ret) 645 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 722 + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 723 + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 646 724 647 - if (file->f_flags & O_DIRECT) 648 - ceph_put_page_vector(pages, num_pages, false); 649 - else if (file->f_flags & O_SYNC) 650 - ceph_release_page_vector(pages, num_pages); 725 + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 726 + if 
(!ret) 727 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 651 728 652 729 out: 653 - ceph_osdc_put_request(req); 654 - if (ret == 0) { 655 - pos += len; 656 - written += len; 657 - left -= len; 658 - data += len; 659 - if (left) 660 - goto more; 730 + ceph_osdc_put_request(req); 731 + if (ret == 0) { 732 + pos += len; 733 + written += len; 661 734 735 + if (pos > i_size_read(inode)) { 736 + check_caps = ceph_inode_set_size(inode, pos); 737 + if (check_caps) 738 + ceph_check_caps(ceph_inode(inode), 739 + CHECK_CAPS_AUTHONLY, 740 + NULL); 741 + } 742 + } else 743 + break; 744 + } 745 + 746 + if (ret != -EOLDSNAPC && written > 0) { 662 747 ret = written; 663 - *ppos = pos; 664 - if (pos > i_size_read(inode)) 665 - check_caps = ceph_inode_set_size(inode, pos); 666 - if (check_caps) 667 - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, 668 - NULL); 669 - } else if (ret != -EOLDSNAPC && written > 0) { 670 - ret = written; 748 + iocb->ki_pos = pos; 671 749 } 672 750 return ret; 673 751 } ··· 799 647 { 800 648 struct file *filp = iocb->ki_filp; 801 649 struct ceph_file_info *fi = filp->private_data; 802 - loff_t *ppos = &iocb->ki_pos; 803 - size_t len = iov->iov_len; 650 + size_t len = iocb->ki_nbytes; 804 651 struct inode *inode = file_inode(filp); 805 652 struct ceph_inode_info *ci = ceph_inode(inode); 806 - void __user *base = iov->iov_base; 807 653 ssize_t ret; 808 654 int want, got = 0; 809 655 int checkeof = 0, read = 0; 810 656 811 - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 812 - inode, ceph_vinop(inode), pos, (unsigned)len, inode); 813 657 again: 658 + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 659 + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); 660 + 814 661 if (fi->fmode & CEPH_FILE_MODE_LAZY) 815 662 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 816 663 else 817 664 want = CEPH_CAP_FILE_CACHE; 818 665 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 819 666 if (ret < 
0) 820 - goto out; 821 - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 822 - inode, ceph_vinop(inode), pos, (unsigned)len, 823 - ceph_cap_string(got)); 667 + return ret; 824 668 825 669 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 826 670 (iocb->ki_filp->f_flags & O_DIRECT) || 827 - (fi->flags & CEPH_F_SYNC)) 828 - /* hmm, this isn't really async... */ 829 - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 830 - else 831 - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 671 + (fi->flags & CEPH_F_SYNC)) { 672 + struct iov_iter i; 832 673 674 + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 675 + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 676 + ceph_cap_string(got)); 677 + 678 + if (!read) { 679 + ret = generic_segment_checks(iov, &nr_segs, 680 + &len, VERIFY_WRITE); 681 + if (ret) 682 + goto out; 683 + } 684 + 685 + iov_iter_init(&i, iov, nr_segs, len, read); 686 + 687 + /* hmm, this isn't really async... */ 688 + ret = ceph_sync_read(iocb, &i, &checkeof); 689 + } else { 690 + /* 691 + * We can't modify the content of iov, 692 + * so we only read from beginning. 693 + */ 694 + if (read) { 695 + iocb->ki_pos = pos; 696 + len = iocb->ki_nbytes; 697 + read = 0; 698 + } 699 + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 700 + inode, ceph_vinop(inode), pos, (unsigned)len, 701 + ceph_cap_string(got)); 702 + 703 + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 704 + } 833 705 out: 834 706 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 835 707 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 836 708 ceph_put_cap_refs(ci, got); 837 709 838 710 if (checkeof && ret >= 0) { 839 - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 711 + int statret = ceph_do_getattr(inode, 712 + CEPH_STAT_CAP_SIZE); 840 713 841 714 /* hit EOF or hole? 
*/ 842 - if (statret == 0 && *ppos < inode->i_size) { 843 - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 715 + if (statret == 0 && iocb->ki_pos < inode->i_size && 716 + ret < len) { 717 + dout("sync_read hit hole, ppos %lld < size %lld" 718 + ", reading more\n", iocb->ki_pos, 719 + inode->i_size); 720 + 844 721 read += ret; 845 - base += ret; 846 722 len -= ret; 847 723 checkeof = 0; 848 724 goto again; 849 725 } 850 726 } 727 + 851 728 if (ret >= 0) 852 729 ret += read; 853 730 ··· 953 772 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 954 773 955 774 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 956 - (iocb->ki_filp->f_flags & O_DIRECT) || 957 - (fi->flags & CEPH_F_SYNC)) { 775 + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { 958 776 mutex_unlock(&inode->i_mutex); 959 - written = ceph_sync_write(file, iov->iov_base, count, 960 - pos, &iocb->ki_pos); 777 + if (file->f_flags & O_DIRECT) 778 + written = ceph_sync_direct_write(iocb, iov, 779 + nr_segs, count); 780 + else 781 + written = ceph_sync_write(iocb, iov, nr_segs, count); 961 782 if (written == -EOLDSNAPC) { 962 783 dout("aio_write %p %llx.%llx %llu~%u" 963 784 "got EOLDSNAPC, retrying\n", ··· 1201 1018 loff_t offset, loff_t length) 1202 1019 { 1203 1020 struct ceph_file_info *fi = file->private_data; 1204 - struct inode *inode = file->f_dentry->d_inode; 1021 + struct inode *inode = file_inode(file); 1205 1022 struct ceph_inode_info *ci = ceph_inode(inode); 1206 1023 struct ceph_osd_client *osdc = 1207 1024 &ceph_inode_to_client(inode)->client->osdc;
+27 -6
fs/ceph/inode.c
··· 95 95 .getxattr = ceph_getxattr, 96 96 .listxattr = ceph_listxattr, 97 97 .removexattr = ceph_removexattr, 98 + .get_acl = ceph_get_acl, 98 99 }; 99 100 100 101 ··· 336 335 ci->i_hold_caps_min = 0; 337 336 ci->i_hold_caps_max = 0; 338 337 INIT_LIST_HEAD(&ci->i_cap_delay_list); 339 - ci->i_cap_exporting_mds = 0; 340 - ci->i_cap_exporting_mseq = 0; 341 - ci->i_cap_exporting_issued = 0; 342 338 INIT_LIST_HEAD(&ci->i_cap_snaps); 343 339 ci->i_head_snapc = NULL; 344 340 ci->i_snap_caps = 0; 341 + ci->i_cap_exporting_issued = 0; 345 342 346 343 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 347 344 ci->i_nr_by_mode[i] = 0; ··· 433 434 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 434 435 435 436 call_rcu(&inode->i_rcu, ceph_i_callback); 437 + } 438 + 439 + int ceph_drop_inode(struct inode *inode) 440 + { 441 + /* 442 + * Positve dentry and corresponding inode are always accompanied 443 + * in MDS reply. So no need to keep inode in the cache after 444 + * dropping all its aliases. 445 + */ 446 + return 1; 436 447 } 437 448 438 449 /* ··· 679 670 memcpy(ci->i_xattrs.blob->vec.iov_base, 680 671 iinfo->xattr_data, iinfo->xattr_len); 681 672 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 673 + ceph_forget_all_cached_acls(inode); 682 674 xattr_blob = NULL; 683 675 } 684 676 ··· 1464 1454 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1465 1455 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1466 1456 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1467 - /* nevermind! 
*/ 1457 + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1458 + check = 1; 1468 1459 spin_unlock(&ci->i_ceph_lock); 1469 1460 mutex_unlock(&ci->i_truncate_mutex); 1470 1461 goto out; ··· 1486 1475 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", 1487 1476 inode, orig_gen, ci->i_rdcache_gen, 1488 1477 ci->i_rdcache_revoking); 1478 + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1479 + check = 1; 1489 1480 } 1490 1481 spin_unlock(&ci->i_ceph_lock); 1491 1482 mutex_unlock(&ci->i_truncate_mutex); 1492 - 1483 + out: 1493 1484 if (check) 1494 1485 ceph_check_caps(ci, 0, NULL); 1495 - out: 1496 1486 iput(inode); 1497 1487 } 1498 1488 ··· 1614 1602 .getxattr = ceph_getxattr, 1615 1603 .listxattr = ceph_listxattr, 1616 1604 .removexattr = ceph_removexattr, 1605 + .get_acl = ceph_get_acl, 1617 1606 }; 1618 1607 1619 1608 /* ··· 1688 1675 dirtied |= CEPH_CAP_AUTH_EXCL; 1689 1676 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 1690 1677 attr->ia_mode != inode->i_mode) { 1678 + inode->i_mode = attr->ia_mode; 1691 1679 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 1692 1680 mask |= CEPH_SETATTR_MODE; 1693 1681 release |= CEPH_CAP_AUTH_SHARED; ··· 1804 1790 if (inode_dirty_flags) 1805 1791 __mark_inode_dirty(inode, inode_dirty_flags); 1806 1792 1793 + if (ia_valid & ATTR_MODE) { 1794 + err = ceph_acl_chmod(dentry, inode); 1795 + if (err) 1796 + goto out_put; 1797 + } 1798 + 1807 1799 if (mask) { 1808 1800 req->r_inode = inode; 1809 1801 ihold(inode); ··· 1829 1809 return err; 1830 1810 out: 1831 1811 spin_unlock(&ci->i_ceph_lock); 1812 + out_put: 1832 1813 ceph_mdsc_put_request(req); 1833 1814 return err; 1834 1815 }
+6 -2
fs/ceph/ioctl.c
··· 183 183 struct ceph_inode_info *ci = ceph_inode(inode); 184 184 struct ceph_osd_client *osdc = 185 185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 186 + struct ceph_object_locator oloc; 187 + struct ceph_object_id oid; 186 188 u64 len = 1, olen; 187 189 u64 tmp; 188 190 struct ceph_pg pgid; ··· 213 211 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 214 212 ceph_ino(inode), dl.object_no); 215 213 216 - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, 217 - ceph_file_layout_pg_pool(ci->i_layout)); 214 + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 + ceph_oid_set_name(&oid, dl.object_name); 216 + 217 + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 218 if (r < 0) { 219 219 up_read(&osdc->map_sem); 220 220 return r;
+84 -48
fs/ceph/mds_client.c
··· 63 63 */ 64 64 static int parse_reply_info_in(void **p, void *end, 65 65 struct ceph_mds_reply_info_in *info, 66 - int features) 66 + u64 features) 67 67 { 68 68 int err = -EIO; 69 69 ··· 98 98 */ 99 99 static int parse_reply_info_trace(void **p, void *end, 100 100 struct ceph_mds_reply_info_parsed *info, 101 - int features) 101 + u64 features) 102 102 { 103 103 int err; 104 104 ··· 145 145 */ 146 146 static int parse_reply_info_dir(void **p, void *end, 147 147 struct ceph_mds_reply_info_parsed *info, 148 - int features) 148 + u64 features) 149 149 { 150 150 u32 num, i = 0; 151 151 int err; ··· 217 217 */ 218 218 static int parse_reply_info_filelock(void **p, void *end, 219 219 struct ceph_mds_reply_info_parsed *info, 220 - int features) 220 + u64 features) 221 221 { 222 222 if (*p + sizeof(*info->filelock_reply) > end) 223 223 goto bad; ··· 238 238 */ 239 239 static int parse_reply_info_create(void **p, void *end, 240 240 struct ceph_mds_reply_info_parsed *info, 241 - int features) 241 + u64 features) 242 242 { 243 243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { 244 244 if (*p == end) { ··· 262 262 */ 263 263 static int parse_reply_info_extra(void **p, void *end, 264 264 struct ceph_mds_reply_info_parsed *info, 265 - int features) 265 + u64 features) 266 266 { 267 267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 268 268 return parse_reply_info_filelock(p, end, info, features); ··· 280 280 */ 281 281 static int parse_reply_info(struct ceph_msg *msg, 282 282 struct ceph_mds_reply_info_parsed *info, 283 - int features) 283 + u64 features) 284 284 { 285 285 void *p, *end; 286 286 u32 len; ··· 713 713 struct dentry *dn = get_nonsnap_parent(parent); 714 714 inode = dn->d_inode; 715 715 dout("__choose_mds using nonsnap parent %p\n", inode); 716 - } else if (req->r_dentry->d_inode) { 716 + } else { 717 717 /* dentry target */ 718 718 inode = req->r_dentry->d_inode; 719 - } else { 720 - /* dir + name */ 721 - inode = dir; 722 - hash = ceph_dentry_hash(dir, 
req->r_dentry); 723 - is_hash = true; 719 + if (!inode || mode == USE_AUTH_MDS) { 720 + /* dir + name */ 721 + inode = dir; 722 + hash = ceph_dentry_hash(dir, req->r_dentry); 723 + is_hash = true; 724 + } 724 725 } 725 726 } 726 727 ··· 847 846 * 848 847 * called under mdsc->mutex 849 848 */ 849 + static struct ceph_mds_session * 850 + __open_export_target_session(struct ceph_mds_client *mdsc, int target) 851 + { 852 + struct ceph_mds_session *session; 853 + 854 + session = __ceph_lookup_mds_session(mdsc, target); 855 + if (!session) { 856 + session = register_session(mdsc, target); 857 + if (IS_ERR(session)) 858 + return session; 859 + } 860 + if (session->s_state == CEPH_MDS_SESSION_NEW || 861 + session->s_state == CEPH_MDS_SESSION_CLOSING) 862 + __open_session(mdsc, session); 863 + 864 + return session; 865 + } 866 + 867 + struct ceph_mds_session * 868 + ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) 869 + { 870 + struct ceph_mds_session *session; 871 + 872 + dout("open_export_target_session to mds%d\n", target); 873 + 874 + mutex_lock(&mdsc->mutex); 875 + session = __open_export_target_session(mdsc, target); 876 + mutex_unlock(&mdsc->mutex); 877 + 878 + return session; 879 + } 880 + 850 881 static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 851 882 struct ceph_mds_session *session) 852 883 { 853 884 struct ceph_mds_info *mi; 854 885 struct ceph_mds_session *ts; 855 886 int i, mds = session->s_mds; 856 - int target; 857 887 858 888 if (mds >= mdsc->mdsmap->m_max_mds) 859 889 return; 890 + 860 891 mi = &mdsc->mdsmap->m_info[mds]; 861 892 dout("open_export_target_sessions for mds%d (%d targets)\n", 862 893 session->s_mds, mi->num_export_targets); 863 894 864 895 for (i = 0; i < mi->num_export_targets; i++) { 865 - target = mi->export_targets[i]; 866 - ts = __ceph_lookup_mds_session(mdsc, target); 867 - if (!ts) { 868 - ts = register_session(mdsc, target); 869 - if (IS_ERR(ts)) 870 - return; 871 - } 872 - if 
(session->s_state == CEPH_MDS_SESSION_NEW || 873 - session->s_state == CEPH_MDS_SESSION_CLOSING) 874 - __open_session(mdsc, session); 875 - else 876 - dout(" mds%d target mds%d %p is %s\n", session->s_mds, 877 - i, ts, session_state_name(ts->s_state)); 878 - ceph_put_mds_session(ts); 896 + ts = __open_export_target_session(mdsc, mi->export_targets[i]); 897 + if (!IS_ERR(ts)) 898 + ceph_put_mds_session(ts); 879 899 } 880 900 } 881 901 ··· 1158 1136 return 0; 1159 1137 } 1160 1138 1139 + static int send_flushmsg_ack(struct ceph_mds_client *mdsc, 1140 + struct ceph_mds_session *session, u64 seq) 1141 + { 1142 + struct ceph_msg *msg; 1143 + 1144 + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", 1145 + session->s_mds, session_state_name(session->s_state), seq); 1146 + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); 1147 + if (!msg) 1148 + return -ENOMEM; 1149 + ceph_con_send(&session->s_con, msg); 1150 + return 0; 1151 + } 1152 + 1153 + 1161 1154 /* 1162 1155 * Note new cap ttl, and any transition from stale -> not stale (fresh?). 
1163 1156 * ··· 1251 1214 { 1252 1215 struct ceph_mds_session *session = arg; 1253 1216 struct ceph_inode_info *ci = ceph_inode(inode); 1254 - int used, oissued, mine; 1217 + int used, wanted, oissued, mine; 1255 1218 1256 1219 if (session->s_trim_caps <= 0) 1257 1220 return -1; ··· 1259 1222 spin_lock(&ci->i_ceph_lock); 1260 1223 mine = cap->issued | cap->implemented; 1261 1224 used = __ceph_caps_used(ci); 1225 + wanted = __ceph_caps_file_wanted(ci); 1262 1226 oissued = __ceph_caps_issued_other(ci, cap); 1263 1227 1264 - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", 1228 + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", 1265 1229 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), 1266 - ceph_cap_string(used)); 1267 - if (ci->i_dirty_caps) 1268 - goto out; /* dirty caps */ 1269 - if ((used & ~oissued) & mine) 1230 + ceph_cap_string(used), ceph_cap_string(wanted)); 1231 + if (cap == ci->i_auth_cap) { 1232 + if (ci->i_dirty_caps | ci->i_flushing_caps) 1233 + goto out; 1234 + if ((used | wanted) & CEPH_CAP_ANY_WR) 1235 + goto out; 1236 + } 1237 + if ((used | wanted) & ~oissued & mine) 1270 1238 goto out; /* we need these caps */ 1271 1239 1272 1240 session->s_trim_caps--; ··· 2198 2156 */ 2199 2157 if (result == -ESTALE) { 2200 2158 dout("got ESTALE on request %llu", req->r_tid); 2201 - if (!req->r_inode) { 2202 - /* do nothing; not an authority problem */ 2203 - } else if (req->r_direct_mode != USE_AUTH_MDS) { 2159 + if (req->r_direct_mode != USE_AUTH_MDS) { 2204 2160 dout("not using auth, setting for that now"); 2205 2161 req->r_direct_mode = USE_AUTH_MDS; 2206 2162 __do_request(mdsc, req); 2207 2163 mutex_unlock(&mdsc->mutex); 2208 2164 goto out; 2209 2165 } else { 2210 - struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2211 - struct ceph_cap *cap = NULL; 2212 - 2213 - if (req->r_session) 2214 - cap = ceph_get_cap_for_mds(ci, 2215 - req->r_session->s_mds); 2216 - 2217 - dout("already using auth"); 2218 - if ((!cap 
|| cap != ci->i_auth_cap) || 2219 - (cap->mseq != req->r_sent_on_mseq)) { 2220 - dout("but cap changed, so resending"); 2166 + int mds = __choose_mds(mdsc, req); 2167 + if (mds >= 0 && mds != req->r_session->s_mds) { 2168 + dout("but auth changed, so resending"); 2221 2169 __do_request(mdsc, req); 2222 2170 mutex_unlock(&mdsc->mutex); 2223 2171 goto out; ··· 2430 2398 2431 2399 case CEPH_SESSION_RECALL_STATE: 2432 2400 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2401 + break; 2402 + 2403 + case CEPH_SESSION_FLUSHMSG: 2404 + send_flushmsg_ack(mdsc, session, seq); 2433 2405 break; 2434 2406 2435 2407 default:
+2
fs/ceph/mds_client.h
··· 383 383 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 384 384 struct ceph_msg *msg); 385 385 386 + extern struct ceph_mds_session * 387 + ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); 386 388 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 387 389 struct ceph_mds_session *session); 388 390
+2
fs/ceph/strings.c
··· 41 41 case CEPH_SESSION_RENEWCAPS: return "renewcaps"; 42 42 case CEPH_SESSION_STALE: return "stale"; 43 43 case CEPH_SESSION_RECALL_STATE: return "recall_state"; 44 + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; 45 + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; 44 46 } 45 47 return "???"; 46 48 }
+7 -2
fs/ceph/super.c
··· 490 490 struct ceph_options *opt) 491 491 { 492 492 struct ceph_fs_client *fsc; 493 - const unsigned supported_features = 493 + const u64 supported_features = 494 494 CEPH_FEATURE_FLOCK | 495 495 CEPH_FEATURE_DIRLAYOUTHASH; 496 - const unsigned required_features = 0; 496 + const u64 required_features = 0; 497 497 int page_count; 498 498 size_t size; 499 499 int err = -ENOMEM; ··· 686 686 .alloc_inode = ceph_alloc_inode, 687 687 .destroy_inode = ceph_destroy_inode, 688 688 .write_inode = ceph_write_inode, 689 + .drop_inode = ceph_drop_inode, 689 690 .sync_fs = ceph_sync_fs, 690 691 .put_super = ceph_put_super, 691 692 .show_options = ceph_show_options, ··· 819 818 820 819 s->s_flags = fsc->mount_options->sb_flags; 821 820 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 821 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 822 + s->s_flags |= MS_POSIXACL; 823 + #endif 822 824 825 + s->s_xattr = ceph_xattr_handlers; 823 826 s->s_fs_info = fsc; 824 827 fsc->sb = s; 825 828
+41 -4
fs/ceph/super.h
··· 287 287 unsigned long i_hold_caps_min; /* jiffies */ 288 288 unsigned long i_hold_caps_max; /* jiffies */ 289 289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */ 290 - int i_cap_exporting_mds; /* to handle cap migration between */ 291 - unsigned i_cap_exporting_mseq; /* mds's. */ 292 - unsigned i_cap_exporting_issued; 293 290 struct ceph_cap_reservation i_cap_migration_resv; 294 291 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 295 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 296 293 dirty|flushing caps */ 297 294 unsigned i_snap_caps; /* cap bits for snapped files */ 295 + unsigned i_cap_exporting_issued; 298 296 299 297 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 300 298 ··· 333 335 u32 i_fscache_gen; /* sequence, for delayed fscache validate */ 334 336 struct work_struct i_revalidate_work; 335 337 #endif 336 - 337 338 struct inode vfs_inode; /* at end */ 338 339 }; 339 340 ··· 526 529 } 527 530 extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); 528 531 532 + extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, 533 + struct ceph_cap *ocap, int mask); 529 534 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); 530 535 extern int __ceph_caps_used(struct ceph_inode_info *ci); 531 536 ··· 690 691 691 692 extern struct inode *ceph_alloc_inode(struct super_block *sb); 692 693 extern void ceph_destroy_inode(struct inode *inode); 694 + extern int ceph_drop_inode(struct inode *inode); 693 695 694 696 extern struct inode *ceph_get_inode(struct super_block *sb, 695 697 struct ceph_vino vino); ··· 724 724 /* xattr.c */ 725 725 extern int ceph_setxattr(struct dentry *, const char *, const void *, 726 726 size_t, int); 727 + int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); 728 + ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); 729 + int __ceph_removexattr(struct dentry *, 
const char *); 727 730 extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); 728 731 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 729 732 extern int ceph_removexattr(struct dentry *, const char *); ··· 734 731 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); 735 732 extern void __init ceph_xattr_init(void); 736 733 extern void ceph_xattr_exit(void); 734 + 735 + /* acl.c */ 736 + extern const struct xattr_handler ceph_xattr_acl_access_handler; 737 + extern const struct xattr_handler ceph_xattr_acl_default_handler; 738 + extern const struct xattr_handler *ceph_xattr_handlers[]; 739 + 740 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 741 + 742 + struct posix_acl *ceph_get_acl(struct inode *, int); 743 + int ceph_init_acl(struct dentry *, struct inode *, struct inode *); 744 + int ceph_acl_chmod(struct dentry *, struct inode *); 745 + void ceph_forget_all_cached_acls(struct inode *inode); 746 + 747 + #else 748 + 749 + #define ceph_get_acl NULL 750 + 751 + static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, 752 + struct inode *dir) 753 + { 754 + return 0; 755 + } 756 + 757 + static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 758 + { 759 + return 0; 760 + } 761 + 762 + static inline void ceph_forget_all_cached_acls(struct inode *inode) 763 + { 764 + } 765 + 766 + #endif 737 767 738 768 /* caps.c */ 739 769 extern const char *ceph_cap_string(int c); ··· 780 744 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 781 745 extern void ceph_put_cap(struct ceph_mds_client *mdsc, 782 746 struct ceph_cap *cap); 747 + extern int ceph_is_any_caps(struct inode *inode); 783 748 784 749 extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, 785 750 u64 cap_id, u32 migrate_seq, u32 issue_seq);
+48 -12
fs/ceph/xattr.c
··· 11 11 #define XATTR_CEPH_PREFIX "ceph." 12 12 #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) 13 13 14 + /* 15 + * List of handlers for synthetic system.* attributes. Other 16 + * attributes are handled directly. 17 + */ 18 + const struct xattr_handler *ceph_xattr_handlers[] = { 19 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 20 + &ceph_xattr_acl_access_handler, 21 + &ceph_xattr_acl_default_handler, 22 + #endif 23 + NULL, 24 + }; 25 + 14 26 static bool ceph_is_valid_xattr(const char *name) 15 27 { 16 28 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 17 29 !strncmp(name, XATTR_SECURITY_PREFIX, 18 30 XATTR_SECURITY_PREFIX_LEN) || 31 + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 19 32 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 20 33 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 21 34 } ··· 676 663 } 677 664 } 678 665 679 - ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 666 + ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 680 667 size_t size) 681 668 { 682 - struct inode *inode = dentry->d_inode; 683 669 struct ceph_inode_info *ci = ceph_inode(inode); 684 670 int err; 685 671 struct ceph_inode_xattr *xattr; ··· 686 674 687 675 if (!ceph_is_valid_xattr(name)) 688 676 return -ENODATA; 689 - 690 677 691 678 /* let's see if a virtual xattr was requested */ 692 679 vxattr = ceph_match_vxattr(inode, name); ··· 734 723 out: 735 724 spin_unlock(&ci->i_ceph_lock); 736 725 return err; 726 + } 727 + 728 + ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 729 + size_t size) 730 + { 731 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 732 + return generic_getxattr(dentry, name, value, size); 733 + 734 + return __ceph_getxattr(dentry->d_inode, name, value, size); 737 735 } 738 736 739 737 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ··· 883 863 return err; 884 864 } 885 865 
886 - int ceph_setxattr(struct dentry *dentry, const char *name, 887 - const void *value, size_t size, int flags) 866 + int __ceph_setxattr(struct dentry *dentry, const char *name, 867 + const void *value, size_t size, int flags) 888 868 { 889 869 struct inode *inode = dentry->d_inode; 890 870 struct ceph_vxattr *vxattr; ··· 898 878 char *newval = NULL; 899 879 struct ceph_inode_xattr *xattr = NULL; 900 880 int required_blob_size; 901 - 902 - if (ceph_snap(inode) != CEPH_NOSNAP) 903 - return -EROFS; 904 881 905 882 if (!ceph_is_valid_xattr(name)) 906 883 return -EOPNOTSUPP; ··· 975 958 return err; 976 959 } 977 960 961 + int ceph_setxattr(struct dentry *dentry, const char *name, 962 + const void *value, size_t size, int flags) 963 + { 964 + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 965 + return -EROFS; 966 + 967 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 968 + return generic_setxattr(dentry, name, value, size, flags); 969 + 970 + return __ceph_setxattr(dentry, name, value, size, flags); 971 + } 972 + 978 973 static int ceph_send_removexattr(struct dentry *dentry, const char *name) 979 974 { 980 975 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); ··· 1013 984 return err; 1014 985 } 1015 986 1016 - int ceph_removexattr(struct dentry *dentry, const char *name) 987 + int __ceph_removexattr(struct dentry *dentry, const char *name) 1017 988 { 1018 989 struct inode *inode = dentry->d_inode; 1019 990 struct ceph_vxattr *vxattr; ··· 1022 993 int err; 1023 994 int required_blob_size; 1024 995 int dirty; 1025 - 1026 - if (ceph_snap(inode) != CEPH_NOSNAP) 1027 - return -EROFS; 1028 996 1029 997 if (!ceph_is_valid_xattr(name)) 1030 998 return -EOPNOTSUPP; ··· 1079 1053 return err; 1080 1054 } 1081 1055 1056 + int ceph_removexattr(struct dentry *dentry, const char *name) 1057 + { 1058 + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 1059 + return -EROFS; 1060 + 1061 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 
1062 + return generic_removexattr(dentry, name); 1063 + 1064 + return __ceph_removexattr(dentry, name); 1065 + }
-1
include/linux/ceph/buffer.h
··· 17 17 struct kref kref; 18 18 struct kvec vec; 19 19 size_t alloc_len; 20 - bool is_vmalloc; 21 20 }; 22 21 23 22 extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+68 -33
include/linux/ceph/ceph_features.h
··· 4 4 /* 5 5 * feature bits 6 6 */ 7 - #define CEPH_FEATURE_UID (1<<0) 8 - #define CEPH_FEATURE_NOSRCADDR (1<<1) 9 - #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) 10 - #define CEPH_FEATURE_FLOCK (1<<3) 11 - #define CEPH_FEATURE_SUBSCRIBE2 (1<<4) 12 - #define CEPH_FEATURE_MONNAMES (1<<5) 13 - #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 14 - #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 15 - #define CEPH_FEATURE_OBJECTLOCATOR (1<<8) 16 - #define CEPH_FEATURE_PGID64 (1<<9) 17 - #define CEPH_FEATURE_INCSUBOSDMAP (1<<10) 18 - #define CEPH_FEATURE_PGPOOL3 (1<<11) 19 - #define CEPH_FEATURE_OSDREPLYMUX (1<<12) 20 - #define CEPH_FEATURE_OSDENC (1<<13) 21 - #define CEPH_FEATURE_OMAP (1<<14) 22 - #define CEPH_FEATURE_MONENC (1<<15) 23 - #define CEPH_FEATURE_QUERY_T (1<<16) 24 - #define CEPH_FEATURE_INDEP_PG_MAP (1<<17) 25 - #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) 26 - #define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) 27 - #define CEPH_FEATURE_MON_NULLROUTE (1<<20) 28 - #define CEPH_FEATURE_MON_GV (1<<21) 29 - #define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) 30 - #define CEPH_FEATURE_MSG_AUTH (1<<23) 31 - #define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) 32 - #define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) 33 - #define CEPH_FEATURE_CREATEPOOLID (1<<26) 34 - #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) 35 - #define CEPH_FEATURE_OSD_HBMSGS (1<<28) 36 - #define CEPH_FEATURE_MDSENC (1<<29) 37 - #define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) 7 + #define CEPH_FEATURE_UID (1ULL<<0) 8 + #define CEPH_FEATURE_NOSRCADDR (1ULL<<1) 9 + #define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) 10 + #define CEPH_FEATURE_FLOCK (1ULL<<3) 11 + #define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) 12 + #define CEPH_FEATURE_MONNAMES (1ULL<<5) 13 + #define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) 14 + #define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) 15 + #define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) 16 + #define CEPH_FEATURE_PGID64 (1ULL<<9) 17 + #define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) 18 + #define CEPH_FEATURE_PGPOOL3 (1ULL<<11) 19 + 
#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) 20 + #define CEPH_FEATURE_OSDENC (1ULL<<13) 21 + #define CEPH_FEATURE_OMAP (1ULL<<14) 22 + #define CEPH_FEATURE_MONENC (1ULL<<15) 23 + #define CEPH_FEATURE_QUERY_T (1ULL<<16) 24 + #define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) 25 + #define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) 26 + #define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) 27 + #define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) 28 + #define CEPH_FEATURE_MON_GV (1ULL<<21) 29 + #define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) 30 + #define CEPH_FEATURE_MSG_AUTH (1ULL<<23) 31 + #define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) 32 + #define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) 33 + #define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) 34 + #define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) 35 + #define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) 36 + #define CEPH_FEATURE_MDSENC (1ULL<<29) 37 + #define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) 38 + #define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) 39 + #define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) 40 + #define CEPH_FEATURE_MON_SCRUB (1ULL<<33) 41 + #define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) 42 + #define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) 43 + #define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 44 + #define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 45 + #define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 46 + 47 + /* 48 + * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 49 + * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 50 + * to mean 33 bit ~0, and introduce a helper below to do the 51 + * translation. 
52 + * 53 + * This was introduced by ceph.git commit 54 + * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 55 + * and fixed by ceph.git commit 56 + * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c 57 + */ 58 + #define CEPH_FEATURE_RESERVED (1ULL<<63) 59 + 60 + static inline u64 ceph_sanitize_features(u64 features) 61 + { 62 + if (features & CEPH_FEATURE_RESERVED) { 63 + /* everything through OSD_SNAPMAPPER */ 64 + return 0x1ffffffffull; 65 + } else { 66 + return features; 67 + } 68 + } 38 69 39 70 /* 40 71 * Features supported. 41 72 */ 42 - #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 73 + #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 43 74 (CEPH_FEATURE_NOSRCADDR | \ 44 75 CEPH_FEATURE_RECONNECT_SEQ | \ 45 76 CEPH_FEATURE_PGID64 | \ ··· 79 48 CEPH_FEATURE_CRUSH_TUNABLES | \ 80 49 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 81 50 CEPH_FEATURE_REPLY_CREATE_INODE | \ 82 - CEPH_FEATURE_OSDHASHPSPOOL) 51 + CEPH_FEATURE_OSDHASHPSPOOL | \ 52 + CEPH_FEATURE_OSD_CACHEPOOL | \ 53 + CEPH_FEATURE_CRUSH_V2 | \ 54 + CEPH_FEATURE_EXPORT_PEER) 83 55 84 56 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 85 57 (CEPH_FEATURE_NOSRCADDR | \ ··· 90 56 CEPH_FEATURE_PGID64 | \ 91 57 CEPH_FEATURE_PGPOOL3 | \ 92 58 CEPH_FEATURE_OSDENC) 59 + 93 60 #endif
+35 -1
include/linux/ceph/ceph_fs.h
··· 53 53 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ 54 54 } __attribute__ ((packed)); 55 55 56 + #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) 57 + #define ceph_file_layout_stripe_count(l) \ 58 + ((__s32)le32_to_cpu((l).fl_stripe_count)) 59 + #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) 60 + #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) 61 + #define ceph_file_layout_object_su(l) \ 62 + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) 63 + #define ceph_file_layout_pg_pool(l) \ 64 + ((__s32)le32_to_cpu((l).fl_pg_pool)) 65 + 66 + static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) 67 + { 68 + return le32_to_cpu(l->fl_stripe_unit) * 69 + le32_to_cpu(l->fl_stripe_count); 70 + } 71 + 72 + /* "period" == bytes before i start on a new set of objects */ 73 + static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) 74 + { 75 + return le32_to_cpu(l->fl_object_size) * 76 + le32_to_cpu(l->fl_stripe_count); 77 + } 78 + 56 79 #define CEPH_MIN_STRIPE_UNIT 65536 57 80 58 81 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); ··· 305 282 CEPH_SESSION_RENEWCAPS, 306 283 CEPH_SESSION_STALE, 307 284 CEPH_SESSION_RECALL_STATE, 285 + CEPH_SESSION_FLUSHMSG, 286 + CEPH_SESSION_FLUSHMSG_ACK, 308 287 }; 309 288 310 289 extern const char *ceph_session_op_name(int op); ··· 482 457 __u8 flags; /* CEPH_CAP_FLAG_* */ 483 458 } __attribute__ ((packed)); 484 459 485 - #define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ 460 + #define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ 461 + #define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ 486 462 487 463 /* inode record, for bundling with mds reply */ 488 464 struct ceph_mds_reply_inode { ··· 682 656 struct ceph_timespec mtime, atime, ctime; 683 657 struct ceph_file_layout layout; 684 658 __le32 time_warp_seq; 659 + } __attribute__ ((packed)); 660 + 
661 + struct ceph_mds_cap_peer { 662 + __le64 cap_id; 663 + __le32 seq; 664 + __le32 mseq; 665 + __le32 mds; 666 + __u8 flags; 685 667 } __attribute__ ((packed)); 686 668 687 669 /* cap release msg head */
+11 -8
include/linux/ceph/libceph.h
··· 122 122 123 123 int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); 124 124 125 - u32 supported_features; 126 - u32 required_features; 125 + u64 supported_features; 126 + u64 required_features; 127 127 128 128 struct ceph_messenger msgr; /* messenger instance */ 129 129 struct ceph_mon_client monc; ··· 173 173 (off >> PAGE_CACHE_SHIFT); 174 174 } 175 175 176 + extern struct kmem_cache *ceph_inode_cachep; 177 + extern struct kmem_cache *ceph_cap_cachep; 178 + extern struct kmem_cache *ceph_dentry_cachep; 179 + extern struct kmem_cache *ceph_file_cachep; 180 + 176 181 /* ceph_common.c */ 177 182 extern bool libceph_compatible(void *data); 178 183 179 184 extern const char *ceph_msg_type_name(int type); 180 185 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 181 - extern struct kmem_cache *ceph_inode_cachep; 182 - extern struct kmem_cache *ceph_cap_cachep; 183 - extern struct kmem_cache *ceph_dentry_cachep; 184 - extern struct kmem_cache *ceph_file_cachep; 186 + extern void *ceph_kvmalloc(size_t size, gfp_t flags); 187 + extern void ceph_kvfree(const void *ptr); 185 188 186 189 extern struct ceph_options *ceph_parse_options(char *options, 187 190 const char *dev_name, const char *dev_name_end, ··· 195 192 struct ceph_client *client); 196 193 extern struct ceph_client *ceph_create_client(struct ceph_options *opt, 197 194 void *private, 198 - unsigned supported_features, 199 - unsigned required_features); 195 + u64 supported_features, 196 + u64 required_features); 200 197 extern u64 ceph_client_id(struct ceph_client *client); 201 198 extern void ceph_destroy_client(struct ceph_client *client); 202 199 extern int __ceph_open_session(struct ceph_client *client,
+6 -7
include/linux/ceph/messenger.h
··· 60 60 u32 global_seq; 61 61 spinlock_t global_seq_lock; 62 62 63 - u32 supported_features; 64 - u32 required_features; 63 + u64 supported_features; 64 + u64 required_features; 65 65 }; 66 66 67 67 enum ceph_msg_data_type { ··· 154 154 struct list_head list_head; /* links for connection lists */ 155 155 156 156 struct kref kref; 157 - bool front_is_vmalloc; 158 157 bool more_to_follow; 159 158 bool needs_out_seq; 160 - int front_max; 159 + int front_alloc_len; 161 160 unsigned long ack_stamp; /* tx: when we were acked */ 162 161 163 162 struct ceph_msgpool *pool; ··· 191 192 192 193 struct ceph_entity_name peer_name; /* peer name */ 193 194 194 - unsigned peer_features; 195 + u64 peer_features; 195 196 u32 connect_seq; /* identify the most recent connection 196 197 attempt for this connection, client */ 197 198 u32 peer_global_seq; /* peer's global seq for this connection */ ··· 255 256 256 257 extern void ceph_messenger_init(struct ceph_messenger *msgr, 257 258 struct ceph_entity_addr *myaddr, 258 - u32 supported_features, 259 - u32 required_features, 259 + u64 supported_features, 260 + u64 required_features, 260 261 bool nocrc); 261 262 262 263 extern void ceph_con_init(struct ceph_connection *con, void *private,
+10 -9
include/linux/ceph/osd_client.h
··· 12 12 #include <linux/ceph/auth.h> 13 13 #include <linux/ceph/pagelist.h> 14 14 15 - /* 16 - * Maximum object name size 17 - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) 18 - */ 19 - #define MAX_OBJ_NAME_SIZE 100 20 - 21 15 struct ceph_msg; 22 16 struct ceph_snap_context; 23 17 struct ceph_osd_request; ··· 132 138 __le64 *r_request_pool; 133 139 void *r_request_pgid; 134 140 __le32 *r_request_attempts; 141 + bool r_paused; 135 142 struct ceph_eversion *r_request_reassert_version; 136 143 137 144 int r_result; ··· 153 158 struct inode *r_inode; /* for use by callbacks */ 154 159 void *r_priv; /* ditto */ 155 160 156 - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 157 - int r_oid_len; 161 + struct ceph_object_locator r_base_oloc; 162 + struct ceph_object_id r_base_oid; 163 + struct ceph_object_locator r_target_oloc; 164 + struct ceph_object_id r_target_oid; 165 + 158 166 u64 r_snapid; 159 167 unsigned long r_stamp; /* send OR check time */ 160 168 161 - struct ceph_file_layout r_file_layout; 162 169 struct ceph_snap_context *r_snapc; /* snap context for writes */ 170 + }; 171 + 172 + struct ceph_request_redirect { 173 + struct ceph_object_locator oloc; 163 174 }; 164 175 165 176 struct ceph_osd_event {
+41 -25
include/linux/ceph/osdmap.h
··· 35 35 u8 object_hash; 36 36 u32 pg_num, pgp_num; 37 37 int pg_num_mask, pgp_num_mask; 38 + s64 read_tier; 39 + s64 write_tier; /* wins for read+write ops */ 38 40 u64 flags; 39 41 char *name; 40 42 }; 41 43 42 44 struct ceph_object_locator { 43 - uint64_t pool; 44 - char *key; 45 + s64 pool; 46 + }; 47 + 48 + /* 49 + * Maximum supported by kernel client object name length 50 + * 51 + * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) 52 + */ 53 + #define CEPH_MAX_OID_NAME_LEN 100 54 + 55 + struct ceph_object_id { 56 + char name[CEPH_MAX_OID_NAME_LEN]; 57 + int name_len; 45 58 }; 46 59 47 60 struct ceph_pg_mapping { ··· 86 73 struct crush_map *crush; 87 74 }; 88 75 89 - /* 90 - * file layout helpers 91 - */ 92 - #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) 93 - #define ceph_file_layout_stripe_count(l) \ 94 - ((__s32)le32_to_cpu((l).fl_stripe_count)) 95 - #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) 96 - #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) 97 - #define ceph_file_layout_object_su(l) \ 98 - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) 99 - #define ceph_file_layout_pg_pool(l) \ 100 - ((__s32)le32_to_cpu((l).fl_pg_pool)) 101 - 102 - static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) 76 + static inline void ceph_oid_set_name(struct ceph_object_id *oid, 77 + const char *name) 103 78 { 104 - return le32_to_cpu(l->fl_stripe_unit) * 105 - le32_to_cpu(l->fl_stripe_count); 79 + int len; 80 + 81 + len = strlen(name); 82 + if (len > sizeof(oid->name)) { 83 + WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", 84 + name, len, sizeof(oid->name)); 85 + len = sizeof(oid->name); 86 + } 87 + 88 + memcpy(oid->name, name, len); 89 + oid->name_len = len; 106 90 } 107 91 108 - /* "period" == bytes before i start on a new set of objects */ 109 - static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) 92 + 
static inline void ceph_oid_copy(struct ceph_object_id *dest, 93 + struct ceph_object_id *src) 110 94 { 111 - return le32_to_cpu(l->fl_object_size) * 112 - le32_to_cpu(l->fl_stripe_count); 95 + BUG_ON(src->name_len > sizeof(dest->name)); 96 + memcpy(dest->name, src->name, src->name_len); 97 + dest->name_len = src->name_len; 113 98 } 114 - 115 99 116 100 static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 117 101 { ··· 165 155 u64 *bno, u64 *oxoff, u64 *oxlen); 166 156 167 157 /* calculate mapping of object to a placement group */ 168 - extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, 169 - struct ceph_osdmap *osdmap, uint64_t pool); 158 + extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 159 + struct ceph_object_locator *oloc, 160 + struct ceph_object_id *oid, 161 + struct ceph_pg *pg_out); 162 + 170 163 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 171 164 struct ceph_pg pgid, 172 165 int *acting); 173 166 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 174 167 struct ceph_pg pgid); 168 + 169 + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 170 + u64 id); 175 171 176 172 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); 177 173 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+4
include/linux/ceph/rados.h
··· 344 344 CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ 345 345 CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ 346 346 CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ 347 + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ 348 + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 349 + CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 350 + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 347 351 }; 348 352 349 353 enum {
+15 -5
include/linux/crush/crush.h
··· 19 19 20 20 #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ 21 21 22 - 23 22 #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ 24 - #define CRUSH_MAX_SET 10 /* max size of a mapping result */ 25 23 24 + 25 + #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ 26 + #define CRUSH_ITEM_NONE 0x7fffffff /* no result */ 26 27 27 28 /* 28 29 * CRUSH uses user-defined "rules" to describe how inputs should be ··· 44 43 /* arg2 = type */ 45 44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ 46 45 CRUSH_RULE_EMIT = 4, /* no args */ 47 - CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, 48 - CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, 46 + CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, 47 + CRUSH_RULE_CHOOSELEAF_INDEP = 7, 48 + 49 + CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ 50 + CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 51 + CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 52 + CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 49 53 }; 50 54 51 55 /* ··· 168 162 __u32 choose_local_fallback_tries; 169 163 /* choose attempts before giving up */ 170 164 __u32 choose_total_tries; 171 - /* attempt chooseleaf inner descent once; on failure retry outer descent */ 165 + /* attempt chooseleaf inner descent once for firstn mode; on 166 + * reject retry outer descent. Note that this does *not* 167 + * apply to a collision: in that case we will retry as we used 168 + * to. */ 172 169 __u32 chooseleaf_descend_once; 173 170 }; 174 171 ··· 183 174 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); 184 175 extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); 185 176 extern void crush_destroy_bucket(struct crush_bucket *b); 177 + extern void crush_destroy_rule(struct crush_rule *r); 186 178 extern void crush_destroy(struct crush_map *map); 187 179 188 180 static inline int crush_calc_tree_node(int i)
+2 -1
include/linux/crush/mapper.h
··· 14 14 extern int crush_do_rule(const struct crush_map *map, 15 15 int ruleno, 16 16 int x, int *result, int result_max, 17 - const __u32 *weights); 17 + const __u32 *weights, int weight_max, 18 + int *scratch); 18 19 19 20 #endif
+6 -16
net/ceph/buffer.c
··· 6 6 7 7 #include <linux/ceph/buffer.h> 8 8 #include <linux/ceph/decode.h> 9 + #include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ 9 10 10 11 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 11 12 { ··· 16 15 if (!b) 17 16 return NULL; 18 17 19 - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); 20 - if (b->vec.iov_base) { 21 - b->is_vmalloc = false; 22 - } else { 23 - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); 24 - if (!b->vec.iov_base) { 25 - kfree(b); 26 - return NULL; 27 - } 28 - b->is_vmalloc = true; 18 + b->vec.iov_base = ceph_kvmalloc(len, gfp); 19 + if (!b->vec.iov_base) { 20 + kfree(b); 21 + return NULL; 29 22 } 30 23 31 24 kref_init(&b->kref); ··· 35 40 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); 36 41 37 42 dout("buffer_release %p\n", b); 38 - if (b->vec.iov_base) { 39 - if (b->is_vmalloc) 40 - vfree(b->vec.iov_base); 41 - else 42 - kfree(b->vec.iov_base); 43 - } 43 + ceph_kvfree(b->vec.iov_base); 44 44 kfree(b); 45 45 } 46 46 EXPORT_SYMBOL(ceph_buffer_release);
+22 -2
net/ceph/ceph_common.c
··· 15 15 #include <linux/slab.h> 16 16 #include <linux/statfs.h> 17 17 #include <linux/string.h> 18 + #include <linux/vmalloc.h> 18 19 #include <linux/nsproxy.h> 19 20 #include <net/net_namespace.h> 20 21 ··· 170 169 return -1; 171 170 } 172 171 EXPORT_SYMBOL(ceph_compare_options); 172 + 173 + void *ceph_kvmalloc(size_t size, gfp_t flags) 174 + { 175 + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 176 + void *ptr = kmalloc(size, flags | __GFP_NOWARN); 177 + if (ptr) 178 + return ptr; 179 + } 180 + 181 + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); 182 + } 183 + 184 + void ceph_kvfree(const void *ptr) 185 + { 186 + if (is_vmalloc_addr(ptr)) 187 + vfree(ptr); 188 + else 189 + kfree(ptr); 190 + } 173 191 174 192 175 193 static int parse_fsid(const char *str, struct ceph_fsid *fsid) ··· 481 461 * create a fresh client instance 482 462 */ 483 463 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 484 - unsigned int supported_features, 485 - unsigned int required_features) 464 + u64 supported_features, 465 + u64 required_features) 486 466 { 487 467 struct ceph_client *client; 488 468 struct ceph_entity_addr *myaddr = NULL;
+5 -2
net/ceph/crush/crush.c
··· 116 116 if (map->rules) { 117 117 __u32 b; 118 118 for (b = 0; b < map->max_rules; b++) 119 - kfree(map->rules[b]); 119 + crush_destroy_rule(map->rules[b]); 120 120 kfree(map->rules); 121 121 } 122 122 123 123 kfree(map); 124 124 } 125 125 126 - 126 + void crush_destroy_rule(struct crush_rule *rule) 127 + { 128 + kfree(rule); 129 + }
+269 -67
net/ceph/crush/mapper.c
··· 189 189 static int bucket_tree_choose(struct crush_bucket_tree *bucket, 190 190 int x, int r) 191 191 { 192 - int n, l; 192 + int n; 193 193 __u32 w; 194 194 __u64 t; 195 195 ··· 197 197 n = bucket->num_nodes >> 1; 198 198 199 199 while (!terminal(n)) { 200 + int l; 200 201 /* pick point in [0, w) */ 201 202 w = bucket->node_weights[n]; 202 203 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, ··· 265 264 * true if device is marked "out" (failed, fully offloaded) 266 265 * of the cluster 267 266 */ 268 - static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) 267 + static int is_out(const struct crush_map *map, 268 + const __u32 *weight, int weight_max, 269 + int item, int x) 269 270 { 271 + if (item >= weight_max) 272 + return 1; 270 273 if (weight[item] >= 0x10000) 271 274 return 0; 272 275 if (weight[item] == 0) ··· 282 277 } 283 278 284 279 /** 285 - * crush_choose - choose numrep distinct items of given type 280 + * crush_choose_firstn - choose numrep distinct items of given type 286 281 * @map: the crush_map 287 282 * @bucket: the bucket we are choose an item from 288 283 * @x: crush input value ··· 290 285 * @type: the type of item to choose 291 286 * @out: pointer to output vector 292 287 * @outpos: our position in that vector 293 - * @firstn: true if choosing "first n" items, false if choosing "indep" 294 - * @recurse_to_leaf: true if we want one device under each item of given type 295 - * @descend_once: true if we should only try one descent before giving up 288 + * @tries: number of attempts to make 289 + * @recurse_tries: number of attempts to have recursive chooseleaf make 290 + * @local_tries: localized retries 291 + * @local_fallback_tries: localized fallback retries 292 + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 296 293 * @out2: second output vector for leaf items (if @recurse_to_leaf) 297 294 */ 298 - static int crush_choose(const struct crush_map 
*map, 299 - struct crush_bucket *bucket, 300 - const __u32 *weight, 301 - int x, int numrep, int type, 302 - int *out, int outpos, 303 - int firstn, int recurse_to_leaf, 304 - int descend_once, int *out2) 295 + static int crush_choose_firstn(const struct crush_map *map, 296 + struct crush_bucket *bucket, 297 + const __u32 *weight, int weight_max, 298 + int x, int numrep, int type, 299 + int *out, int outpos, 300 + unsigned int tries, 301 + unsigned int recurse_tries, 302 + unsigned int local_tries, 303 + unsigned int local_fallback_tries, 304 + int recurse_to_leaf, 305 + int *out2) 305 306 { 306 307 int rep; 307 308 unsigned int ftotal, flocal; ··· 336 325 collide = 0; 337 326 retry_bucket = 0; 338 327 r = rep; 339 - if (in->alg == CRUSH_BUCKET_UNIFORM) { 340 - /* be careful */ 341 - if (firstn || (__u32)numrep >= in->size) 342 - /* r' = r + f_total */ 343 - r += ftotal; 344 - else if (in->size % numrep == 0) 345 - /* r'=r+(n+1)*f_local */ 346 - r += (numrep+1) * 347 - (flocal+ftotal); 348 - else 349 - /* r' = r + n*f_local */ 350 - r += numrep * (flocal+ftotal); 351 - } else { 352 - if (firstn) 353 - /* r' = r + f_total */ 354 - r += ftotal; 355 - else 356 - /* r' = r + n*f_local */ 357 - r += numrep * (flocal+ftotal); 358 - } 328 + /* r' = r + f_total */ 329 + r += ftotal; 359 330 360 331 /* bucket choose */ 361 332 if (in->size == 0) { 362 333 reject = 1; 363 334 goto reject; 364 335 } 365 - if (map->choose_local_fallback_tries > 0 && 336 + if (local_fallback_tries > 0 && 366 337 flocal >= (in->size>>1) && 367 - flocal > map->choose_local_fallback_tries) 338 + flocal > local_fallback_tries) 368 339 item = bucket_perm_choose(in, x, r); 369 340 else 370 341 item = crush_bucket_choose(in, x, r); ··· 387 394 reject = 0; 388 395 if (!collide && recurse_to_leaf) { 389 396 if (item < 0) { 390 - if (crush_choose(map, 397 + if (crush_choose_firstn(map, 391 398 map->buckets[-1-item], 392 - weight, 399 + weight, weight_max, 393 400 x, outpos+1, 0, 394 401 out2, outpos, 395 
- firstn, 0, 396 - map->chooseleaf_descend_once, 402 + recurse_tries, 0, 403 + local_tries, 404 + local_fallback_tries, 405 + 0, 397 406 NULL) <= outpos) 398 407 /* didn't get leaf */ 399 408 reject = 1; ··· 409 414 /* out? */ 410 415 if (itemtype == 0) 411 416 reject = is_out(map, weight, 417 + weight_max, 412 418 item, x); 413 419 else 414 420 reject = 0; ··· 420 424 ftotal++; 421 425 flocal++; 422 426 423 - if (reject && descend_once) 424 - /* let outer call try again */ 425 - skip_rep = 1; 426 - else if (collide && flocal <= map->choose_local_tries) 427 + if (collide && flocal <= local_tries) 427 428 /* retry locally a few times */ 428 429 retry_bucket = 1; 429 - else if (map->choose_local_fallback_tries > 0 && 430 - flocal <= in->size + map->choose_local_fallback_tries) 430 + else if (local_fallback_tries > 0 && 431 + flocal <= in->size + local_fallback_tries) 431 432 /* exhaustive bucket search */ 432 433 retry_bucket = 1; 433 - else if (ftotal <= map->choose_total_tries) 434 + else if (ftotal <= tries) 434 435 /* then retry descent */ 435 436 retry_descent = 1; 436 437 else ··· 457 464 458 465 459 466 /** 467 + * crush_choose_indep: alternative breadth-first positionally stable mapping 468 + * 469 + */ 470 + static void crush_choose_indep(const struct crush_map *map, 471 + struct crush_bucket *bucket, 472 + const __u32 *weight, int weight_max, 473 + int x, int left, int numrep, int type, 474 + int *out, int outpos, 475 + unsigned int tries, 476 + unsigned int recurse_tries, 477 + int recurse_to_leaf, 478 + int *out2, 479 + int parent_r) 480 + { 481 + struct crush_bucket *in = bucket; 482 + int endpos = outpos + left; 483 + int rep; 484 + unsigned int ftotal; 485 + int r; 486 + int i; 487 + int item = 0; 488 + int itemtype; 489 + int collide; 490 + 491 + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? 
"_LEAF" : "", 492 + bucket->id, x, outpos, numrep); 493 + 494 + /* initially my result is undefined */ 495 + for (rep = outpos; rep < endpos; rep++) { 496 + out[rep] = CRUSH_ITEM_UNDEF; 497 + if (out2) 498 + out2[rep] = CRUSH_ITEM_UNDEF; 499 + } 500 + 501 + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { 502 + for (rep = outpos; rep < endpos; rep++) { 503 + if (out[rep] != CRUSH_ITEM_UNDEF) 504 + continue; 505 + 506 + in = bucket; /* initial bucket */ 507 + 508 + /* choose through intervening buckets */ 509 + for (;;) { 510 + /* note: we base the choice on the position 511 + * even in the nested call. that means that 512 + * if the first layer chooses the same bucket 513 + * in a different position, we will tend to 514 + * choose a different item in that bucket. 515 + * this will involve more devices in data 516 + * movement and tend to distribute the load. 517 + */ 518 + r = rep + parent_r; 519 + 520 + /* be careful */ 521 + if (in->alg == CRUSH_BUCKET_UNIFORM && 522 + in->size % numrep == 0) 523 + /* r'=r+(n+1)*f_total */ 524 + r += (numrep+1) * ftotal; 525 + else 526 + /* r' = r + n*f_total */ 527 + r += numrep * ftotal; 528 + 529 + /* bucket choose */ 530 + if (in->size == 0) { 531 + dprintk(" empty bucket\n"); 532 + break; 533 + } 534 + 535 + item = crush_bucket_choose(in, x, r); 536 + if (item >= map->max_devices) { 537 + dprintk(" bad item %d\n", item); 538 + out[rep] = CRUSH_ITEM_NONE; 539 + if (out2) 540 + out2[rep] = CRUSH_ITEM_NONE; 541 + left--; 542 + break; 543 + } 544 + 545 + /* desired type? */ 546 + if (item < 0) 547 + itemtype = map->buckets[-1-item]->type; 548 + else 549 + itemtype = 0; 550 + dprintk(" item %d type %d\n", item, itemtype); 551 + 552 + /* keep going? 
*/ 553 + if (itemtype != type) { 554 + if (item >= 0 || 555 + (-1-item) >= map->max_buckets) { 556 + dprintk(" bad item type %d\n", type); 557 + out[rep] = CRUSH_ITEM_NONE; 558 + if (out2) 559 + out2[rep] = 560 + CRUSH_ITEM_NONE; 561 + left--; 562 + break; 563 + } 564 + in = map->buckets[-1-item]; 565 + continue; 566 + } 567 + 568 + /* collision? */ 569 + collide = 0; 570 + for (i = outpos; i < endpos; i++) { 571 + if (out[i] == item) { 572 + collide = 1; 573 + break; 574 + } 575 + } 576 + if (collide) 577 + break; 578 + 579 + if (recurse_to_leaf) { 580 + if (item < 0) { 581 + crush_choose_indep(map, 582 + map->buckets[-1-item], 583 + weight, weight_max, 584 + x, 1, numrep, 0, 585 + out2, rep, 586 + recurse_tries, 0, 587 + 0, NULL, r); 588 + if (out2[rep] == CRUSH_ITEM_NONE) { 589 + /* placed nothing; no leaf */ 590 + break; 591 + } 592 + } else { 593 + /* we already have a leaf! */ 594 + out2[rep] = item; 595 + } 596 + } 597 + 598 + /* out? */ 599 + if (itemtype == 0 && 600 + is_out(map, weight, weight_max, item, x)) 601 + break; 602 + 603 + /* yay! 
*/ 604 + out[rep] = item; 605 + left--; 606 + break; 607 + } 608 + } 609 + } 610 + for (rep = outpos; rep < endpos; rep++) { 611 + if (out[rep] == CRUSH_ITEM_UNDEF) { 612 + out[rep] = CRUSH_ITEM_NONE; 613 + } 614 + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { 615 + out2[rep] = CRUSH_ITEM_NONE; 616 + } 617 + } 618 + } 619 + 620 + /** 460 621 * crush_do_rule - calculate a mapping with the given input and rule 461 622 * @map: the crush_map 462 623 * @ruleno: the rule id 463 624 * @x: hash input 464 625 * @result: pointer to result vector 465 626 * @result_max: maximum result size 627 + * @weight: weight vector (for map leaves) 628 + * @weight_max: size of weight vector 629 + * @scratch: scratch vector for private use; must be >= 3 * result_max 466 630 */ 467 631 int crush_do_rule(const struct crush_map *map, 468 632 int ruleno, int x, int *result, int result_max, 469 - const __u32 *weight) 633 + const __u32 *weight, int weight_max, 634 + int *scratch) 470 635 { 471 636 int result_len; 472 - int a[CRUSH_MAX_SET]; 473 - int b[CRUSH_MAX_SET]; 474 - int c[CRUSH_MAX_SET]; 637 + int *a = scratch; 638 + int *b = scratch + result_max; 639 + int *c = scratch + result_max*2; 475 640 int recurse_to_leaf; 476 641 int *w; 477 642 int wsize = 0; ··· 640 489 __u32 step; 641 490 int i, j; 642 491 int numrep; 643 - int firstn; 644 - const int descend_once = 0; 492 + int choose_tries = map->choose_total_tries; 493 + int choose_local_tries = map->choose_local_tries; 494 + int choose_local_fallback_tries = map->choose_local_fallback_tries; 495 + int choose_leaf_tries = 0; 645 496 646 497 if ((__u32)ruleno >= map->max_rules) { 647 498 dprintk(" bad ruleno %d\n", ruleno); ··· 656 503 o = b; 657 504 658 505 for (step = 0; step < rule->len; step++) { 506 + int firstn = 0; 659 507 struct crush_rule_step *curstep = &rule->steps[step]; 660 508 661 - firstn = 0; 662 509 switch (curstep->op) { 663 510 case CRUSH_RULE_TAKE: 664 511 w[0] = curstep->arg1; 665 512 wsize = 1; 666 513 break; 667 514 
668 - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 515 + case CRUSH_RULE_SET_CHOOSE_TRIES: 516 + if (curstep->arg1 > 0) 517 + choose_tries = curstep->arg1; 518 + break; 519 + 520 + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: 521 + if (curstep->arg1 > 0) 522 + choose_leaf_tries = curstep->arg1; 523 + break; 524 + 525 + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 526 + if (curstep->arg1 > 0) 527 + choose_local_tries = curstep->arg1; 528 + break; 529 + 530 + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 531 + if (curstep->arg1 > 0) 532 + choose_local_fallback_tries = curstep->arg1; 533 + break; 534 + 535 + case CRUSH_RULE_CHOOSELEAF_FIRSTN: 669 536 case CRUSH_RULE_CHOOSE_FIRSTN: 670 537 firstn = 1; 671 538 /* fall through */ 672 - case CRUSH_RULE_CHOOSE_LEAF_INDEP: 539 + case CRUSH_RULE_CHOOSELEAF_INDEP: 673 540 case CRUSH_RULE_CHOOSE_INDEP: 674 541 if (wsize == 0) 675 542 break; 676 543 677 544 recurse_to_leaf = 678 545 curstep->op == 679 - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 546 + CRUSH_RULE_CHOOSELEAF_FIRSTN || 680 547 curstep->op == 681 - CRUSH_RULE_CHOOSE_LEAF_INDEP; 548 + CRUSH_RULE_CHOOSELEAF_INDEP; 682 549 683 550 /* reset output */ 684 551 osize = 0; ··· 716 543 continue; 717 544 } 718 545 j = 0; 719 - osize += crush_choose(map, 720 - map->buckets[-1-w[i]], 721 - weight, 722 - x, numrep, 723 - curstep->arg2, 724 - o+osize, j, 725 - firstn, 726 - recurse_to_leaf, 727 - descend_once, c+osize); 546 + if (firstn) { 547 + int recurse_tries; 548 + if (choose_leaf_tries) 549 + recurse_tries = 550 + choose_leaf_tries; 551 + else if (map->chooseleaf_descend_once) 552 + recurse_tries = 1; 553 + else 554 + recurse_tries = choose_tries; 555 + osize += crush_choose_firstn( 556 + map, 557 + map->buckets[-1-w[i]], 558 + weight, weight_max, 559 + x, numrep, 560 + curstep->arg2, 561 + o+osize, j, 562 + choose_tries, 563 + recurse_tries, 564 + choose_local_tries, 565 + choose_local_fallback_tries, 566 + recurse_to_leaf, 567 + c+osize); 568 + } else { 569 + crush_choose_indep( 570 + map, 571 + 
map->buckets[-1-w[i]], 572 + weight, weight_max, 573 + x, numrep, numrep, 574 + curstep->arg2, 575 + o+osize, j, 576 + choose_tries, 577 + choose_leaf_tries ? 578 + choose_leaf_tries : 1, 579 + recurse_to_leaf, 580 + c+osize, 581 + 0); 582 + osize += numrep; 583 + } 728 584 } 729 585 730 586 if (recurse_to_leaf) 731 587 /* copy final _leaf_ values to output set */ 732 588 memcpy(o, c, osize*sizeof(*o)); 733 589 734 - /* swap t and w arrays */ 590 + /* swap o and w arrays */ 735 591 tmp = o; 736 592 o = w; 737 593 w = tmp;
+2 -1
net/ceph/debugfs.c
··· 132 132 req->r_osd ? req->r_osd->o_osd : -1, 133 133 req->r_pgid.pool, req->r_pgid.seed); 134 134 135 - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); 135 + seq_printf(s, "%.*s", req->r_base_oid.name_len, 136 + req->r_base_oid.name); 136 137 137 138 if (req->r_reassert_version.epoch) 138 139 seq_printf(s, "\t%u'%llu",
+13 -19
net/ceph/messenger.c
··· 15 15 #include <linux/dns_resolver.h> 16 16 #include <net/tcp.h> 17 17 18 + #include <linux/ceph/ceph_features.h> 18 19 #include <linux/ceph/libceph.h> 19 20 #include <linux/ceph/messenger.h> 20 21 #include <linux/ceph/decode.h> ··· 1866 1865 port = (port * 10) + (*p - '0'); 1867 1866 p++; 1868 1867 } 1869 - if (port > 65535 || port == 0) 1868 + if (port == 0) 1869 + port = CEPH_MON_PORT; 1870 + else if (port > 65535) 1870 1871 goto bad; 1871 1872 } else { 1872 1873 port = CEPH_MON_PORT; ··· 1948 1945 { 1949 1946 u64 sup_feat = con->msgr->supported_features; 1950 1947 u64 req_feat = con->msgr->required_features; 1951 - u64 server_feat = le64_to_cpu(con->in_reply.features); 1948 + u64 server_feat = ceph_sanitize_features( 1949 + le64_to_cpu(con->in_reply.features)); 1952 1950 int ret; 1953 1951 1954 1952 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); ··· 2857 2853 */ 2858 2854 void ceph_messenger_init(struct ceph_messenger *msgr, 2859 2855 struct ceph_entity_addr *myaddr, 2860 - u32 supported_features, 2861 - u32 required_features, 2856 + u64 supported_features, 2857 + u64 required_features, 2862 2858 bool nocrc) 2863 2859 { 2864 2860 msgr->supported_features = supported_features; ··· 3130 3126 INIT_LIST_HEAD(&m->data); 3131 3127 3132 3128 /* front */ 3133 - m->front_max = front_len; 3134 3129 if (front_len) { 3135 - if (front_len > PAGE_CACHE_SIZE) { 3136 - m->front.iov_base = __vmalloc(front_len, flags, 3137 - PAGE_KERNEL); 3138 - m->front_is_vmalloc = true; 3139 - } else { 3140 - m->front.iov_base = kmalloc(front_len, flags); 3141 - } 3130 + m->front.iov_base = ceph_kvmalloc(front_len, flags); 3142 3131 if (m->front.iov_base == NULL) { 3143 3132 dout("ceph_msg_new can't allocate %d bytes\n", 3144 3133 front_len); ··· 3140 3143 } else { 3141 3144 m->front.iov_base = NULL; 3142 3145 } 3143 - m->front.iov_len = front_len; 3146 + m->front_alloc_len = m->front.iov_len = front_len; 3144 3147 3145 3148 dout("ceph_msg_new %p front %d\n", m, 
front_len); 3146 3149 return m; ··· 3253 3256 void ceph_msg_kfree(struct ceph_msg *m) 3254 3257 { 3255 3258 dout("msg_kfree %p\n", m); 3256 - if (m->front_is_vmalloc) 3257 - vfree(m->front.iov_base); 3258 - else 3259 - kfree(m->front.iov_base); 3259 + ceph_kvfree(m->front.iov_base); 3260 3260 kmem_cache_free(ceph_msg_cache, m); 3261 3261 } 3262 3262 ··· 3295 3301 3296 3302 void ceph_msg_dump(struct ceph_msg *msg) 3297 3303 { 3298 - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, 3299 - msg->front_max, msg->data_length); 3304 + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, 3305 + msg->front_alloc_len, msg->data_length); 3300 3306 print_hex_dump(KERN_DEBUG, "header: ", 3301 3307 DUMP_PREFIX_OFFSET, 16, 1, 3302 3308 &msg->hdr, sizeof(msg->hdr), true);
+4 -4
net/ceph/mon_client.c
··· 152 152 /* initiatiate authentication handshake */ 153 153 ret = ceph_auth_build_hello(monc->auth, 154 154 monc->m_auth->front.iov_base, 155 - monc->m_auth->front_max); 155 + monc->m_auth->front_alloc_len); 156 156 __send_prepared_auth_request(monc, ret); 157 157 } else { 158 158 dout("open_session mon%d already open\n", monc->cur_mon); ··· 196 196 int num; 197 197 198 198 p = msg->front.iov_base; 199 - end = p + msg->front_max; 199 + end = p + msg->front_alloc_len; 200 200 201 201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 202 202 ceph_encode_32(&p, num); ··· 897 897 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 898 898 msg->front.iov_len, 899 899 monc->m_auth->front.iov_base, 900 - monc->m_auth->front_max); 900 + monc->m_auth->front_alloc_len); 901 901 if (ret < 0) { 902 902 monc->client->auth_err = ret; 903 903 wake_up_all(&monc->client->auth_wq); ··· 939 939 return 0; 940 940 941 941 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, 942 - monc->m_auth->front_max); 942 + monc->m_auth->front_alloc_len); 943 943 if (ret <= 0) 944 944 return ret; /* either an error, or no need to authenticate */ 945 945 __send_prepared_auth_request(monc, ret);
+255 -28
net/ceph/osd_client.c
··· 338 338 msg_size = 4 + 4 + 8 + 8 + 4+8; 339 339 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 340 340 msg_size += 1 + 8 + 4 + 4; /* pg_t */ 341 - msg_size += 4 + MAX_OBJ_NAME_SIZE; 341 + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 342 342 msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); 343 343 msg_size += 8; /* snapid */ 344 344 msg_size += 8; /* snap_seq */ ··· 367 367 INIT_LIST_HEAD(&req->r_linger_osd); 368 368 INIT_LIST_HEAD(&req->r_req_lru_item); 369 369 INIT_LIST_HEAD(&req->r_osd_item); 370 + 371 + req->r_base_oloc.pool = -1; 372 + req->r_target_oloc.pool = -1; 370 373 371 374 /* create reply message */ 372 375 if (use_mempool) ··· 764 761 if (num_ops > 1) 765 762 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 766 763 767 - req->r_file_layout = *layout; /* keep a copy */ 764 + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 768 765 769 - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", 770 - vino.ino, objnum); 771 - req->r_oid_len = strlen(req->r_oid); 766 + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 767 + "%llx.%08llx", vino.ino, objnum); 768 + req->r_base_oid.name_len = strlen(req->r_base_oid.name); 772 769 773 770 return req; 774 771 } ··· 1047 1044 !ceph_con_opened(&osd->o_con)) { 1048 1045 struct ceph_osd_request *req; 1049 1046 1050 - dout(" osd addr hasn't changed and connection never opened," 1051 - " letting msgr retry"); 1047 + dout("osd addr hasn't changed and connection never opened, " 1048 + "letting msgr retry\n"); 1052 1049 /* touch each r_stamp for handle_timeout()'s benfit */ 1053 1050 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1054 1051 req->r_stamp = jiffies; ··· 1235 1232 EXPORT_SYMBOL(ceph_osdc_set_request_linger); 1236 1233 1237 1234 /* 1235 + * Returns whether a request should be blocked from being sent 1236 + * based on the current osdmap and osd_client settings. 1237 + * 1238 + * Caller should hold map_sem for read. 
1239 + */ 1240 + static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1241 + struct ceph_osd_request *req) 1242 + { 1243 + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1244 + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1245 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1246 + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1247 + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1248 + } 1249 + 1250 + /* 1251 + * Calculate mapping of a request to a PG. Takes tiering into account. 1252 + */ 1253 + static int __calc_request_pg(struct ceph_osdmap *osdmap, 1254 + struct ceph_osd_request *req, 1255 + struct ceph_pg *pg_out) 1256 + { 1257 + bool need_check_tiering; 1258 + 1259 + need_check_tiering = false; 1260 + if (req->r_target_oloc.pool == -1) { 1261 + req->r_target_oloc = req->r_base_oloc; /* struct */ 1262 + need_check_tiering = true; 1263 + } 1264 + if (req->r_target_oid.name_len == 0) { 1265 + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 1266 + need_check_tiering = true; 1267 + } 1268 + 1269 + if (need_check_tiering && 1270 + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 1271 + struct ceph_pg_pool_info *pi; 1272 + 1273 + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 1274 + if (pi) { 1275 + if ((req->r_flags & CEPH_OSD_FLAG_READ) && 1276 + pi->read_tier >= 0) 1277 + req->r_target_oloc.pool = pi->read_tier; 1278 + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 1279 + pi->write_tier >= 0) 1280 + req->r_target_oloc.pool = pi->write_tier; 1281 + } 1282 + /* !pi is caught in ceph_oloc_oid_to_pg() */ 1283 + } 1284 + 1285 + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 1286 + &req->r_target_oid, pg_out); 1287 + } 1288 + 1289 + /* 1238 1290 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct 1239 1291 * (as needed), and set the request r_osd appropriately. If there is 1240 1292 * no up osd, set r_osd to NULL. 
Move the request to the appropriate list ··· 1306 1248 int acting[CEPH_PG_MAX_SIZE]; 1307 1249 int o = -1, num = 0; 1308 1250 int err; 1251 + bool was_paused; 1309 1252 1310 1253 dout("map_request %p tid %lld\n", req, req->r_tid); 1311 - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, 1312 - ceph_file_layout_pg_pool(req->r_file_layout)); 1254 + 1255 + err = __calc_request_pg(osdc->osdmap, req, &pgid); 1313 1256 if (err) { 1314 1257 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1315 1258 return err; ··· 1323 1264 num = err; 1324 1265 } 1325 1266 1267 + was_paused = req->r_paused; 1268 + req->r_paused = __req_should_be_paused(osdc, req); 1269 + if (was_paused && !req->r_paused) 1270 + force_resend = 1; 1271 + 1326 1272 if ((!force_resend && 1327 1273 req->r_osd && req->r_osd->o_osd == o && 1328 1274 req->r_sent >= req->r_osd->o_incarnation && 1329 1275 req->r_num_pg_osds == num && 1330 1276 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || 1331 - (req->r_osd == NULL && o == -1)) 1277 + (req->r_osd == NULL && o == -1) || 1278 + req->r_paused) 1332 1279 return 0; /* no change */ 1333 1280 1334 1281 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", ··· 1396 1331 /* fill in message content that changes each time we send it */ 1397 1332 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 1398 1333 put_unaligned_le32(req->r_flags, req->r_request_flags); 1399 - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); 1334 + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); 1400 1335 p = req->r_request_pgid; 1401 1336 ceph_encode_64(&p, req->r_pgid.pool); 1402 1337 ceph_encode_32(&p, req->r_pgid.seed); ··· 1497 1432 round_jiffies_relative(delay)); 1498 1433 } 1499 1434 1435 + static int ceph_oloc_decode(void **p, void *end, 1436 + struct ceph_object_locator *oloc) 1437 + { 1438 + u8 struct_v, struct_cv; 1439 + u32 len; 1440 + void *struct_end; 1441 + int ret = 0; 1442 + 1443 + ceph_decode_need(p, 
end, 1 + 1 + 4, e_inval); 1444 + struct_v = ceph_decode_8(p); 1445 + struct_cv = ceph_decode_8(p); 1446 + if (struct_v < 3) { 1447 + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", 1448 + struct_v, struct_cv); 1449 + goto e_inval; 1450 + } 1451 + if (struct_cv > 6) { 1452 + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", 1453 + struct_v, struct_cv); 1454 + goto e_inval; 1455 + } 1456 + len = ceph_decode_32(p); 1457 + ceph_decode_need(p, end, len, e_inval); 1458 + struct_end = *p + len; 1459 + 1460 + oloc->pool = ceph_decode_64(p); 1461 + *p += 4; /* skip preferred */ 1462 + 1463 + len = ceph_decode_32(p); 1464 + if (len > 0) { 1465 + pr_warn("ceph_object_locator::key is set\n"); 1466 + goto e_inval; 1467 + } 1468 + 1469 + if (struct_v >= 5) { 1470 + len = ceph_decode_32(p); 1471 + if (len > 0) { 1472 + pr_warn("ceph_object_locator::nspace is set\n"); 1473 + goto e_inval; 1474 + } 1475 + } 1476 + 1477 + if (struct_v >= 6) { 1478 + s64 hash = ceph_decode_64(p); 1479 + if (hash != -1) { 1480 + pr_warn("ceph_object_locator::hash is set\n"); 1481 + goto e_inval; 1482 + } 1483 + } 1484 + 1485 + /* skip the rest */ 1486 + *p = struct_end; 1487 + out: 1488 + return ret; 1489 + 1490 + e_inval: 1491 + ret = -EINVAL; 1492 + goto out; 1493 + } 1494 + 1495 + static int ceph_redirect_decode(void **p, void *end, 1496 + struct ceph_request_redirect *redir) 1497 + { 1498 + u8 struct_v, struct_cv; 1499 + u32 len; 1500 + void *struct_end; 1501 + int ret; 1502 + 1503 + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); 1504 + struct_v = ceph_decode_8(p); 1505 + struct_cv = ceph_decode_8(p); 1506 + if (struct_cv > 1) { 1507 + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", 1508 + struct_v, struct_cv); 1509 + goto e_inval; 1510 + } 1511 + len = ceph_decode_32(p); 1512 + ceph_decode_need(p, end, len, e_inval); 1513 + struct_end = *p + len; 1514 + 1515 + ret = ceph_oloc_decode(p, end, &redir->oloc); 1516 + if (ret) 1517 + goto out; 1518 + 1519 + len = ceph_decode_32(p); 
1520 + if (len > 0) { 1521 + pr_warn("ceph_request_redirect::object_name is set\n"); 1522 + goto e_inval; 1523 + } 1524 + 1525 + len = ceph_decode_32(p); 1526 + *p += len; /* skip osd_instructions */ 1527 + 1528 + /* skip the rest */ 1529 + *p = struct_end; 1530 + out: 1531 + return ret; 1532 + 1533 + e_inval: 1534 + ret = -EINVAL; 1535 + goto out; 1536 + } 1537 + 1500 1538 static void complete_request(struct ceph_osd_request *req) 1501 1539 { 1502 1540 complete_all(&req->r_safe_completion); /* fsync waiter */ ··· 1614 1446 { 1615 1447 void *p, *end; 1616 1448 struct ceph_osd_request *req; 1449 + struct ceph_request_redirect redir; 1617 1450 u64 tid; 1618 1451 int object_len; 1619 1452 unsigned int numops; ··· 1694 1525 for (i = 0; i < numops; i++) 1695 1526 req->r_reply_op_result[i] = ceph_decode_32(&p); 1696 1527 1528 + if (le16_to_cpu(msg->hdr.version) >= 6) { 1529 + p += 8 + 4; /* skip replay_version */ 1530 + p += 8; /* skip user_version */ 1531 + 1532 + err = ceph_redirect_decode(&p, end, &redir); 1533 + if (err) 1534 + goto bad_put; 1535 + } else { 1536 + redir.oloc.pool = -1; 1537 + } 1538 + 1539 + if (redir.oloc.pool != -1) { 1540 + dout("redirect pool %lld\n", redir.oloc.pool); 1541 + 1542 + __unregister_request(osdc, req); 1543 + mutex_unlock(&osdc->request_mutex); 1544 + 1545 + req->r_target_oloc = redir.oloc; /* struct */ 1546 + 1547 + /* 1548 + * Start redirect requests with nofail=true. If 1549 + * mapping fails, request will end up on the notarget 1550 + * list, waiting for the new osdmap (which can take 1551 + * a while), even though the original request mapped 1552 + * successfully. In the future we might want to follow 1553 + * original request's nofail setting here. 
1554 + */ 1555 + err = ceph_osdc_start_request(osdc, req, true); 1556 + BUG_ON(err); 1557 + 1558 + goto done; 1559 + } 1560 + 1697 1561 already_completed = req->r_got_reply; 1698 - 1699 1562 if (!req->r_got_reply) { 1700 - 1701 1563 req->r_result = result; 1702 1564 dout("handle_reply result %d bytes %d\n", req->r_result, 1703 1565 bytes); ··· 1781 1581 return; 1782 1582 1783 1583 bad_put: 1584 + req->r_result = -EIO; 1585 + __unregister_request(osdc, req); 1586 + if (req->r_callback) 1587 + req->r_callback(req, msg); 1588 + else 1589 + complete_all(&req->r_completion); 1590 + complete_request(req); 1784 1591 ceph_osdc_put_request(req); 1785 1592 bad_mutex: 1786 1593 mutex_unlock(&osdc->request_mutex); ··· 1820 1613 * 1821 1614 * Caller should hold map_sem for read. 1822 1615 */ 1823 - static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1616 + static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 1617 + bool force_resend_writes) 1824 1618 { 1825 1619 struct ceph_osd_request *req, *nreq; 1826 1620 struct rb_node *p; 1827 1621 int needmap = 0; 1828 1622 int err; 1623 + bool force_resend_req; 1829 1624 1830 - dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); 1625 + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 1626 + force_resend_writes ? 
" (force resend writes)" : ""); 1831 1627 mutex_lock(&osdc->request_mutex); 1832 1628 for (p = rb_first(&osdc->requests); p; ) { 1833 1629 req = rb_entry(p, struct ceph_osd_request, r_node); ··· 1855 1645 continue; 1856 1646 } 1857 1647 1858 - err = __map_request(osdc, req, force_resend); 1648 + force_resend_req = force_resend || 1649 + (force_resend_writes && 1650 + req->r_flags & CEPH_OSD_FLAG_WRITE); 1651 + err = __map_request(osdc, req, force_resend_req); 1859 1652 if (err < 0) 1860 1653 continue; /* error */ 1861 1654 if (req->r_osd == NULL) { ··· 1878 1665 r_linger_item) { 1879 1666 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1880 1667 1881 - err = __map_request(osdc, req, force_resend); 1668 + err = __map_request(osdc, req, 1669 + force_resend || force_resend_writes); 1882 1670 dout("__map_request returned %d\n", err); 1883 1671 if (err == 0) 1884 1672 continue; /* no change and no osd was specified */ ··· 1921 1707 struct ceph_osdmap *newmap = NULL, *oldmap; 1922 1708 int err; 1923 1709 struct ceph_fsid fsid; 1710 + bool was_full; 1924 1711 1925 1712 dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); 1926 1713 p = msg->front.iov_base; ··· 1934 1719 return; 1935 1720 1936 1721 down_write(&osdc->map_sem); 1722 + 1723 + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1937 1724 1938 1725 /* incremental maps */ 1939 1726 ceph_decode_32_safe(&p, end, nr_maps, bad); ··· 1961 1744 ceph_osdmap_destroy(osdc->osdmap); 1962 1745 osdc->osdmap = newmap; 1963 1746 } 1964 - kick_requests(osdc, 0); 1747 + was_full = was_full || 1748 + ceph_osdmap_flag(osdc->osdmap, 1749 + CEPH_OSDMAP_FULL); 1750 + kick_requests(osdc, 0, was_full); 1965 1751 } else { 1966 1752 dout("ignoring incremental map %u len %d\n", 1967 1753 epoch, maplen); ··· 2007 1787 skipped_map = 1; 2008 1788 ceph_osdmap_destroy(oldmap); 2009 1789 } 2010 - kick_requests(osdc, skipped_map); 1790 + was_full = was_full || 1791 + ceph_osdmap_flag(osdc->osdmap, 1792 + CEPH_OSDMAP_FULL); 1793 + kick_requests(osdc, skipped_map, was_full); 2011 1794 } 2012 1795 p += maplen; 2013 1796 nr_maps--; ··· 2027 1804 * we find out when we are no longer full and stop returning 2028 1805 * ENOSPC. 
2029 1806 */ 2030 - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 1807 + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 1808 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 1809 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 2031 1810 ceph_monc_request_next_osdmap(&osdc->client->monc); 2032 1811 2033 1812 mutex_lock(&osdc->request_mutex); ··· 2293 2068 ceph_encode_32(&p, -1); /* preferred */ 2294 2069 2295 2070 /* oid */ 2296 - ceph_encode_32(&p, req->r_oid_len); 2297 - memcpy(p, req->r_oid, req->r_oid_len); 2298 - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); 2299 - p += req->r_oid_len; 2071 + ceph_encode_32(&p, req->r_base_oid.name_len); 2072 + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 2073 + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 2074 + req->r_base_oid.name, req->r_base_oid.name_len); 2075 + p += req->r_base_oid.name_len; 2300 2076 2301 2077 /* ops--can imply data */ 2302 2078 ceph_encode_16(&p, (u16)req->r_num_ops); ··· 2680 2454 struct ceph_osd_client *osdc = osd->o_osdc; 2681 2455 struct ceph_msg *m; 2682 2456 struct ceph_osd_request *req; 2683 - int front = le32_to_cpu(hdr->front_len); 2457 + int front_len = le32_to_cpu(hdr->front_len); 2684 2458 int data_len = le32_to_cpu(hdr->data_len); 2685 2459 u64 tid; 2686 2460 ··· 2700 2474 req->r_reply, req->r_reply->con); 2701 2475 ceph_msg_revoke_incoming(req->r_reply); 2702 2476 2703 - if (front > req->r_reply->front.iov_len) { 2477 + if (front_len > req->r_reply->front_alloc_len) { 2704 2478 pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", 2705 - front, (int)req->r_reply->front.iov_len, 2479 + front_len, req->r_reply->front_alloc_len, 2706 2480 (unsigned int)con->peer_name.type, 2707 2481 le64_to_cpu(con->peer_name.num)); 2708 - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); 2482 + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 2483 + false); 2709 2484 if (!m) 2710 2485 goto 
out; 2711 2486 ceph_msg_put(req->r_reply);
+60 -18
net/ceph/osdmap.c
··· 464 464 return NULL; 465 465 } 466 466 467 + struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) 468 + { 469 + return __lookup_pg_pool(&map->pg_pools, id); 470 + } 471 + 467 472 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) 468 473 { 469 474 struct ceph_pg_pool_info *pi; ··· 519 514 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); 520 515 return -EINVAL; 521 516 } 522 - if (cv > 7) { 523 - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); 517 + if (cv > 9) { 518 + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); 524 519 return -EINVAL; 525 520 } 526 521 len = ceph_decode_32(p); ··· 548 543 *p += len; 549 544 } 550 545 551 - /* skip removed snaps */ 546 + /* skip removed_snaps */ 552 547 num = ceph_decode_32(p); 553 548 *p += num * (8 + 8); 554 549 555 550 *p += 8; /* skip auid */ 556 551 pi->flags = ceph_decode_64(p); 552 + *p += 4; /* skip crash_replay_interval */ 553 + 554 + if (ev >= 7) 555 + *p += 1; /* skip min_size */ 556 + 557 + if (ev >= 8) 558 + *p += 8 + 8; /* skip quota_max_* */ 559 + 560 + if (ev >= 9) { 561 + /* skip tiers */ 562 + num = ceph_decode_32(p); 563 + *p += num * 8; 564 + 565 + *p += 8; /* skip tier_of */ 566 + *p += 1; /* skip cache_mode */ 567 + 568 + pi->read_tier = ceph_decode_64(p); 569 + pi->write_tier = ceph_decode_64(p); 570 + } else { 571 + pi->read_tier = -1; 572 + pi->write_tier = -1; 573 + } 557 574 558 575 /* ignore the rest */ 559 576 ··· 1117 1090 EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1118 1091 1119 1092 /* 1120 - * calculate an object layout (i.e. pgid) from an oid, 1121 - * file_layout, and osdmap 1093 + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1094 + * called with target's (oloc, oid), since tiering isn't taken into 1095 + * account. 
1122 1096 */ 1123 - int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, 1124 - struct ceph_osdmap *osdmap, uint64_t pool) 1097 + int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1098 + struct ceph_object_locator *oloc, 1099 + struct ceph_object_id *oid, 1100 + struct ceph_pg *pg_out) 1125 1101 { 1126 - struct ceph_pg_pool_info *pool_info; 1102 + struct ceph_pg_pool_info *pi; 1127 1103 1128 - BUG_ON(!osdmap); 1129 - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); 1130 - if (!pool_info) 1104 + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1105 + if (!pi) 1131 1106 return -EIO; 1132 - pg->pool = pool; 1133 - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); 1134 1107 1135 - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); 1108 + pg_out->pool = oloc->pool; 1109 + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1110 + oid->name_len); 1111 + 1112 + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1113 + pg_out->pool, pg_out->seed); 1136 1114 return 0; 1137 1115 } 1138 - EXPORT_SYMBOL(ceph_calc_ceph_pg); 1116 + EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1117 + 1118 + static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, 1119 + int *result, int result_max, 1120 + const __u32 *weight, int weight_max) 1121 + { 1122 + int scratch[result_max * 3]; 1123 + 1124 + return crush_do_rule(map, ruleno, x, result, result_max, 1125 + weight, weight_max, scratch); 1126 + } 1139 1127 1140 1128 /* 1141 1129 * Calculate raw osd vector for the given pgid. 
Return pointer to osd ··· 1205 1163 pool->pgp_num_mask) + 1206 1164 (unsigned)pgid.pool; 1207 1165 } 1208 - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1209 - min_t(int, pool->size, *num), 1210 - osdmap->osd_weight); 1166 + r = crush_do_rule_ary(osdmap->crush, ruleno, pps, 1167 + osds, min_t(int, pool->size, *num), 1168 + osdmap->osd_weight, osdmap->max_osd); 1211 1169 if (r < 0) { 1212 1170 pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1213 1171 " size %d\n", r, pgid.pool, pool->crush_ruleset,