Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull ceph updates from Sage Weil:
"This is a big batch. From Ilya we have:

- rbd support for more than ~250 mapped devices (now uses same scheme
that SCSI does for device major/minor numbering)
- crush updates for new mapping behaviors (will be needed for coming
erasure coding support, among other things)
- preliminary support for tiered storage pools

There is also a big series fixing a pile of cephfs bugs with clustered
MDSs from Yan Zheng, ACL support for cephfs from Guangliang Zhao, ceph
fscache improvements from Li Wang, improved behavior when we get
ENOSPC from Josh Durgin, some readv/writev improvements from
Majianpeng, and the usual mix of small cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (76 commits)
ceph: cast PAGE_SIZE to size_t in ceph_sync_write()
ceph: fix dout() compile warnings in ceph_filemap_fault()
libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature
libceph: follow redirect replies from osds
libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid}
libceph: follow {read,write}_tier fields on osd request submission
libceph: add ceph_pg_pool_by_id()
libceph: CEPH_OSD_FLAG_* enum update
libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg()
libceph: introduce and start using oid abstraction
libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN
libceph: move ceph_file_layout helpers to ceph_fs.h
libceph: start using oloc abstraction
libceph: dout() is missing a newline
libceph: add ceph_kv{malloc,free}() and switch to them
libceph: support CEPH_FEATURE_EXPORT_PEER
ceph: add imported caps when handling cap export message
ceph: add open export target session helper
ceph: remove exported caps when handling cap import message
ceph: handle session flush message
...

+2264 -682
+26
Documentation/ABI/testing/sysfs-bus-rbd
··· 18 18 19 19 $ echo <dev-id> > /sys/bus/rbd/remove 20 20 21 + What: /sys/bus/rbd/add_single_major 22 + Date: December 2013 23 + KernelVersion: 3.14 24 + Contact: Sage Weil <sage@inktank.com> 25 + Description: Available only if rbd module is inserted with single_major 26 + parameter set to true. 27 + Usage is the same as for /sys/bus/rbd/add. If present, 28 + should be used instead of the latter: any attempts to use 29 + /sys/bus/rbd/add if /sys/bus/rbd/add_single_major is 30 + available will fail for backwards compatibility reasons. 31 + 32 + What: /sys/bus/rbd/remove_single_major 33 + Date: December 2013 34 + KernelVersion: 3.14 35 + Contact: Sage Weil <sage@inktank.com> 36 + Description: Available only if rbd module is inserted with single_major 37 + parameter set to true. 38 + Usage is the same as for /sys/bus/rbd/remove. If present, 39 + should be used instead of the latter: any attempts to use 40 + /sys/bus/rbd/remove if /sys/bus/rbd/remove_single_major is 41 + available will fail for backwards compatibility reasons. 42 + 21 43 Entries under /sys/bus/rbd/devices/<dev-id>/ 22 44 -------------------------------------------- 23 45 ··· 54 32 major 55 33 56 34 The block device major number. 35 + 36 + minor 37 + 38 + The block device minor number. (December 2013, since 3.14.) 57 39 58 40 name 59 41
+1 -1
MAINTAINERS
··· 7075 7075 RADOS BLOCK DEVICE (RBD) 7076 7076 M: Yehuda Sadeh <yehuda@inktank.com> 7077 7077 M: Sage Weil <sage@inktank.com> 7078 - M: Alex Elder <elder@inktank.com> 7078 + M: Alex Elder <elder@kernel.org> 7079 7079 M: ceph-devel@vger.kernel.org 7080 7080 W: http://ceph.com/ 7081 7081 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+205 -100
drivers/block/rbd.c
··· 41 41 #include <linux/fs.h> 42 42 #include <linux/blkdev.h> 43 43 #include <linux/slab.h> 44 + #include <linux/idr.h> 44 45 45 46 #include "rbd_types.h" 46 47 ··· 90 89 } 91 90 92 91 #define RBD_DRV_NAME "rbd" 93 - #define RBD_DRV_NAME_LONG "rbd (rados block device)" 94 92 95 - #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 93 + #define RBD_MINORS_PER_MAJOR 256 94 + #define RBD_SINGLE_MAJOR_PART_SHIFT 4 96 95 97 96 #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 98 97 #define RBD_MAX_SNAP_NAME_LEN \ ··· 324 323 int dev_id; /* blkdev unique id */ 325 324 326 325 int major; /* blkdev assigned major */ 326 + int minor; 327 327 struct gendisk *disk; /* blkdev's gendisk and rq */ 328 328 329 329 u32 image_format; /* Either 1 or 2 */ ··· 388 386 static struct kmem_cache *rbd_obj_request_cache; 389 387 static struct kmem_cache *rbd_segment_name_cache; 390 388 389 + static int rbd_major; 390 + static DEFINE_IDA(rbd_dev_id_ida); 391 + 392 + /* 393 + * Default to false for now, as single-major requires >= 0.75 version of 394 + * userspace rbd utility. 
395 + */ 396 + static bool single_major = false; 397 + module_param(single_major, bool, S_IRUGO); 398 + MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 399 + 391 400 static int rbd_img_request_submit(struct rbd_img_request *img_request); 392 401 393 402 static void rbd_dev_device_release(struct device *dev); ··· 407 394 size_t count); 408 395 static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 409 396 size_t count); 397 + static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 398 + size_t count); 399 + static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 400 + size_t count); 410 401 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 411 402 static void rbd_spec_put(struct rbd_spec *spec); 412 403 404 + static int rbd_dev_id_to_minor(int dev_id) 405 + { 406 + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 407 + } 408 + 409 + static int minor_to_rbd_dev_id(int minor) 410 + { 411 + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 412 + } 413 + 413 414 static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 414 415 static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 416 + static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 417 + static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 415 418 416 419 static struct attribute *rbd_bus_attrs[] = { 417 420 &bus_attr_add.attr, 418 421 &bus_attr_remove.attr, 422 + &bus_attr_add_single_major.attr, 423 + &bus_attr_remove_single_major.attr, 419 424 NULL, 420 425 }; 421 - ATTRIBUTE_GROUPS(rbd_bus); 426 + 427 + static umode_t rbd_bus_is_visible(struct kobject *kobj, 428 + struct attribute *attr, int index) 429 + { 430 + if (!single_major && 431 + (attr == &bus_attr_add_single_major.attr || 432 + attr == &bus_attr_remove_single_major.attr)) 433 + return 0; 434 + 435 + return attr->mode; 436 + } 437 + 438 + static const struct attribute_group rbd_bus_group = { 439 + .attrs = 
rbd_bus_attrs, 440 + .is_visible = rbd_bus_is_visible, 441 + }; 442 + __ATTRIBUTE_GROUPS(rbd_bus); 422 443 423 444 static struct bus_type rbd_bus_type = { 424 445 .name = "rbd", ··· 1088 1041 name_format = "%s.%012llx"; 1089 1042 if (rbd_dev->image_format == 2) 1090 1043 name_format = "%s.%016llx"; 1091 - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, 1044 + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 1092 1045 rbd_dev->header.object_prefix, segment); 1093 - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 1046 + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 1094 1047 pr_err("error formatting segment name for #%llu (%d)\n", 1095 1048 segment, ret); 1096 1049 kfree(name); ··· 1808 1761 osd_req->r_callback = rbd_osd_req_callback; 1809 1762 osd_req->r_priv = obj_request; 1810 1763 1811 - osd_req->r_oid_len = strlen(obj_request->object_name); 1812 - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1813 - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1814 - 1815 - osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1764 + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 1765 + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1816 1766 1817 1767 return osd_req; 1818 1768 } ··· 1846 1802 osd_req->r_callback = rbd_osd_req_callback; 1847 1803 osd_req->r_priv = obj_request; 1848 1804 1849 - osd_req->r_oid_len = strlen(obj_request->object_name); 1850 - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1851 - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1852 - 1853 - osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1805 + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 1806 + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1854 1807 1855 1808 return osd_req; 1856 1809 } ··· 2907 2866 * Request sync osd watch/unwatch. 
The value of "start" determines 2908 2867 * whether a watch request is being initiated or torn down. 2909 2868 */ 2910 - static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2869 + static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2911 2870 { 2912 2871 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2913 2872 struct rbd_obj_request *obj_request; ··· 2980 2939 rbd_obj_request_put(obj_request); 2981 2940 2982 2941 return ret; 2942 + } 2943 + 2944 + static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 2945 + { 2946 + return __rbd_dev_header_watch_sync(rbd_dev, true); 2947 + } 2948 + 2949 + static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 2950 + { 2951 + int ret; 2952 + 2953 + ret = __rbd_dev_header_watch_sync(rbd_dev, false); 2954 + if (ret) { 2955 + rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 2956 + ret); 2957 + } 2983 2958 } 2984 2959 2985 2960 /* ··· 3445 3388 u64 segment_size; 3446 3389 3447 3390 /* create gendisk info */ 3448 - disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3391 + disk = alloc_disk(single_major ? 
3392 + (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 3393 + RBD_MINORS_PER_MAJOR); 3449 3394 if (!disk) 3450 3395 return -ENOMEM; 3451 3396 3452 3397 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3453 3398 rbd_dev->dev_id); 3454 3399 disk->major = rbd_dev->major; 3455 - disk->first_minor = 0; 3400 + disk->first_minor = rbd_dev->minor; 3401 + if (single_major) 3402 + disk->flags |= GENHD_FL_EXT_DEVT; 3456 3403 disk->fops = &rbd_bd_ops; 3457 3404 disk->private_data = rbd_dev; 3458 3405 ··· 3528 3467 return sprintf(buf, "%d\n", rbd_dev->major); 3529 3468 3530 3469 return sprintf(buf, "(none)\n"); 3470 + } 3531 3471 3472 + static ssize_t rbd_minor_show(struct device *dev, 3473 + struct device_attribute *attr, char *buf) 3474 + { 3475 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3476 + 3477 + return sprintf(buf, "%d\n", rbd_dev->minor); 3532 3478 } 3533 3479 3534 3480 static ssize_t rbd_client_id_show(struct device *dev, ··· 3657 3589 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 3658 3590 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3659 3591 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3592 + static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3660 3593 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3661 3594 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 3662 3595 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); ··· 3671 3602 &dev_attr_size.attr, 3672 3603 &dev_attr_features.attr, 3673 3604 &dev_attr_major.attr, 3605 + &dev_attr_minor.attr, 3674 3606 &dev_attr_client_id.attr, 3675 3607 &dev_attr_pool.attr, 3676 3608 &dev_attr_pool_id.attr, ··· 4442 4372 device_unregister(&rbd_dev->dev); 4443 4373 } 4444 4374 4445 - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 4446 - 4447 4375 /* 4448 4376 * Get a unique rbd identifier for the given new rbd_dev, and add 4449 - * the rbd_dev to the global list. The minimum rbd id is 1. 
4377 + * the rbd_dev to the global list. 4450 4378 */ 4451 - static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4379 + static int rbd_dev_id_get(struct rbd_device *rbd_dev) 4452 4380 { 4453 - rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4381 + int new_dev_id; 4382 + 4383 + new_dev_id = ida_simple_get(&rbd_dev_id_ida, 4384 + 0, minor_to_rbd_dev_id(1 << MINORBITS), 4385 + GFP_KERNEL); 4386 + if (new_dev_id < 0) 4387 + return new_dev_id; 4388 + 4389 + rbd_dev->dev_id = new_dev_id; 4454 4390 4455 4391 spin_lock(&rbd_dev_list_lock); 4456 4392 list_add_tail(&rbd_dev->node, &rbd_dev_list); 4457 4393 spin_unlock(&rbd_dev_list_lock); 4458 - dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4459 - (unsigned long long) rbd_dev->dev_id); 4394 + 4395 + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); 4396 + 4397 + return 0; 4460 4398 } 4461 4399 4462 4400 /* ··· 4473 4395 */ 4474 4396 static void rbd_dev_id_put(struct rbd_device *rbd_dev) 4475 4397 { 4476 - struct list_head *tmp; 4477 - int rbd_id = rbd_dev->dev_id; 4478 - int max_id; 4479 - 4480 - rbd_assert(rbd_id > 0); 4481 - 4482 - dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4483 - (unsigned long long) rbd_dev->dev_id); 4484 4398 spin_lock(&rbd_dev_list_lock); 4485 4399 list_del_init(&rbd_dev->node); 4486 - 4487 - /* 4488 - * If the id being "put" is not the current maximum, there 4489 - * is nothing special we need to do. 4490 - */ 4491 - if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4492 - spin_unlock(&rbd_dev_list_lock); 4493 - return; 4494 - } 4495 - 4496 - /* 4497 - * We need to update the current maximum id. Search the 4498 - * list to find out what it is. We're more likely to find 4499 - * the maximum at the end, so search the list backward. 
4500 - */ 4501 - max_id = 0; 4502 - list_for_each_prev(tmp, &rbd_dev_list) { 4503 - struct rbd_device *rbd_dev; 4504 - 4505 - rbd_dev = list_entry(tmp, struct rbd_device, node); 4506 - if (rbd_dev->dev_id > max_id) 4507 - max_id = rbd_dev->dev_id; 4508 - } 4509 4400 spin_unlock(&rbd_dev_list_lock); 4510 4401 4511 - /* 4512 - * The max id could have been updated by rbd_dev_id_get(), in 4513 - * which case it now accurately reflects the new maximum. 4514 - * Be careful not to overwrite the maximum value in that 4515 - * case. 4516 - */ 4517 - atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4518 - dout(" max dev id has been reset\n"); 4402 + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4403 + 4404 + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4519 4405 } 4520 4406 4521 4407 /* ··· 4902 4860 { 4903 4861 int ret; 4904 4862 4905 - /* generate unique id: find highest unique id, add one */ 4906 - rbd_dev_id_get(rbd_dev); 4863 + /* Get an id and fill in device name. */ 4907 4864 4908 - /* Fill in the device name, now that we have its id. */ 4865 + ret = rbd_dev_id_get(rbd_dev); 4866 + if (ret) 4867 + return ret; 4868 + 4909 4869 BUILD_BUG_ON(DEV_NAME_LEN 4910 4870 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 4911 4871 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 4912 4872 4913 - /* Get our block major device number. */ 4873 + /* Record our major and minor device numbers. */ 4914 4874 4915 - ret = register_blkdev(0, rbd_dev->name); 4916 - if (ret < 0) 4917 - goto err_out_id; 4918 - rbd_dev->major = ret; 4875 + if (!single_major) { 4876 + ret = register_blkdev(0, rbd_dev->name); 4877 + if (ret < 0) 4878 + goto err_out_id; 4879 + 4880 + rbd_dev->major = ret; 4881 + rbd_dev->minor = 0; 4882 + } else { 4883 + rbd_dev->major = rbd_major; 4884 + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 4885 + } 4919 4886 4920 4887 /* Set up the blkdev mapping. 
*/ 4921 4888 ··· 4956 4905 err_out_disk: 4957 4906 rbd_free_disk(rbd_dev); 4958 4907 err_out_blkdev: 4959 - unregister_blkdev(rbd_dev->major, rbd_dev->name); 4908 + if (!single_major) 4909 + unregister_blkdev(rbd_dev->major, rbd_dev->name); 4960 4910 err_out_id: 4961 4911 rbd_dev_id_put(rbd_dev); 4962 4912 rbd_dev_mapping_clear(rbd_dev); ··· 5013 4961 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 5014 4962 { 5015 4963 int ret; 5016 - int tmp; 5017 4964 5018 4965 /* 5019 4966 * Get the id from the image id object. Unless there's an ··· 5031 4980 goto err_out_format; 5032 4981 5033 4982 if (mapping) { 5034 - ret = rbd_dev_header_watch_sync(rbd_dev, true); 4983 + ret = rbd_dev_header_watch_sync(rbd_dev); 5035 4984 if (ret) 5036 4985 goto out_header_name; 5037 4986 } ··· 5058 5007 err_out_probe: 5059 5008 rbd_dev_unprobe(rbd_dev); 5060 5009 err_out_watch: 5061 - if (mapping) { 5062 - tmp = rbd_dev_header_watch_sync(rbd_dev, false); 5063 - if (tmp) 5064 - rbd_warn(rbd_dev, "unable to tear down " 5065 - "watch request (%d)\n", tmp); 5066 - } 5010 + if (mapping) 5011 + rbd_dev_header_unwatch_sync(rbd_dev); 5067 5012 out_header_name: 5068 5013 kfree(rbd_dev->header_name); 5069 5014 rbd_dev->header_name = NULL; ··· 5073 5026 return ret; 5074 5027 } 5075 5028 5076 - static ssize_t rbd_add(struct bus_type *bus, 5077 - const char *buf, 5078 - size_t count) 5029 + static ssize_t do_rbd_add(struct bus_type *bus, 5030 + const char *buf, 5031 + size_t count) 5079 5032 { 5080 5033 struct rbd_device *rbd_dev = NULL; 5081 5034 struct ceph_options *ceph_opts = NULL; ··· 5137 5090 5138 5091 rc = rbd_dev_device_setup(rbd_dev); 5139 5092 if (rc) { 5093 + /* 5094 + * rbd_dev_header_unwatch_sync() can't be moved into 5095 + * rbd_dev_image_release() without refactoring, see 5096 + * commit 1f3ef78861ac. 
5097 + */ 5098 + rbd_dev_header_unwatch_sync(rbd_dev); 5140 5099 rbd_dev_image_release(rbd_dev); 5141 5100 goto err_out_module; 5142 5101 } ··· 5163 5110 return (ssize_t)rc; 5164 5111 } 5165 5112 5113 + static ssize_t rbd_add(struct bus_type *bus, 5114 + const char *buf, 5115 + size_t count) 5116 + { 5117 + if (single_major) 5118 + return -EINVAL; 5119 + 5120 + return do_rbd_add(bus, buf, count); 5121 + } 5122 + 5123 + static ssize_t rbd_add_single_major(struct bus_type *bus, 5124 + const char *buf, 5125 + size_t count) 5126 + { 5127 + return do_rbd_add(bus, buf, count); 5128 + } 5129 + 5166 5130 static void rbd_dev_device_release(struct device *dev) 5167 5131 { 5168 5132 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); ··· 5187 5117 rbd_free_disk(rbd_dev); 5188 5118 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5189 5119 rbd_dev_mapping_clear(rbd_dev); 5190 - unregister_blkdev(rbd_dev->major, rbd_dev->name); 5191 - rbd_dev->major = 0; 5120 + if (!single_major) 5121 + unregister_blkdev(rbd_dev->major, rbd_dev->name); 5192 5122 rbd_dev_id_put(rbd_dev); 5193 5123 rbd_dev_mapping_clear(rbd_dev); 5194 5124 } ··· 5219 5149 } 5220 5150 } 5221 5151 5222 - static ssize_t rbd_remove(struct bus_type *bus, 5223 - const char *buf, 5224 - size_t count) 5152 + static ssize_t do_rbd_remove(struct bus_type *bus, 5153 + const char *buf, 5154 + size_t count) 5225 5155 { 5226 5156 struct rbd_device *rbd_dev = NULL; 5227 5157 struct list_head *tmp; ··· 5261 5191 if (ret < 0 || already) 5262 5192 return ret; 5263 5193 5264 - ret = rbd_dev_header_watch_sync(rbd_dev, false); 5265 - if (ret) 5266 - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 5267 - 5194 + rbd_dev_header_unwatch_sync(rbd_dev); 5268 5195 /* 5269 5196 * flush remaining watch callbacks - these must be complete 5270 5197 * before the osd_client is shutdown 5271 5198 */ 5272 5199 dout("%s: flushing notifies", __func__); 5273 5200 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5201 + 5274 
5202 /* 5275 5203 * Don't free anything from rbd_dev->disk until after all 5276 5204 * notifies are completely processed. Otherwise ··· 5280 5212 module_put(THIS_MODULE); 5281 5213 5282 5214 return count; 5215 + } 5216 + 5217 + static ssize_t rbd_remove(struct bus_type *bus, 5218 + const char *buf, 5219 + size_t count) 5220 + { 5221 + if (single_major) 5222 + return -EINVAL; 5223 + 5224 + return do_rbd_remove(bus, buf, count); 5225 + } 5226 + 5227 + static ssize_t rbd_remove_single_major(struct bus_type *bus, 5228 + const char *buf, 5229 + size_t count) 5230 + { 5231 + return do_rbd_remove(bus, buf, count); 5283 5232 } 5284 5233 5285 5234 /* ··· 5344 5259 5345 5260 rbd_assert(!rbd_segment_name_cache); 5346 5261 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 5347 - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); 5262 + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 5348 5263 if (rbd_segment_name_cache) 5349 5264 return 0; 5350 5265 out_err: ··· 5380 5295 5381 5296 if (!libceph_compatible(NULL)) { 5382 5297 rbd_warn(NULL, "libceph incompatibility (quitting)"); 5383 - 5384 5298 return -EINVAL; 5385 5299 } 5300 + 5386 5301 rc = rbd_slab_init(); 5387 5302 if (rc) 5388 5303 return rc; 5304 + 5305 + if (single_major) { 5306 + rbd_major = register_blkdev(0, RBD_DRV_NAME); 5307 + if (rbd_major < 0) { 5308 + rc = rbd_major; 5309 + goto err_out_slab; 5310 + } 5311 + } 5312 + 5389 5313 rc = rbd_sysfs_init(); 5390 5314 if (rc) 5391 - rbd_slab_exit(); 5392 - else 5393 - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 5315 + goto err_out_blkdev; 5394 5316 5317 + if (single_major) 5318 + pr_info("loaded (major %d)\n", rbd_major); 5319 + else 5320 + pr_info("loaded\n"); 5321 + 5322 + return 0; 5323 + 5324 + err_out_blkdev: 5325 + if (single_major) 5326 + unregister_blkdev(rbd_major, RBD_DRV_NAME); 5327 + err_out_slab: 5328 + rbd_slab_exit(); 5395 5329 return rc; 5396 5330 } 5397 5331 5398 5332 static void __exit rbd_exit(void) 5399 5333 { 5400 5334 rbd_sysfs_cleanup(); 5335 + if 
(single_major) 5336 + unregister_blkdev(rbd_major, RBD_DRV_NAME); 5401 5337 rbd_slab_exit(); 5402 5338 } 5403 5339 ··· 5428 5322 MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5429 5323 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5430 5324 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5431 - MODULE_DESCRIPTION("rados block device"); 5432 - 5433 5325 /* following authorship retained from original osdblk.c */ 5434 5326 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5435 5327 5328 + MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 5436 5329 MODULE_LICENSE("GPL");
+13
fs/ceph/Kconfig
··· 25 25 caching support for Ceph clients using FS-Cache 26 26 27 27 endif 28 + 29 + config CEPH_FS_POSIX_ACL 30 + bool "Ceph POSIX Access Control Lists" 31 + depends on CEPH_FS 32 + select FS_POSIX_ACL 33 + help 34 + POSIX Access Control Lists (ACLs) support permissions for users and 35 + groups beyond the owner/group/world scheme. 36 + 37 + To learn more about Access Control Lists, visit the POSIX ACLs for 38 + Linux website <http://acl.bestbits.at/>. 39 + 40 + If you don't know what Access Control Lists are, say N
+1
fs/ceph/Makefile
··· 10 10 debugfs.o 11 11 12 12 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 13 + ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+332
fs/ceph/acl.c
··· 1 + /* 2 + * linux/fs/ceph/acl.c 3 + * 4 + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public 8 + * License v2 as published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 + * General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public 16 + * License along with this program; if not, write to the 17 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 + * Boston, MA 021110-1307, USA. 19 + */ 20 + 21 + #include <linux/ceph/ceph_debug.h> 22 + #include <linux/fs.h> 23 + #include <linux/string.h> 24 + #include <linux/xattr.h> 25 + #include <linux/posix_acl_xattr.h> 26 + #include <linux/posix_acl.h> 27 + #include <linux/sched.h> 28 + #include <linux/slab.h> 29 + 30 + #include "super.h" 31 + 32 + static inline void ceph_set_cached_acl(struct inode *inode, 33 + int type, struct posix_acl *acl) 34 + { 35 + struct ceph_inode_info *ci = ceph_inode(inode); 36 + 37 + spin_lock(&ci->i_ceph_lock); 38 + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 39 + set_cached_acl(inode, type, acl); 40 + spin_unlock(&ci->i_ceph_lock); 41 + } 42 + 43 + static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, 44 + int type) 45 + { 46 + struct ceph_inode_info *ci = ceph_inode(inode); 47 + struct posix_acl *acl = ACL_NOT_CACHED; 48 + 49 + spin_lock(&ci->i_ceph_lock); 50 + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) 51 + acl = get_cached_acl(inode, type); 52 + spin_unlock(&ci->i_ceph_lock); 53 + 54 + return acl; 55 + } 56 + 57 + void ceph_forget_all_cached_acls(struct inode *inode) 58 + { 59 + forget_all_cached_acls(inode); 60 + } 61 + 62 + 
struct posix_acl *ceph_get_acl(struct inode *inode, int type) 63 + { 64 + int size; 65 + const char *name; 66 + char *value = NULL; 67 + struct posix_acl *acl; 68 + 69 + if (!IS_POSIXACL(inode)) 70 + return NULL; 71 + 72 + acl = ceph_get_cached_acl(inode, type); 73 + if (acl != ACL_NOT_CACHED) 74 + return acl; 75 + 76 + switch (type) { 77 + case ACL_TYPE_ACCESS: 78 + name = POSIX_ACL_XATTR_ACCESS; 79 + break; 80 + case ACL_TYPE_DEFAULT: 81 + name = POSIX_ACL_XATTR_DEFAULT; 82 + break; 83 + default: 84 + BUG(); 85 + } 86 + 87 + size = __ceph_getxattr(inode, name, "", 0); 88 + if (size > 0) { 89 + value = kzalloc(size, GFP_NOFS); 90 + if (!value) 91 + return ERR_PTR(-ENOMEM); 92 + size = __ceph_getxattr(inode, name, value, size); 93 + } 94 + 95 + if (size > 0) 96 + acl = posix_acl_from_xattr(&init_user_ns, value, size); 97 + else if (size == -ERANGE || size == -ENODATA || size == 0) 98 + acl = NULL; 99 + else 100 + acl = ERR_PTR(-EIO); 101 + 102 + kfree(value); 103 + 104 + if (!IS_ERR(acl)) 105 + ceph_set_cached_acl(inode, type, acl); 106 + 107 + return acl; 108 + } 109 + 110 + static int ceph_set_acl(struct dentry *dentry, struct inode *inode, 111 + struct posix_acl *acl, int type) 112 + { 113 + int ret = 0, size = 0; 114 + const char *name = NULL; 115 + char *value = NULL; 116 + struct iattr newattrs; 117 + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; 118 + 119 + if (acl) { 120 + ret = posix_acl_valid(acl); 121 + if (ret < 0) 122 + goto out; 123 + } 124 + 125 + switch (type) { 126 + case ACL_TYPE_ACCESS: 127 + name = POSIX_ACL_XATTR_ACCESS; 128 + if (acl) { 129 + ret = posix_acl_equiv_mode(acl, &new_mode); 130 + if (ret < 0) 131 + goto out; 132 + if (ret == 0) 133 + acl = NULL; 134 + } 135 + break; 136 + case ACL_TYPE_DEFAULT: 137 + if (!S_ISDIR(inode->i_mode)) { 138 + ret = acl ? 
-EINVAL : 0; 139 + goto out; 140 + } 141 + name = POSIX_ACL_XATTR_DEFAULT; 142 + break; 143 + default: 144 + ret = -EINVAL; 145 + goto out; 146 + } 147 + 148 + if (acl) { 149 + size = posix_acl_xattr_size(acl->a_count); 150 + value = kmalloc(size, GFP_NOFS); 151 + if (!value) { 152 + ret = -ENOMEM; 153 + goto out; 154 + } 155 + 156 + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); 157 + if (ret < 0) 158 + goto out_free; 159 + } 160 + 161 + if (new_mode != old_mode) { 162 + newattrs.ia_mode = new_mode; 163 + newattrs.ia_valid = ATTR_MODE; 164 + ret = ceph_setattr(dentry, &newattrs); 165 + if (ret) 166 + goto out_free; 167 + } 168 + 169 + if (value) 170 + ret = __ceph_setxattr(dentry, name, value, size, 0); 171 + else 172 + ret = __ceph_removexattr(dentry, name); 173 + 174 + if (ret) { 175 + if (new_mode != old_mode) { 176 + newattrs.ia_mode = old_mode; 177 + newattrs.ia_valid = ATTR_MODE; 178 + ceph_setattr(dentry, &newattrs); 179 + } 180 + goto out_free; 181 + } 182 + 183 + ceph_set_cached_acl(inode, type, acl); 184 + 185 + out_free: 186 + kfree(value); 187 + out: 188 + return ret; 189 + } 190 + 191 + int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 192 + { 193 + struct posix_acl *acl = NULL; 194 + int ret = 0; 195 + 196 + if (!S_ISLNK(inode->i_mode)) { 197 + if (IS_POSIXACL(dir)) { 198 + acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); 199 + if (IS_ERR(acl)) { 200 + ret = PTR_ERR(acl); 201 + goto out; 202 + } 203 + } 204 + 205 + if (!acl) 206 + inode->i_mode &= ~current_umask(); 207 + } 208 + 209 + if (IS_POSIXACL(dir) && acl) { 210 + if (S_ISDIR(inode->i_mode)) { 211 + ret = ceph_set_acl(dentry, inode, acl, 212 + ACL_TYPE_DEFAULT); 213 + if (ret) 214 + goto out_release; 215 + } 216 + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); 217 + if (ret < 0) 218 + goto out; 219 + else if (ret > 0) 220 + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); 221 + else 222 + cache_no_acl(inode); 223 + } else { 224 + 
cache_no_acl(inode); 225 + } 226 + 227 + out_release: 228 + posix_acl_release(acl); 229 + out: 230 + return ret; 231 + } 232 + 233 + int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 234 + { 235 + struct posix_acl *acl; 236 + int ret = 0; 237 + 238 + if (S_ISLNK(inode->i_mode)) { 239 + ret = -EOPNOTSUPP; 240 + goto out; 241 + } 242 + 243 + if (!IS_POSIXACL(inode)) 244 + goto out; 245 + 246 + acl = ceph_get_acl(inode, ACL_TYPE_ACCESS); 247 + if (IS_ERR_OR_NULL(acl)) { 248 + ret = PTR_ERR(acl); 249 + goto out; 250 + } 251 + 252 + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); 253 + if (ret) 254 + goto out; 255 + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); 256 + posix_acl_release(acl); 257 + out: 258 + return ret; 259 + } 260 + 261 + static int ceph_xattr_acl_get(struct dentry *dentry, const char *name, 262 + void *value, size_t size, int type) 263 + { 264 + struct posix_acl *acl; 265 + int ret = 0; 266 + 267 + if (!IS_POSIXACL(dentry->d_inode)) 268 + return -EOPNOTSUPP; 269 + 270 + acl = ceph_get_acl(dentry->d_inode, type); 271 + if (IS_ERR(acl)) 272 + return PTR_ERR(acl); 273 + if (acl == NULL) 274 + return -ENODATA; 275 + 276 + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); 277 + posix_acl_release(acl); 278 + 279 + return ret; 280 + } 281 + 282 + static int ceph_xattr_acl_set(struct dentry *dentry, const char *name, 283 + const void *value, size_t size, int flags, int type) 284 + { 285 + int ret = 0; 286 + struct posix_acl *acl = NULL; 287 + 288 + if (!inode_owner_or_capable(dentry->d_inode)) { 289 + ret = -EPERM; 290 + goto out; 291 + } 292 + 293 + if (!IS_POSIXACL(dentry->d_inode)) { 294 + ret = -EOPNOTSUPP; 295 + goto out; 296 + } 297 + 298 + if (value) { 299 + acl = posix_acl_from_xattr(&init_user_ns, value, size); 300 + if (IS_ERR(acl)) { 301 + ret = PTR_ERR(acl); 302 + goto out; 303 + } 304 + 305 + if (acl) { 306 + ret = posix_acl_valid(acl); 307 + if (ret) 308 + goto out_release; 309 + } 310 + } 311 + 312 + ret 
= ceph_set_acl(dentry, dentry->d_inode, acl, type); 313 + 314 + out_release: 315 + posix_acl_release(acl); 316 + out: 317 + return ret; 318 + } 319 + 320 + const struct xattr_handler ceph_xattr_acl_default_handler = { 321 + .prefix = POSIX_ACL_XATTR_DEFAULT, 322 + .flags = ACL_TYPE_DEFAULT, 323 + .get = ceph_xattr_acl_get, 324 + .set = ceph_xattr_acl_set, 325 + }; 326 + 327 + const struct xattr_handler ceph_xattr_acl_access_handler = { 328 + .prefix = POSIX_ACL_XATTR_ACCESS, 329 + .flags = ACL_TYPE_ACCESS, 330 + .get = ceph_xattr_acl_get, 331 + .set = ceph_xattr_acl_set, 332 + };
+81 -12
fs/ceph/addr.c
··· 209 209 err = 0; 210 210 if (err < 0) { 211 211 SetPageError(page); 212 + ceph_fscache_readpage_cancel(inode, page); 212 213 goto out; 213 214 } else { 214 215 if (err < PAGE_CACHE_SIZE) { ··· 257 256 for (i = 0; i < num_pages; i++) { 258 257 struct page *page = osd_data->pages[i]; 259 258 259 + if (rc < 0) 260 + goto unlock; 260 261 if (bytes < (int)PAGE_CACHE_SIZE) { 261 262 /* zero (remainder of) page */ 262 263 int s = bytes < 0 ? 0 : bytes; ··· 269 266 flush_dcache_page(page); 270 267 SetPageUptodate(page); 271 268 ceph_readpage_to_fscache(inode, page); 269 + unlock: 272 270 unlock_page(page); 273 271 page_cache_release(page); 274 272 bytes -= PAGE_CACHE_SIZE; ··· 1211 1207 /* 1212 1208 * vm ops 1213 1209 */ 1210 + static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1211 + { 1212 + struct inode *inode = file_inode(vma->vm_file); 1213 + struct ceph_inode_info *ci = ceph_inode(inode); 1214 + struct ceph_file_info *fi = vma->vm_file->private_data; 1215 + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; 1216 + int want, got, ret; 1217 + 1218 + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", 1219 + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); 1220 + if (fi->fmode & CEPH_FILE_MODE_LAZY) 1221 + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1222 + else 1223 + want = CEPH_CAP_FILE_CACHE; 1224 + while (1) { 1225 + got = 0; 1226 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 1227 + if (ret == 0) 1228 + break; 1229 + if (ret != -ERESTARTSYS) { 1230 + WARN_ON(1); 1231 + return VM_FAULT_SIGBUS; 1232 + } 1233 + } 1234 + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1235 + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1236 + 1237 + ret = filemap_fault(vma, vmf); 1238 + 1239 + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1240 + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1241 + ceph_put_cap_refs(ci, got); 1242 + 1243 + return ret; 1244 + 
} 1214 1245 1215 1246 /* 1216 1247 * Reuse write_begin here for simplicity. ··· 1253 1214 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1254 1215 { 1255 1216 struct inode *inode = file_inode(vma->vm_file); 1256 - struct page *page = vmf->page; 1217 + struct ceph_inode_info *ci = ceph_inode(inode); 1218 + struct ceph_file_info *fi = vma->vm_file->private_data; 1257 1219 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1220 + struct page *page = vmf->page; 1258 1221 loff_t off = page_offset(page); 1259 - loff_t size, len; 1260 - int ret; 1222 + loff_t size = i_size_read(inode); 1223 + size_t len; 1224 + int want, got, ret; 1261 1225 1262 - /* Update time before taking page lock */ 1263 - file_update_time(vma->vm_file); 1264 - 1265 - size = i_size_read(inode); 1266 1226 if (off + PAGE_CACHE_SIZE <= size) 1267 1227 len = PAGE_CACHE_SIZE; 1268 1228 else 1269 1229 len = size & ~PAGE_CACHE_MASK; 1270 1230 1271 - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, 1272 - off, len, page, page->index); 1231 + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", 1232 + inode, ceph_vinop(inode), off, len, size); 1233 + if (fi->fmode & CEPH_FILE_MODE_LAZY) 1234 + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 1235 + else 1236 + want = CEPH_CAP_FILE_BUFFER; 1237 + while (1) { 1238 + got = 0; 1239 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); 1240 + if (ret == 0) 1241 + break; 1242 + if (ret != -ERESTARTSYS) { 1243 + WARN_ON(1); 1244 + return VM_FAULT_SIGBUS; 1245 + } 1246 + } 1247 + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", 1248 + inode, off, len, ceph_cap_string(got)); 1249 + 1250 + /* Update time before taking page lock */ 1251 + file_update_time(vma->vm_file); 1273 1252 1274 1253 lock_page(page); 1275 1254 ··· 1309 1252 ret = VM_FAULT_SIGBUS; 1310 1253 } 1311 1254 out: 1312 - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); 1313 - if (ret != 
VM_FAULT_LOCKED) 1255 + if (ret != VM_FAULT_LOCKED) { 1314 1256 unlock_page(page); 1257 + } else { 1258 + int dirty; 1259 + spin_lock(&ci->i_ceph_lock); 1260 + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1261 + spin_unlock(&ci->i_ceph_lock); 1262 + if (dirty) 1263 + __mark_inode_dirty(inode, dirty); 1264 + } 1265 + 1266 + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", 1267 + inode, off, len, ceph_cap_string(got), ret); 1268 + ceph_put_cap_refs(ci, got); 1269 + 1315 1270 return ret; 1316 1271 } 1317 1272 1318 1273 static struct vm_operations_struct ceph_vmops = { 1319 - .fault = filemap_fault, 1274 + .fault = ceph_filemap_fault, 1320 1275 .page_mkwrite = ceph_page_mkwrite, 1321 1276 .remap_pages = generic_file_remap_pages, 1322 1277 };
+13
fs/ceph/cache.h
··· 67 67 return fscache_maybe_release_page(ci->fscache, page, gfp); 68 68 } 69 69 70 + static inline void ceph_fscache_readpage_cancel(struct inode *inode, 71 + struct page *page) 72 + { 73 + struct ceph_inode_info *ci = ceph_inode(inode); 74 + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) 75 + __fscache_uncache_page(ci->fscache, page); 76 + } 77 + 70 78 static inline void ceph_fscache_readpages_cancel(struct inode *inode, 71 79 struct list_head *pages) 72 80 { ··· 151 143 static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) 152 144 { 153 145 return 1; 146 + } 147 + 148 + static inline void ceph_fscache_readpage_cancel(struct inode *inode, 149 + struct page *page) 150 + { 154 151 } 155 152 156 153 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+225 -119
fs/ceph/caps.c
··· 555 555 cap->ci = ci; 556 556 __insert_cap_node(ci, cap); 557 557 558 - /* clear out old exporting info? (i.e. on cap import) */ 559 - if (ci->i_cap_exporting_mds == mds) { 560 - ci->i_cap_exporting_issued = 0; 561 - ci->i_cap_exporting_mseq = 0; 562 - ci->i_cap_exporting_mds = -1; 563 - } 564 - 565 558 /* add to session cap list */ 566 559 cap->session = session; 567 560 spin_lock(&session->s_cap_lock); 568 561 list_add_tail(&cap->session_caps, &session->s_caps); 569 562 session->s_nr_caps++; 570 563 spin_unlock(&session->s_cap_lock); 571 - } else if (new_cap) 572 - ceph_put_cap(mdsc, new_cap); 564 + } else { 565 + if (new_cap) 566 + ceph_put_cap(mdsc, new_cap); 567 + 568 + /* 569 + * auth mds of the inode changed. we received the cap export 570 + * message, but still haven't received the cap import message. 571 + * handle_cap_export() updated the new auth MDS' cap. 572 + * 573 + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing 574 + * a message that was send before the cap import message. So 575 + * don't remove caps. 
576 + */ 577 + if (ceph_seq_cmp(seq, cap->seq) <= 0) { 578 + WARN_ON(cap != ci->i_auth_cap); 579 + WARN_ON(cap->cap_id != cap_id); 580 + seq = cap->seq; 581 + mseq = cap->mseq; 582 + issued |= cap->issued; 583 + flags |= CEPH_CAP_FLAG_AUTH; 584 + } 585 + } 573 586 574 587 if (!ci->i_snap_realm) { 575 588 /* ··· 624 611 if (ci->i_auth_cap == NULL || 625 612 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 626 613 ci->i_auth_cap = cap; 627 - } else if (ci->i_auth_cap == cap) { 628 - ci->i_auth_cap = NULL; 629 - spin_lock(&mdsc->cap_dirty_lock); 630 - if (!list_empty(&ci->i_dirty_item)) { 631 - dout(" moving %p to cap_dirty_migrating\n", inode); 632 - list_move(&ci->i_dirty_item, 633 - &mdsc->cap_dirty_migrating); 634 - } 635 - spin_unlock(&mdsc->cap_dirty_lock); 614 + ci->i_cap_exporting_issued = 0; 615 + } else { 616 + WARN_ON(ci->i_auth_cap == cap); 636 617 } 637 618 638 619 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", ··· 635 628 cap->cap_id = cap_id; 636 629 cap->issued = issued; 637 630 cap->implemented |= issued; 638 - if (mseq > cap->mseq) 631 + if (ceph_seq_cmp(mseq, cap->mseq) > 0) 639 632 cap->mds_wanted = wanted; 640 633 else 641 634 cap->mds_wanted |= wanted; ··· 823 816 824 817 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 825 818 cap = rb_entry(p, struct ceph_cap, ci_node); 826 - if (cap != ocap && __cap_is_valid(cap) && 819 + if (cap != ocap && 827 820 (cap->implemented & ~cap->issued & mask)) 828 821 return 1; 829 822 } ··· 895 888 */ 896 889 static int __ceph_is_any_caps(struct ceph_inode_info *ci) 897 890 { 898 - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 891 + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; 892 + } 893 + 894 + int ceph_is_any_caps(struct inode *inode) 895 + { 896 + struct ceph_inode_info *ci = ceph_inode(inode); 897 + int ret; 898 + 899 + spin_lock(&ci->i_ceph_lock); 900 + ret = __ceph_is_any_caps(ci); 901 + spin_unlock(&ci->i_ceph_lock); 902 + 903 + return ret; 
899 904 } 900 905 901 906 /* ··· 1402 1383 ci->i_snap_realm->cached_context); 1403 1384 dout(" inode %p now dirty snapc %p auth cap %p\n", 1404 1385 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1386 + WARN_ON(!ci->i_auth_cap); 1405 1387 BUG_ON(!list_empty(&ci->i_dirty_item)); 1406 1388 spin_lock(&mdsc->cap_dirty_lock); 1407 - if (ci->i_auth_cap) 1408 - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1409 - else 1410 - list_add(&ci->i_dirty_item, 1411 - &mdsc->cap_dirty_migrating); 1389 + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1412 1390 spin_unlock(&mdsc->cap_dirty_lock); 1413 1391 if (ci->i_flushing_caps == 0) { 1414 1392 ihold(inode); ··· 1751 1735 /* 1752 1736 * Try to flush dirty caps back to the auth mds. 1753 1737 */ 1754 - static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1755 - unsigned *flush_tid) 1738 + static int try_flush_caps(struct inode *inode, unsigned *flush_tid) 1756 1739 { 1757 1740 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1758 1741 struct ceph_inode_info *ci = ceph_inode(inode); 1759 - int unlock_session = session ? 
0 : 1; 1760 1742 int flushing = 0; 1743 + struct ceph_mds_session *session = NULL; 1761 1744 1762 1745 retry: 1763 1746 spin_lock(&ci->i_ceph_lock); ··· 1770 1755 int want = __ceph_caps_wanted(ci); 1771 1756 int delayed; 1772 1757 1773 - if (!session) { 1758 + if (!session || session != cap->session) { 1774 1759 spin_unlock(&ci->i_ceph_lock); 1760 + if (session) 1761 + mutex_unlock(&session->s_mutex); 1775 1762 session = cap->session; 1776 1763 mutex_lock(&session->s_mutex); 1777 1764 goto retry; 1778 1765 } 1779 - BUG_ON(session != cap->session); 1780 1766 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1781 1767 goto out; 1782 1768 ··· 1796 1780 out: 1797 1781 spin_unlock(&ci->i_ceph_lock); 1798 1782 out_unlocked: 1799 - if (session && unlock_session) 1783 + if (session) 1800 1784 mutex_unlock(&session->s_mutex); 1801 1785 return flushing; 1802 1786 } ··· 1881 1865 return ret; 1882 1866 mutex_lock(&inode->i_mutex); 1883 1867 1884 - dirty = try_flush_caps(inode, NULL, &flush_tid); 1868 + dirty = try_flush_caps(inode, &flush_tid); 1885 1869 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1886 1870 1887 1871 /* ··· 1916 1900 1917 1901 dout("write_inode %p wait=%d\n", inode, wait); 1918 1902 if (wait) { 1919 - dirty = try_flush_caps(inode, NULL, &flush_tid); 1903 + dirty = try_flush_caps(inode, &flush_tid); 1920 1904 if (dirty) 1921 1905 err = wait_event_interruptible(ci->i_cap_wq, 1922 1906 caps_are_flushed(inode, flush_tid)); ··· 2366 2350 d_prune_aliases(inode); 2367 2351 /* 2368 2352 * For non-directory inode, d_find_alias() only returns 2369 - * connected dentry. After calling d_invalidate(), the 2370 - * dentry become disconnected. 2353 + * hashed dentry. After calling d_invalidate(), the 2354 + * dentry becomes unhashed. 2371 2355 * 2372 2356 * For directory inode, d_find_alias() can return 2373 - * disconnected dentry. But directory inode should have 2357 + * unhashed dentry. But directory inode should have 2374 2358 * one alias at most. 
2375 2359 */ 2376 2360 while ((dn = d_find_alias(inode))) { ··· 2424 2408 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2425 2409 inode->i_size); 2426 2410 2411 + 2412 + /* 2413 + * auth mds of the inode changed. we received the cap export message, 2414 + * but still haven't received the cap import message. handle_cap_export 2415 + * updated the new auth MDS' cap. 2416 + * 2417 + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message 2418 + * that was sent before the cap import message. So don't remove caps. 2419 + */ 2420 + if (ceph_seq_cmp(seq, cap->seq) <= 0) { 2421 + WARN_ON(cap != ci->i_auth_cap); 2422 + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); 2423 + seq = cap->seq; 2424 + newcaps |= cap->issued; 2425 + } 2426 + 2427 2427 /* 2428 2428 * If CACHE is being revoked, and we have no dirty buffers, 2429 2429 * try to invalidate (once). (If there are dirty buffers, we ··· 2466 2434 issued |= implemented | __ceph_caps_dirty(ci); 2467 2435 2468 2436 cap->cap_gen = session->s_cap_gen; 2437 + cap->seq = seq; 2469 2438 2470 2439 __check_cap_issue(ci, cap, newcaps); 2471 2440 ··· 2497 2464 ceph_buffer_put(ci->i_xattrs.blob); 2498 2465 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2499 2466 ci->i_xattrs.version = version; 2467 + ceph_forget_all_cached_acls(inode); 2500 2468 } 2501 2469 } 2502 2470 ··· 2516 2482 ceph_fill_file_time(inode, issued, 2517 2483 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2518 2484 &atime); 2485 + 2486 + 2487 + /* file layout may have changed */ 2488 + ci->i_layout = grant->layout; 2519 2489 2520 2490 /* max size increase? */ 2521 2491 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { ··· 2548 2510 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) 2549 2511 check_caps = 1; 2550 2512 } 2551 - 2552 - cap->seq = seq; 2553 - 2554 - /* file layout may have changed */ 2555 - ci->i_layout = grant->layout; 2556 2513 2557 2514 /* revocation, grant, or no-op? 
*/ 2558 2515 if (cap->issued & ~newcaps) { ··· 2774 2741 * caller holds s_mutex 2775 2742 */ 2776 2743 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2777 - struct ceph_mds_session *session, 2778 - int *open_target_sessions) 2744 + struct ceph_mds_cap_peer *ph, 2745 + struct ceph_mds_session *session) 2779 2746 { 2780 2747 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2748 + struct ceph_mds_session *tsession = NULL; 2749 + struct ceph_cap *cap, *tcap; 2781 2750 struct ceph_inode_info *ci = ceph_inode(inode); 2782 - int mds = session->s_mds; 2751 + u64 t_cap_id; 2783 2752 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2784 - struct ceph_cap *cap = NULL, *t; 2785 - struct rb_node *p; 2786 - int remember = 1; 2753 + unsigned t_seq, t_mseq; 2754 + int target, issued; 2755 + int mds = session->s_mds; 2787 2756 2788 - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2789 - inode, ci, mds, mseq); 2790 - 2791 - spin_lock(&ci->i_ceph_lock); 2792 - 2793 - /* make sure we haven't seen a higher mseq */ 2794 - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2795 - t = rb_entry(p, struct ceph_cap, ci_node); 2796 - if (ceph_seq_cmp(t->mseq, mseq) > 0) { 2797 - dout(" higher mseq on cap from mds%d\n", 2798 - t->session->s_mds); 2799 - remember = 0; 2800 - } 2801 - if (t->session->s_mds == mds) 2802 - cap = t; 2757 + if (ph) { 2758 + t_cap_id = le64_to_cpu(ph->cap_id); 2759 + t_seq = le32_to_cpu(ph->seq); 2760 + t_mseq = le32_to_cpu(ph->mseq); 2761 + target = le32_to_cpu(ph->mds); 2762 + } else { 2763 + t_cap_id = t_seq = t_mseq = 0; 2764 + target = -1; 2803 2765 } 2804 2766 2805 - if (cap) { 2806 - if (remember) { 2807 - /* make note */ 2808 - ci->i_cap_exporting_mds = mds; 2809 - ci->i_cap_exporting_mseq = mseq; 2810 - ci->i_cap_exporting_issued = cap->issued; 2767 + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", 2768 + inode, ci, mds, mseq, target); 2769 + retry: 2770 + spin_lock(&ci->i_ceph_lock); 
2771 + cap = __get_cap_for_mds(ci, mds); 2772 + if (!cap) 2773 + goto out_unlock; 2811 2774 2812 - /* 2813 - * make sure we have open sessions with all possible 2814 - * export targets, so that we get the matching IMPORT 2815 - */ 2816 - *open_target_sessions = 1; 2775 + if (target < 0) { 2776 + __ceph_remove_cap(cap, false); 2777 + goto out_unlock; 2778 + } 2817 2779 2818 - /* 2819 - * we can't flush dirty caps that we've seen the 2820 - * EXPORT but no IMPORT for 2821 - */ 2822 - spin_lock(&mdsc->cap_dirty_lock); 2823 - if (!list_empty(&ci->i_dirty_item)) { 2824 - dout(" moving %p to cap_dirty_migrating\n", 2825 - inode); 2826 - list_move(&ci->i_dirty_item, 2827 - &mdsc->cap_dirty_migrating); 2780 + /* 2781 + * now we know we haven't received the cap import message yet 2782 + * because the exported cap still exist. 2783 + */ 2784 + 2785 + issued = cap->issued; 2786 + WARN_ON(issued != cap->implemented); 2787 + 2788 + tcap = __get_cap_for_mds(ci, target); 2789 + if (tcap) { 2790 + /* already have caps from the target */ 2791 + if (tcap->cap_id != t_cap_id || 2792 + ceph_seq_cmp(tcap->seq, t_seq) < 0) { 2793 + dout(" updating import cap %p mds%d\n", tcap, target); 2794 + tcap->cap_id = t_cap_id; 2795 + tcap->seq = t_seq - 1; 2796 + tcap->issue_seq = t_seq - 1; 2797 + tcap->mseq = t_mseq; 2798 + tcap->issued |= issued; 2799 + tcap->implemented |= issued; 2800 + if (cap == ci->i_auth_cap) 2801 + ci->i_auth_cap = tcap; 2802 + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { 2803 + spin_lock(&mdsc->cap_dirty_lock); 2804 + list_move_tail(&ci->i_flushing_item, 2805 + &tcap->session->s_cap_flushing); 2806 + spin_unlock(&mdsc->cap_dirty_lock); 2828 2807 } 2829 - spin_unlock(&mdsc->cap_dirty_lock); 2830 2808 } 2831 2809 __ceph_remove_cap(cap, false); 2810 + goto out_unlock; 2832 2811 } 2833 - /* else, we already released it */ 2812 + 2813 + if (tsession) { 2814 + int flag = (cap == ci->i_auth_cap) ? 
CEPH_CAP_FLAG_AUTH : 0; 2815 + spin_unlock(&ci->i_ceph_lock); 2816 + /* add placeholder for the export tagert */ 2817 + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, 2818 + t_seq - 1, t_mseq, (u64)-1, flag, NULL); 2819 + goto retry; 2820 + } 2834 2821 2835 2822 spin_unlock(&ci->i_ceph_lock); 2823 + mutex_unlock(&session->s_mutex); 2824 + 2825 + /* open target session */ 2826 + tsession = ceph_mdsc_open_export_target_session(mdsc, target); 2827 + if (!IS_ERR(tsession)) { 2828 + if (mds > target) { 2829 + mutex_lock(&session->s_mutex); 2830 + mutex_lock_nested(&tsession->s_mutex, 2831 + SINGLE_DEPTH_NESTING); 2832 + } else { 2833 + mutex_lock(&tsession->s_mutex); 2834 + mutex_lock_nested(&session->s_mutex, 2835 + SINGLE_DEPTH_NESTING); 2836 + } 2837 + ceph_add_cap_releases(mdsc, tsession); 2838 + } else { 2839 + WARN_ON(1); 2840 + tsession = NULL; 2841 + target = -1; 2842 + } 2843 + goto retry; 2844 + 2845 + out_unlock: 2846 + spin_unlock(&ci->i_ceph_lock); 2847 + mutex_unlock(&session->s_mutex); 2848 + if (tsession) { 2849 + mutex_unlock(&tsession->s_mutex); 2850 + ceph_put_mds_session(tsession); 2851 + } 2836 2852 } 2837 2853 2838 2854 /* ··· 2892 2810 */ 2893 2811 static void handle_cap_import(struct ceph_mds_client *mdsc, 2894 2812 struct inode *inode, struct ceph_mds_caps *im, 2813 + struct ceph_mds_cap_peer *ph, 2895 2814 struct ceph_mds_session *session, 2896 2815 void *snaptrace, int snaptrace_len) 2897 2816 { 2898 2817 struct ceph_inode_info *ci = ceph_inode(inode); 2818 + struct ceph_cap *cap; 2899 2819 int mds = session->s_mds; 2900 2820 unsigned issued = le32_to_cpu(im->caps); 2901 2821 unsigned wanted = le32_to_cpu(im->wanted); ··· 2905 2821 unsigned mseq = le32_to_cpu(im->migrate_seq); 2906 2822 u64 realmino = le64_to_cpu(im->realm); 2907 2823 u64 cap_id = le64_to_cpu(im->cap_id); 2824 + u64 p_cap_id; 2825 + int peer; 2908 2826 2909 - if (ci->i_cap_exporting_mds >= 0 && 2910 - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2911 - 
dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2912 - " - cleared exporting from mds%d\n", 2913 - inode, ci, mds, mseq, 2914 - ci->i_cap_exporting_mds); 2915 - ci->i_cap_exporting_issued = 0; 2916 - ci->i_cap_exporting_mseq = 0; 2917 - ci->i_cap_exporting_mds = -1; 2918 - 2919 - spin_lock(&mdsc->cap_dirty_lock); 2920 - if (!list_empty(&ci->i_dirty_item)) { 2921 - dout(" moving %p back to cap_dirty\n", inode); 2922 - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2923 - } 2924 - spin_unlock(&mdsc->cap_dirty_lock); 2827 + if (ph) { 2828 + p_cap_id = le64_to_cpu(ph->cap_id); 2829 + peer = le32_to_cpu(ph->mds); 2925 2830 } else { 2926 - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", 2927 - inode, ci, mds, mseq); 2831 + p_cap_id = 0; 2832 + peer = -1; 2928 2833 } 2834 + 2835 + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", 2836 + inode, ci, mds, mseq, peer); 2837 + 2838 + spin_lock(&ci->i_ceph_lock); 2839 + cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; 2840 + if (cap && cap->cap_id == p_cap_id) { 2841 + dout(" remove export cap %p mds%d flags %d\n", 2842 + cap, peer, ph->flags); 2843 + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && 2844 + (cap->seq != le32_to_cpu(ph->seq) || 2845 + cap->mseq != le32_to_cpu(ph->mseq))) { 2846 + pr_err("handle_cap_import: mismatched seq/mseq: " 2847 + "ino (%llx.%llx) mds%d seq %d mseq %d " 2848 + "importer mds%d has peer seq %d mseq %d\n", 2849 + ceph_vinop(inode), peer, cap->seq, 2850 + cap->mseq, mds, le32_to_cpu(ph->seq), 2851 + le32_to_cpu(ph->mseq)); 2852 + } 2853 + ci->i_cap_exporting_issued = cap->issued; 2854 + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); 2855 + } 2856 + 2857 + /* make sure we re-request max_size, if necessary */ 2858 + ci->i_wanted_max_size = 0; 2859 + ci->i_requested_max_size = 0; 2860 + spin_unlock(&ci->i_ceph_lock); 2929 2861 2930 2862 down_write(&mdsc->snap_rwsem); 2931 2863 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, ··· 2953 2853 
kick_flushing_inode_caps(mdsc, session, inode); 2954 2854 up_read(&mdsc->snap_rwsem); 2955 2855 2956 - /* make sure we re-request max_size, if necessary */ 2957 - spin_lock(&ci->i_ceph_lock); 2958 - ci->i_wanted_max_size = 0; /* reset */ 2959 - ci->i_requested_max_size = 0; 2960 - spin_unlock(&ci->i_ceph_lock); 2961 2856 } 2962 2857 2963 2858 /* ··· 2970 2875 struct ceph_inode_info *ci; 2971 2876 struct ceph_cap *cap; 2972 2877 struct ceph_mds_caps *h; 2878 + struct ceph_mds_cap_peer *peer = NULL; 2973 2879 int mds = session->s_mds; 2974 2880 int op; 2975 2881 u32 seq, mseq; ··· 2981 2885 void *snaptrace; 2982 2886 size_t snaptrace_len; 2983 2887 void *flock; 2888 + void *end; 2984 2889 u32 flock_len; 2985 - int open_target_sessions = 0; 2986 2890 2987 2891 dout("handle_caps from mds%d\n", mds); 2988 2892 2989 2893 /* decode */ 2894 + end = msg->front.iov_base + msg->front.iov_len; 2990 2895 tid = le64_to_cpu(msg->hdr.tid); 2991 2896 if (msg->front.iov_len < sizeof(*h)) 2992 2897 goto bad; ··· 3005 2908 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3006 2909 3007 2910 if (le16_to_cpu(msg->hdr.version) >= 2) { 3008 - void *p, *end; 3009 - 3010 - p = snaptrace + snaptrace_len; 3011 - end = msg->front.iov_base + msg->front.iov_len; 2911 + void *p = snaptrace + snaptrace_len; 3012 2912 ceph_decode_32_safe(&p, end, flock_len, bad); 2913 + if (p + flock_len > end) 2914 + goto bad; 3013 2915 flock = p; 3014 2916 } else { 3015 2917 flock = NULL; 3016 2918 flock_len = 0; 2919 + } 2920 + 2921 + if (le16_to_cpu(msg->hdr.version) >= 3) { 2922 + if (op == CEPH_CAP_OP_IMPORT) { 2923 + void *p = flock + flock_len; 2924 + if (p + sizeof(*peer) > end) 2925 + goto bad; 2926 + peer = p; 2927 + } else if (op == CEPH_CAP_OP_EXPORT) { 2928 + /* recorded in unused fields */ 2929 + peer = (void *)&h->size; 2930 + } 3017 2931 } 3018 2932 3019 2933 mutex_lock(&session->s_mutex); ··· 3059 2951 goto done; 3060 2952 3061 2953 case CEPH_CAP_OP_EXPORT: 3062 - handle_cap_export(inode, h, 
session, &open_target_sessions); 3063 - goto done; 2954 + handle_cap_export(inode, h, peer, session); 2955 + goto done_unlocked; 3064 2956 3065 2957 case CEPH_CAP_OP_IMPORT: 3066 - handle_cap_import(mdsc, inode, h, session, 2958 + handle_cap_import(mdsc, inode, h, peer, session, 3067 2959 snaptrace, snaptrace_len); 3068 2960 } 3069 2961 ··· 3115 3007 done_unlocked: 3116 3008 if (inode) 3117 3009 iput(inode); 3118 - if (open_target_sessions) 3119 - ceph_mdsc_open_export_target_sessions(mdsc, session); 3120 3010 return; 3121 3011 3122 3012 bad:
+13 -3
fs/ceph/dir.c
··· 693 693 if (!err && !req->r_reply_info.head->is_dentry) 694 694 err = ceph_handle_notrace_create(dir, dentry); 695 695 ceph_mdsc_put_request(req); 696 + 697 + if (!err) 698 + err = ceph_init_acl(dentry, dentry->d_inode, dir); 699 + 696 700 if (err) 697 701 d_drop(dentry); 698 702 return err; ··· 1041 1037 valid = 1; 1042 1038 } else if (dentry_lease_is_valid(dentry) || 1043 1039 dir_lease_is_valid(dir, dentry)) { 1044 - valid = 1; 1040 + if (dentry->d_inode) 1041 + valid = ceph_is_any_caps(dentry->d_inode); 1042 + else 1043 + valid = 1; 1045 1044 } 1046 1045 1047 1046 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1048 - if (valid) 1047 + if (valid) { 1049 1048 ceph_dentry_lru_touch(dentry); 1050 - else 1049 + } else { 1050 + ceph_dir_clear_complete(dir); 1051 1051 d_drop(dentry); 1052 + } 1052 1053 iput(dir); 1053 1054 return valid; 1054 1055 } ··· 1302 1293 .getxattr = ceph_getxattr, 1303 1294 .listxattr = ceph_listxattr, 1304 1295 .removexattr = ceph_removexattr, 1296 + .get_acl = ceph_get_acl, 1305 1297 .mknod = ceph_mknod, 1306 1298 .symlink = ceph_symlink, 1307 1299 .mkdir = ceph_mkdir,
+309 -126
fs/ceph/file.c
··· 408 408 * 409 409 * If the read spans object boundary, just do multiple reads. 410 410 */ 411 - static ssize_t ceph_sync_read(struct file *file, char __user *data, 412 - unsigned len, loff_t *poff, int *checkeof) 411 + static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, 412 + int *checkeof) 413 413 { 414 + struct file *file = iocb->ki_filp; 414 415 struct inode *inode = file_inode(file); 415 416 struct page **pages; 416 - u64 off = *poff; 417 + u64 off = iocb->ki_pos; 417 418 int num_pages, ret; 419 + size_t len = i->count; 418 420 419 - dout("sync_read on file %p %llu~%u %s\n", file, off, len, 421 + dout("sync_read on file %p %llu~%u %s\n", file, off, 422 + (unsigned)len, 420 423 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 421 - 422 - if (file->f_flags & O_DIRECT) { 423 - num_pages = calc_pages_for((unsigned long)data, len); 424 - pages = ceph_get_direct_page_vector(data, num_pages, true); 425 - } else { 426 - num_pages = calc_pages_for(off, len); 427 - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 428 - } 429 - if (IS_ERR(pages)) 430 - return PTR_ERR(pages); 431 - 432 424 /* 433 425 * flush any page cache pages in this range. this 434 426 * will make concurrent normal and sync io slow, 435 427 * but it will at least behave sensibly when they are 436 428 * in sequence. 
437 429 */ 438 - ret = filemap_write_and_wait(inode->i_mapping); 430 + ret = filemap_write_and_wait_range(inode->i_mapping, off, 431 + off + len); 439 432 if (ret < 0) 440 - goto done; 433 + return ret; 441 434 442 - ret = striped_read(inode, off, len, pages, num_pages, checkeof, 443 - file->f_flags & O_DIRECT, 444 - (unsigned long)data & ~PAGE_MASK); 435 + if (file->f_flags & O_DIRECT) { 436 + while (iov_iter_count(i)) { 437 + void __user *data = i->iov[0].iov_base + i->iov_offset; 438 + size_t len = i->iov[0].iov_len - i->iov_offset; 445 439 446 - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 447 - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 448 - if (ret >= 0) 449 - *poff = off + ret; 440 + num_pages = calc_pages_for((unsigned long)data, len); 441 + pages = ceph_get_direct_page_vector(data, 442 + num_pages, true); 443 + if (IS_ERR(pages)) 444 + return PTR_ERR(pages); 450 445 451 - done: 452 - if (file->f_flags & O_DIRECT) 453 - ceph_put_page_vector(pages, num_pages, true); 454 - else 446 + ret = striped_read(inode, off, len, 447 + pages, num_pages, checkeof, 448 + 1, (unsigned long)data & ~PAGE_MASK); 449 + ceph_put_page_vector(pages, num_pages, true); 450 + 451 + if (ret <= 0) 452 + break; 453 + off += ret; 454 + iov_iter_advance(i, ret); 455 + if (ret < len) 456 + break; 457 + } 458 + } else { 459 + num_pages = calc_pages_for(off, len); 460 + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 461 + if (IS_ERR(pages)) 462 + return PTR_ERR(pages); 463 + ret = striped_read(inode, off, len, pages, 464 + num_pages, checkeof, 0, 0); 465 + if (ret > 0) { 466 + int l, k = 0; 467 + size_t left = len = ret; 468 + 469 + while (left) { 470 + void __user *data = i->iov[0].iov_base 471 + + i->iov_offset; 472 + l = min(i->iov[0].iov_len - i->iov_offset, 473 + left); 474 + 475 + ret = ceph_copy_page_vector_to_user(&pages[k], 476 + data, off, 477 + l); 478 + if (ret > 0) { 479 + iov_iter_advance(i, ret); 480 + left -= ret; 481 + off += ret; 482 + k = 
calc_pages_for(iocb->ki_pos, 483 + len - left + 1) - 1; 484 + BUG_ON(k >= num_pages && left); 485 + } else 486 + break; 487 + } 488 + } 455 489 ceph_release_page_vector(pages, num_pages); 490 + } 491 + 492 + if (off > iocb->ki_pos) { 493 + ret = off - iocb->ki_pos; 494 + iocb->ki_pos = off; 495 + } 496 + 456 497 dout("sync_read result %d\n", ret); 457 498 return ret; 458 499 } ··· 530 489 } 531 490 } 532 491 492 + 533 493 /* 534 - * Synchronous write, straight from __user pointer or user pages (if 535 - * O_DIRECT). 494 + * Synchronous write, straight from __user pointer or user pages. 536 495 * 537 496 * If write spans object boundary, just do multiple writes. (For a 538 497 * correct atomic write, we should e.g. take write locks on all 539 498 * objects, rollback on failure, etc.) 540 499 */ 541 - static ssize_t ceph_sync_write(struct file *file, const char __user *data, 542 - size_t left, loff_t pos, loff_t *ppos) 500 + static ssize_t 501 + ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, 502 + unsigned long nr_segs, size_t count) 543 503 { 504 + struct file *file = iocb->ki_filp; 544 505 struct inode *inode = file_inode(file); 545 506 struct ceph_inode_info *ci = ceph_inode(inode); 546 507 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 547 508 struct ceph_snap_context *snapc; 548 509 struct ceph_vino vino; 549 510 struct ceph_osd_request *req; 550 - int num_ops = 1; 551 511 struct page **pages; 552 512 int num_pages; 553 - u64 len; 554 513 int written = 0; 555 514 int flags; 556 515 int check_caps = 0; 557 - int page_align, io_align; 558 - unsigned long buf_align; 516 + int page_align; 559 517 int ret; 560 518 struct timespec mtime = CURRENT_TIME; 561 - bool own_pages = false; 519 + loff_t pos = iocb->ki_pos; 520 + struct iov_iter i; 562 521 563 522 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 564 523 return -EROFS; 565 524 566 - dout("sync_write on file %p %lld~%u %s\n", file, pos, 567 - (unsigned)left, (file->f_flags & 
O_DIRECT) ? "O_DIRECT" : ""); 525 + dout("sync_direct_write on file %p %lld~%u\n", file, pos, 526 + (unsigned)count); 568 527 569 - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 528 + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 570 529 if (ret < 0) 571 530 return ret; 572 531 573 532 ret = invalidate_inode_pages2_range(inode->i_mapping, 574 533 pos >> PAGE_CACHE_SHIFT, 575 - (pos + left) >> PAGE_CACHE_SHIFT); 534 + (pos + count) >> PAGE_CACHE_SHIFT); 576 535 if (ret < 0) 577 536 dout("invalidate_inode_pages2_range returned %d\n", ret); 578 537 579 538 flags = CEPH_OSD_FLAG_ORDERSNAP | 580 539 CEPH_OSD_FLAG_ONDISK | 581 540 CEPH_OSD_FLAG_WRITE; 582 - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) 583 - flags |= CEPH_OSD_FLAG_ACK; 584 - else 585 - num_ops++; /* Also include a 'startsync' command. */ 586 541 587 - /* 588 - * we may need to do multiple writes here if we span an object 589 - * boundary. this isn't atomic, unfortunately. :( 590 - */ 591 - more: 592 - io_align = pos & ~PAGE_MASK; 593 - buf_align = (unsigned long)data & ~PAGE_MASK; 594 - len = left; 542 + iov_iter_init(&i, iov, nr_segs, count, 0); 595 543 596 - snapc = ci->i_snap_realm->cached_context; 597 - vino = ceph_vino(inode); 598 - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 599 - vino, pos, &len, num_ops, 600 - CEPH_OSD_OP_WRITE, flags, snapc, 601 - ci->i_truncate_seq, ci->i_truncate_size, 602 - false); 603 - if (IS_ERR(req)) 604 - return PTR_ERR(req); 544 + while (iov_iter_count(&i) > 0) { 545 + void __user *data = i.iov->iov_base + i.iov_offset; 546 + u64 len = i.iov->iov_len - i.iov_offset; 605 547 606 - /* write from beginning of first page, regardless of io alignment */ 607 - page_align = file->f_flags & O_DIRECT ? 
buf_align : io_align; 608 - num_pages = calc_pages_for(page_align, len); 609 - if (file->f_flags & O_DIRECT) { 548 + page_align = (unsigned long)data & ~PAGE_MASK; 549 + 550 + snapc = ci->i_snap_realm->cached_context; 551 + vino = ceph_vino(inode); 552 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 553 + vino, pos, &len, 554 + 2,/*include a 'startsync' command*/ 555 + CEPH_OSD_OP_WRITE, flags, snapc, 556 + ci->i_truncate_seq, 557 + ci->i_truncate_size, 558 + false); 559 + if (IS_ERR(req)) { 560 + ret = PTR_ERR(req); 561 + goto out; 562 + } 563 + 564 + num_pages = calc_pages_for(page_align, len); 610 565 pages = ceph_get_direct_page_vector(data, num_pages, false); 611 566 if (IS_ERR(pages)) { 612 567 ret = PTR_ERR(pages); ··· 614 577 * may block. 615 578 */ 616 579 truncate_inode_pages_range(inode->i_mapping, pos, 617 - (pos+len) | (PAGE_CACHE_SIZE-1)); 618 - } else { 580 + (pos+len) | (PAGE_CACHE_SIZE-1)); 581 + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, 582 + false, false); 583 + 584 + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 585 + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 586 + 587 + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 588 + if (!ret) 589 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 590 + 591 + ceph_put_page_vector(pages, num_pages, false); 592 + 593 + out: 594 + ceph_osdc_put_request(req); 595 + if (ret == 0) { 596 + pos += len; 597 + written += len; 598 + iov_iter_advance(&i, (size_t)len); 599 + 600 + if (pos > i_size_read(inode)) { 601 + check_caps = ceph_inode_set_size(inode, pos); 602 + if (check_caps) 603 + ceph_check_caps(ceph_inode(inode), 604 + CHECK_CAPS_AUTHONLY, 605 + NULL); 606 + } 607 + } else 608 + break; 609 + } 610 + 611 + if (ret != -EOLDSNAPC && written > 0) { 612 + iocb->ki_pos = pos; 613 + ret = written; 614 + } 615 + return ret; 616 + } 617 + 618 + 619 + /* 620 + * Synchronous write, straight from __user pointer or user pages. 
621 + * 622 + * If write spans object boundary, just do multiple writes. (For a 623 + * correct atomic write, we should e.g. take write locks on all 624 + * objects, rollback on failure, etc.) 625 + */ 626 + static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, 627 + unsigned long nr_segs, size_t count) 628 + { 629 + struct file *file = iocb->ki_filp; 630 + struct inode *inode = file_inode(file); 631 + struct ceph_inode_info *ci = ceph_inode(inode); 632 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 633 + struct ceph_snap_context *snapc; 634 + struct ceph_vino vino; 635 + struct ceph_osd_request *req; 636 + struct page **pages; 637 + u64 len; 638 + int num_pages; 639 + int written = 0; 640 + int flags; 641 + int check_caps = 0; 642 + int ret; 643 + struct timespec mtime = CURRENT_TIME; 644 + loff_t pos = iocb->ki_pos; 645 + struct iov_iter i; 646 + 647 + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 648 + return -EROFS; 649 + 650 + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); 651 + 652 + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); 653 + if (ret < 0) 654 + return ret; 655 + 656 + ret = invalidate_inode_pages2_range(inode->i_mapping, 657 + pos >> PAGE_CACHE_SHIFT, 658 + (pos + count) >> PAGE_CACHE_SHIFT); 659 + if (ret < 0) 660 + dout("invalidate_inode_pages2_range returned %d\n", ret); 661 + 662 + flags = CEPH_OSD_FLAG_ORDERSNAP | 663 + CEPH_OSD_FLAG_ONDISK | 664 + CEPH_OSD_FLAG_WRITE | 665 + CEPH_OSD_FLAG_ACK; 666 + 667 + iov_iter_init(&i, iov, nr_segs, count, 0); 668 + 669 + while ((len = iov_iter_count(&i)) > 0) { 670 + size_t left; 671 + int n; 672 + 673 + snapc = ci->i_snap_realm->cached_context; 674 + vino = ceph_vino(inode); 675 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 676 + vino, pos, &len, 1, 677 + CEPH_OSD_OP_WRITE, flags, snapc, 678 + ci->i_truncate_seq, 679 + ci->i_truncate_size, 680 + false); 681 + if (IS_ERR(req)) { 682 + ret = PTR_ERR(req); 
683 + goto out; 684 + } 685 + 686 + /* 687 + * write from beginning of first page, 688 + * regardless of io alignment 689 + */ 690 + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 691 + 619 692 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 620 693 if (IS_ERR(pages)) { 621 694 ret = PTR_ERR(pages); 622 695 goto out; 623 696 } 624 - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 697 + 698 + left = len; 699 + for (n = 0; n < num_pages; n++) { 700 + size_t plen = min_t(size_t, left, PAGE_SIZE); 701 + ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); 702 + if (ret != plen) { 703 + ret = -EFAULT; 704 + break; 705 + } 706 + left -= ret; 707 + iov_iter_advance(&i, ret); 708 + } 709 + 625 710 if (ret < 0) { 626 711 ceph_release_page_vector(pages, num_pages); 627 712 goto out; 628 713 } 629 714 630 - if ((file->f_flags & O_SYNC) == 0) { 631 - /* get a second commit callback */ 632 - req->r_unsafe_callback = ceph_sync_write_unsafe; 633 - req->r_inode = inode; 634 - own_pages = true; 635 - } 636 - } 637 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, 638 - false, own_pages); 715 + /* get a second commit callback */ 716 + req->r_unsafe_callback = ceph_sync_write_unsafe; 717 + req->r_inode = inode; 639 718 640 - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 641 - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 719 + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 720 + false, true); 642 721 643 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 644 - if (!ret) 645 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 722 + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 723 + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 646 724 647 - if (file->f_flags & O_DIRECT) 648 - ceph_put_page_vector(pages, num_pages, false); 649 - else if (file->f_flags & O_SYNC) 650 - ceph_release_page_vector(pages, num_pages); 725 + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 726 + if 
(!ret) 727 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 651 728 652 729 out: 653 - ceph_osdc_put_request(req); 654 - if (ret == 0) { 655 - pos += len; 656 - written += len; 657 - left -= len; 658 - data += len; 659 - if (left) 660 - goto more; 730 + ceph_osdc_put_request(req); 731 + if (ret == 0) { 732 + pos += len; 733 + written += len; 661 734 735 + if (pos > i_size_read(inode)) { 736 + check_caps = ceph_inode_set_size(inode, pos); 737 + if (check_caps) 738 + ceph_check_caps(ceph_inode(inode), 739 + CHECK_CAPS_AUTHONLY, 740 + NULL); 741 + } 742 + } else 743 + break; 744 + } 745 + 746 + if (ret != -EOLDSNAPC && written > 0) { 662 747 ret = written; 663 - *ppos = pos; 664 - if (pos > i_size_read(inode)) 665 - check_caps = ceph_inode_set_size(inode, pos); 666 - if (check_caps) 667 - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, 668 - NULL); 669 - } else if (ret != -EOLDSNAPC && written > 0) { 670 - ret = written; 748 + iocb->ki_pos = pos; 671 749 } 672 750 return ret; 673 751 } ··· 799 647 { 800 648 struct file *filp = iocb->ki_filp; 801 649 struct ceph_file_info *fi = filp->private_data; 802 - loff_t *ppos = &iocb->ki_pos; 803 - size_t len = iov->iov_len; 650 + size_t len = iocb->ki_nbytes; 804 651 struct inode *inode = file_inode(filp); 805 652 struct ceph_inode_info *ci = ceph_inode(inode); 806 - void __user *base = iov->iov_base; 807 653 ssize_t ret; 808 654 int want, got = 0; 809 655 int checkeof = 0, read = 0; 810 656 811 - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 812 - inode, ceph_vinop(inode), pos, (unsigned)len, inode); 813 657 again: 658 + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 659 + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); 660 + 814 661 if (fi->fmode & CEPH_FILE_MODE_LAZY) 815 662 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 816 663 else 817 664 want = CEPH_CAP_FILE_CACHE; 818 665 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 819 666 if (ret < 
0) 820 - goto out; 821 - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 822 - inode, ceph_vinop(inode), pos, (unsigned)len, 823 - ceph_cap_string(got)); 667 + return ret; 824 668 825 669 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 826 670 (iocb->ki_filp->f_flags & O_DIRECT) || 827 - (fi->flags & CEPH_F_SYNC)) 828 - /* hmm, this isn't really async... */ 829 - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 830 - else 831 - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 671 + (fi->flags & CEPH_F_SYNC)) { 672 + struct iov_iter i; 832 673 674 + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 675 + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 676 + ceph_cap_string(got)); 677 + 678 + if (!read) { 679 + ret = generic_segment_checks(iov, &nr_segs, 680 + &len, VERIFY_WRITE); 681 + if (ret) 682 + goto out; 683 + } 684 + 685 + iov_iter_init(&i, iov, nr_segs, len, read); 686 + 687 + /* hmm, this isn't really async... */ 688 + ret = ceph_sync_read(iocb, &i, &checkeof); 689 + } else { 690 + /* 691 + * We can't modify the content of iov, 692 + * so we only read from beginning. 693 + */ 694 + if (read) { 695 + iocb->ki_pos = pos; 696 + len = iocb->ki_nbytes; 697 + read = 0; 698 + } 699 + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 700 + inode, ceph_vinop(inode), pos, (unsigned)len, 701 + ceph_cap_string(got)); 702 + 703 + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 704 + } 833 705 out: 834 706 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 835 707 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 836 708 ceph_put_cap_refs(ci, got); 837 709 838 710 if (checkeof && ret >= 0) { 839 - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 711 + int statret = ceph_do_getattr(inode, 712 + CEPH_STAT_CAP_SIZE); 840 713 841 714 /* hit EOF or hole? 
*/ 842 - if (statret == 0 && *ppos < inode->i_size) { 843 - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 715 + if (statret == 0 && iocb->ki_pos < inode->i_size && 716 + ret < len) { 717 + dout("sync_read hit hole, ppos %lld < size %lld" 718 + ", reading more\n", iocb->ki_pos, 719 + inode->i_size); 720 + 844 721 read += ret; 845 - base += ret; 846 722 len -= ret; 847 723 checkeof = 0; 848 724 goto again; 849 725 } 850 726 } 727 + 851 728 if (ret >= 0) 852 729 ret += read; 853 730 ··· 953 772 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 954 773 955 774 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 956 - (iocb->ki_filp->f_flags & O_DIRECT) || 957 - (fi->flags & CEPH_F_SYNC)) { 775 + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { 958 776 mutex_unlock(&inode->i_mutex); 959 - written = ceph_sync_write(file, iov->iov_base, count, 960 - pos, &iocb->ki_pos); 777 + if (file->f_flags & O_DIRECT) 778 + written = ceph_sync_direct_write(iocb, iov, 779 + nr_segs, count); 780 + else 781 + written = ceph_sync_write(iocb, iov, nr_segs, count); 961 782 if (written == -EOLDSNAPC) { 962 783 dout("aio_write %p %llx.%llx %llu~%u" 963 784 "got EOLDSNAPC, retrying\n", ··· 1201 1018 loff_t offset, loff_t length) 1202 1019 { 1203 1020 struct ceph_file_info *fi = file->private_data; 1204 - struct inode *inode = file->f_dentry->d_inode; 1021 + struct inode *inode = file_inode(file); 1205 1022 struct ceph_inode_info *ci = ceph_inode(inode); 1206 1023 struct ceph_osd_client *osdc = 1207 1024 &ceph_inode_to_client(inode)->client->osdc;
+27 -6
fs/ceph/inode.c
··· 95 95 .getxattr = ceph_getxattr, 96 96 .listxattr = ceph_listxattr, 97 97 .removexattr = ceph_removexattr, 98 + .get_acl = ceph_get_acl, 98 99 }; 99 100 100 101 ··· 336 335 ci->i_hold_caps_min = 0; 337 336 ci->i_hold_caps_max = 0; 338 337 INIT_LIST_HEAD(&ci->i_cap_delay_list); 339 - ci->i_cap_exporting_mds = 0; 340 - ci->i_cap_exporting_mseq = 0; 341 - ci->i_cap_exporting_issued = 0; 342 338 INIT_LIST_HEAD(&ci->i_cap_snaps); 343 339 ci->i_head_snapc = NULL; 344 340 ci->i_snap_caps = 0; 341 + ci->i_cap_exporting_issued = 0; 345 342 346 343 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 347 344 ci->i_nr_by_mode[i] = 0; ··· 433 434 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 434 435 435 436 call_rcu(&inode->i_rcu, ceph_i_callback); 437 + } 438 + 439 + int ceph_drop_inode(struct inode *inode) 440 + { 441 + /* 442 + * Positve dentry and corresponding inode are always accompanied 443 + * in MDS reply. So no need to keep inode in the cache after 444 + * dropping all its aliases. 445 + */ 446 + return 1; 436 447 } 437 448 438 449 /* ··· 679 670 memcpy(ci->i_xattrs.blob->vec.iov_base, 680 671 iinfo->xattr_data, iinfo->xattr_len); 681 672 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 673 + ceph_forget_all_cached_acls(inode); 682 674 xattr_blob = NULL; 683 675 } 684 676 ··· 1464 1454 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1465 1455 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1466 1456 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1467 - /* nevermind! 
*/ 1457 + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1458 + check = 1; 1468 1459 spin_unlock(&ci->i_ceph_lock); 1469 1460 mutex_unlock(&ci->i_truncate_mutex); 1470 1461 goto out; ··· 1486 1475 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", 1487 1476 inode, orig_gen, ci->i_rdcache_gen, 1488 1477 ci->i_rdcache_revoking); 1478 + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1479 + check = 1; 1489 1480 } 1490 1481 spin_unlock(&ci->i_ceph_lock); 1491 1482 mutex_unlock(&ci->i_truncate_mutex); 1492 - 1483 + out: 1493 1484 if (check) 1494 1485 ceph_check_caps(ci, 0, NULL); 1495 - out: 1496 1486 iput(inode); 1497 1487 } 1498 1488 ··· 1614 1602 .getxattr = ceph_getxattr, 1615 1603 .listxattr = ceph_listxattr, 1616 1604 .removexattr = ceph_removexattr, 1605 + .get_acl = ceph_get_acl, 1617 1606 }; 1618 1607 1619 1608 /* ··· 1688 1675 dirtied |= CEPH_CAP_AUTH_EXCL; 1689 1676 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 1690 1677 attr->ia_mode != inode->i_mode) { 1678 + inode->i_mode = attr->ia_mode; 1691 1679 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 1692 1680 mask |= CEPH_SETATTR_MODE; 1693 1681 release |= CEPH_CAP_AUTH_SHARED; ··· 1804 1790 if (inode_dirty_flags) 1805 1791 __mark_inode_dirty(inode, inode_dirty_flags); 1806 1792 1793 + if (ia_valid & ATTR_MODE) { 1794 + err = ceph_acl_chmod(dentry, inode); 1795 + if (err) 1796 + goto out_put; 1797 + } 1798 + 1807 1799 if (mask) { 1808 1800 req->r_inode = inode; 1809 1801 ihold(inode); ··· 1829 1809 return err; 1830 1810 out: 1831 1811 spin_unlock(&ci->i_ceph_lock); 1812 + out_put: 1832 1813 ceph_mdsc_put_request(req); 1833 1814 return err; 1834 1815 }
+6 -2
fs/ceph/ioctl.c
··· 183 183 struct ceph_inode_info *ci = ceph_inode(inode); 184 184 struct ceph_osd_client *osdc = 185 185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 186 + struct ceph_object_locator oloc; 187 + struct ceph_object_id oid; 186 188 u64 len = 1, olen; 187 189 u64 tmp; 188 190 struct ceph_pg pgid; ··· 213 211 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 214 212 ceph_ino(inode), dl.object_no); 215 213 216 - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, 217 - ceph_file_layout_pg_pool(ci->i_layout)); 214 + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 + ceph_oid_set_name(&oid, dl.object_name); 216 + 217 + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 218 if (r < 0) { 219 219 up_read(&osdc->map_sem); 220 220 return r;
+84 -48
fs/ceph/mds_client.c
··· 63 63 */ 64 64 static int parse_reply_info_in(void **p, void *end, 65 65 struct ceph_mds_reply_info_in *info, 66 - int features) 66 + u64 features) 67 67 { 68 68 int err = -EIO; 69 69 ··· 98 98 */ 99 99 static int parse_reply_info_trace(void **p, void *end, 100 100 struct ceph_mds_reply_info_parsed *info, 101 - int features) 101 + u64 features) 102 102 { 103 103 int err; 104 104 ··· 145 145 */ 146 146 static int parse_reply_info_dir(void **p, void *end, 147 147 struct ceph_mds_reply_info_parsed *info, 148 - int features) 148 + u64 features) 149 149 { 150 150 u32 num, i = 0; 151 151 int err; ··· 217 217 */ 218 218 static int parse_reply_info_filelock(void **p, void *end, 219 219 struct ceph_mds_reply_info_parsed *info, 220 - int features) 220 + u64 features) 221 221 { 222 222 if (*p + sizeof(*info->filelock_reply) > end) 223 223 goto bad; ··· 238 238 */ 239 239 static int parse_reply_info_create(void **p, void *end, 240 240 struct ceph_mds_reply_info_parsed *info, 241 - int features) 241 + u64 features) 242 242 { 243 243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { 244 244 if (*p == end) { ··· 262 262 */ 263 263 static int parse_reply_info_extra(void **p, void *end, 264 264 struct ceph_mds_reply_info_parsed *info, 265 - int features) 265 + u64 features) 266 266 { 267 267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 268 268 return parse_reply_info_filelock(p, end, info, features); ··· 280 280 */ 281 281 static int parse_reply_info(struct ceph_msg *msg, 282 282 struct ceph_mds_reply_info_parsed *info, 283 - int features) 283 + u64 features) 284 284 { 285 285 void *p, *end; 286 286 u32 len; ··· 713 713 struct dentry *dn = get_nonsnap_parent(parent); 714 714 inode = dn->d_inode; 715 715 dout("__choose_mds using nonsnap parent %p\n", inode); 716 - } else if (req->r_dentry->d_inode) { 716 + } else { 717 717 /* dentry target */ 718 718 inode = req->r_dentry->d_inode; 719 - } else { 720 - /* dir + name */ 721 - inode = dir; 722 - hash = ceph_dentry_hash(dir, 
req->r_dentry); 723 - is_hash = true; 719 + if (!inode || mode == USE_AUTH_MDS) { 720 + /* dir + name */ 721 + inode = dir; 722 + hash = ceph_dentry_hash(dir, req->r_dentry); 723 + is_hash = true; 724 + } 724 725 } 725 726 } 726 727 ··· 847 846 * 848 847 * called under mdsc->mutex 849 848 */ 849 + static struct ceph_mds_session * 850 + __open_export_target_session(struct ceph_mds_client *mdsc, int target) 851 + { 852 + struct ceph_mds_session *session; 853 + 854 + session = __ceph_lookup_mds_session(mdsc, target); 855 + if (!session) { 856 + session = register_session(mdsc, target); 857 + if (IS_ERR(session)) 858 + return session; 859 + } 860 + if (session->s_state == CEPH_MDS_SESSION_NEW || 861 + session->s_state == CEPH_MDS_SESSION_CLOSING) 862 + __open_session(mdsc, session); 863 + 864 + return session; 865 + } 866 + 867 + struct ceph_mds_session * 868 + ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) 869 + { 870 + struct ceph_mds_session *session; 871 + 872 + dout("open_export_target_session to mds%d\n", target); 873 + 874 + mutex_lock(&mdsc->mutex); 875 + session = __open_export_target_session(mdsc, target); 876 + mutex_unlock(&mdsc->mutex); 877 + 878 + return session; 879 + } 880 + 850 881 static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 851 882 struct ceph_mds_session *session) 852 883 { 853 884 struct ceph_mds_info *mi; 854 885 struct ceph_mds_session *ts; 855 886 int i, mds = session->s_mds; 856 - int target; 857 887 858 888 if (mds >= mdsc->mdsmap->m_max_mds) 859 889 return; 890 + 860 891 mi = &mdsc->mdsmap->m_info[mds]; 861 892 dout("open_export_target_sessions for mds%d (%d targets)\n", 862 893 session->s_mds, mi->num_export_targets); 863 894 864 895 for (i = 0; i < mi->num_export_targets; i++) { 865 - target = mi->export_targets[i]; 866 - ts = __ceph_lookup_mds_session(mdsc, target); 867 - if (!ts) { 868 - ts = register_session(mdsc, target); 869 - if (IS_ERR(ts)) 870 - return; 871 - } 872 - if 
(session->s_state == CEPH_MDS_SESSION_NEW || 873 - session->s_state == CEPH_MDS_SESSION_CLOSING) 874 - __open_session(mdsc, session); 875 - else 876 - dout(" mds%d target mds%d %p is %s\n", session->s_mds, 877 - i, ts, session_state_name(ts->s_state)); 878 - ceph_put_mds_session(ts); 896 + ts = __open_export_target_session(mdsc, mi->export_targets[i]); 897 + if (!IS_ERR(ts)) 898 + ceph_put_mds_session(ts); 879 899 } 880 900 } 881 901 ··· 1158 1136 return 0; 1159 1137 } 1160 1138 1139 + static int send_flushmsg_ack(struct ceph_mds_client *mdsc, 1140 + struct ceph_mds_session *session, u64 seq) 1141 + { 1142 + struct ceph_msg *msg; 1143 + 1144 + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", 1145 + session->s_mds, session_state_name(session->s_state), seq); 1146 + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); 1147 + if (!msg) 1148 + return -ENOMEM; 1149 + ceph_con_send(&session->s_con, msg); 1150 + return 0; 1151 + } 1152 + 1153 + 1161 1154 /* 1162 1155 * Note new cap ttl, and any transition from stale -> not stale (fresh?). 
1163 1156 * ··· 1251 1214 { 1252 1215 struct ceph_mds_session *session = arg; 1253 1216 struct ceph_inode_info *ci = ceph_inode(inode); 1254 - int used, oissued, mine; 1217 + int used, wanted, oissued, mine; 1255 1218 1256 1219 if (session->s_trim_caps <= 0) 1257 1220 return -1; ··· 1259 1222 spin_lock(&ci->i_ceph_lock); 1260 1223 mine = cap->issued | cap->implemented; 1261 1224 used = __ceph_caps_used(ci); 1225 + wanted = __ceph_caps_file_wanted(ci); 1262 1226 oissued = __ceph_caps_issued_other(ci, cap); 1263 1227 1264 - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", 1228 + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", 1265 1229 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), 1266 - ceph_cap_string(used)); 1267 - if (ci->i_dirty_caps) 1268 - goto out; /* dirty caps */ 1269 - if ((used & ~oissued) & mine) 1230 + ceph_cap_string(used), ceph_cap_string(wanted)); 1231 + if (cap == ci->i_auth_cap) { 1232 + if (ci->i_dirty_caps | ci->i_flushing_caps) 1233 + goto out; 1234 + if ((used | wanted) & CEPH_CAP_ANY_WR) 1235 + goto out; 1236 + } 1237 + if ((used | wanted) & ~oissued & mine) 1270 1238 goto out; /* we need these caps */ 1271 1239 1272 1240 session->s_trim_caps--; ··· 2198 2156 */ 2199 2157 if (result == -ESTALE) { 2200 2158 dout("got ESTALE on request %llu", req->r_tid); 2201 - if (!req->r_inode) { 2202 - /* do nothing; not an authority problem */ 2203 - } else if (req->r_direct_mode != USE_AUTH_MDS) { 2159 + if (req->r_direct_mode != USE_AUTH_MDS) { 2204 2160 dout("not using auth, setting for that now"); 2205 2161 req->r_direct_mode = USE_AUTH_MDS; 2206 2162 __do_request(mdsc, req); 2207 2163 mutex_unlock(&mdsc->mutex); 2208 2164 goto out; 2209 2165 } else { 2210 - struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2211 - struct ceph_cap *cap = NULL; 2212 - 2213 - if (req->r_session) 2214 - cap = ceph_get_cap_for_mds(ci, 2215 - req->r_session->s_mds); 2216 - 2217 - dout("already using auth"); 2218 - if ((!cap 
|| cap != ci->i_auth_cap) || 2219 - (cap->mseq != req->r_sent_on_mseq)) { 2220 - dout("but cap changed, so resending"); 2166 + int mds = __choose_mds(mdsc, req); 2167 + if (mds >= 0 && mds != req->r_session->s_mds) { 2168 + dout("but auth changed, so resending"); 2221 2169 __do_request(mdsc, req); 2222 2170 mutex_unlock(&mdsc->mutex); 2223 2171 goto out; ··· 2430 2398 2431 2399 case CEPH_SESSION_RECALL_STATE: 2432 2400 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2401 + break; 2402 + 2403 + case CEPH_SESSION_FLUSHMSG: 2404 + send_flushmsg_ack(mdsc, session, seq); 2433 2405 break; 2434 2406 2435 2407 default:
+2
fs/ceph/mds_client.h
··· 383 383 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 384 384 struct ceph_msg *msg); 385 385 386 + extern struct ceph_mds_session * 387 + ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); 386 388 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 387 389 struct ceph_mds_session *session); 388 390
+2
fs/ceph/strings.c
··· 41 41 case CEPH_SESSION_RENEWCAPS: return "renewcaps"; 42 42 case CEPH_SESSION_STALE: return "stale"; 43 43 case CEPH_SESSION_RECALL_STATE: return "recall_state"; 44 + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; 45 + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; 44 46 } 45 47 return "???"; 46 48 }
+7 -2
fs/ceph/super.c
··· 490 490 struct ceph_options *opt) 491 491 { 492 492 struct ceph_fs_client *fsc; 493 - const unsigned supported_features = 493 + const u64 supported_features = 494 494 CEPH_FEATURE_FLOCK | 495 495 CEPH_FEATURE_DIRLAYOUTHASH; 496 - const unsigned required_features = 0; 496 + const u64 required_features = 0; 497 497 int page_count; 498 498 size_t size; 499 499 int err = -ENOMEM; ··· 686 686 .alloc_inode = ceph_alloc_inode, 687 687 .destroy_inode = ceph_destroy_inode, 688 688 .write_inode = ceph_write_inode, 689 + .drop_inode = ceph_drop_inode, 689 690 .sync_fs = ceph_sync_fs, 690 691 .put_super = ceph_put_super, 691 692 .show_options = ceph_show_options, ··· 819 818 820 819 s->s_flags = fsc->mount_options->sb_flags; 821 820 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 821 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 822 + s->s_flags |= MS_POSIXACL; 823 + #endif 822 824 825 + s->s_xattr = ceph_xattr_handlers; 823 826 s->s_fs_info = fsc; 824 827 fsc->sb = s; 825 828
+41 -4
fs/ceph/super.h
··· 287 287 unsigned long i_hold_caps_min; /* jiffies */ 288 288 unsigned long i_hold_caps_max; /* jiffies */ 289 289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */ 290 - int i_cap_exporting_mds; /* to handle cap migration between */ 291 - unsigned i_cap_exporting_mseq; /* mds's. */ 292 - unsigned i_cap_exporting_issued; 293 290 struct ceph_cap_reservation i_cap_migration_resv; 294 291 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 295 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 296 293 dirty|flushing caps */ 297 294 unsigned i_snap_caps; /* cap bits for snapped files */ 295 + unsigned i_cap_exporting_issued; 298 296 299 297 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 300 298 ··· 333 335 u32 i_fscache_gen; /* sequence, for delayed fscache validate */ 334 336 struct work_struct i_revalidate_work; 335 337 #endif 336 - 337 338 struct inode vfs_inode; /* at end */ 338 339 }; 339 340 ··· 526 529 } 527 530 extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); 528 531 532 + extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, 533 + struct ceph_cap *ocap, int mask); 529 534 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); 530 535 extern int __ceph_caps_used(struct ceph_inode_info *ci); 531 536 ··· 690 691 691 692 extern struct inode *ceph_alloc_inode(struct super_block *sb); 692 693 extern void ceph_destroy_inode(struct inode *inode); 694 + extern int ceph_drop_inode(struct inode *inode); 693 695 694 696 extern struct inode *ceph_get_inode(struct super_block *sb, 695 697 struct ceph_vino vino); ··· 724 724 /* xattr.c */ 725 725 extern int ceph_setxattr(struct dentry *, const char *, const void *, 726 726 size_t, int); 727 + int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); 728 + ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); 729 + int __ceph_removexattr(struct dentry *, 
const char *); 727 730 extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); 728 731 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 729 732 extern int ceph_removexattr(struct dentry *, const char *); ··· 734 731 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); 735 732 extern void __init ceph_xattr_init(void); 736 733 extern void ceph_xattr_exit(void); 734 + 735 + /* acl.c */ 736 + extern const struct xattr_handler ceph_xattr_acl_access_handler; 737 + extern const struct xattr_handler ceph_xattr_acl_default_handler; 738 + extern const struct xattr_handler *ceph_xattr_handlers[]; 739 + 740 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 741 + 742 + struct posix_acl *ceph_get_acl(struct inode *, int); 743 + int ceph_init_acl(struct dentry *, struct inode *, struct inode *); 744 + int ceph_acl_chmod(struct dentry *, struct inode *); 745 + void ceph_forget_all_cached_acls(struct inode *inode); 746 + 747 + #else 748 + 749 + #define ceph_get_acl NULL 750 + 751 + static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, 752 + struct inode *dir) 753 + { 754 + return 0; 755 + } 756 + 757 + static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 758 + { 759 + return 0; 760 + } 761 + 762 + static inline void ceph_forget_all_cached_acls(struct inode *inode) 763 + { 764 + } 765 + 766 + #endif 737 767 738 768 /* caps.c */ 739 769 extern const char *ceph_cap_string(int c); ··· 780 744 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 781 745 extern void ceph_put_cap(struct ceph_mds_client *mdsc, 782 746 struct ceph_cap *cap); 747 + extern int ceph_is_any_caps(struct inode *inode); 783 748 784 749 extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, 785 750 u64 cap_id, u32 migrate_seq, u32 issue_seq);
+48 -12
fs/ceph/xattr.c
··· 11 11 #define XATTR_CEPH_PREFIX "ceph." 12 12 #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) 13 13 14 + /* 15 + * List of handlers for synthetic system.* attributes. Other 16 + * attributes are handled directly. 17 + */ 18 + const struct xattr_handler *ceph_xattr_handlers[] = { 19 + #ifdef CONFIG_CEPH_FS_POSIX_ACL 20 + &ceph_xattr_acl_access_handler, 21 + &ceph_xattr_acl_default_handler, 22 + #endif 23 + NULL, 24 + }; 25 + 14 26 static bool ceph_is_valid_xattr(const char *name) 15 27 { 16 28 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 17 29 !strncmp(name, XATTR_SECURITY_PREFIX, 18 30 XATTR_SECURITY_PREFIX_LEN) || 31 + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 19 32 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 20 33 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 21 34 } ··· 676 663 } 677 664 } 678 665 679 - ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 666 + ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 680 667 size_t size) 681 668 { 682 - struct inode *inode = dentry->d_inode; 683 669 struct ceph_inode_info *ci = ceph_inode(inode); 684 670 int err; 685 671 struct ceph_inode_xattr *xattr; ··· 686 674 687 675 if (!ceph_is_valid_xattr(name)) 688 676 return -ENODATA; 689 - 690 677 691 678 /* let's see if a virtual xattr was requested */ 692 679 vxattr = ceph_match_vxattr(inode, name); ··· 734 723 out: 735 724 spin_unlock(&ci->i_ceph_lock); 736 725 return err; 726 + } 727 + 728 + ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 729 + size_t size) 730 + { 731 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 732 + return generic_getxattr(dentry, name, value, size); 733 + 734 + return __ceph_getxattr(dentry->d_inode, name, value, size); 737 735 } 738 736 739 737 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ··· 883 863 return err; 884 864 } 885 865 
886 - int ceph_setxattr(struct dentry *dentry, const char *name, 887 - const void *value, size_t size, int flags) 866 + int __ceph_setxattr(struct dentry *dentry, const char *name, 867 + const void *value, size_t size, int flags) 888 868 { 889 869 struct inode *inode = dentry->d_inode; 890 870 struct ceph_vxattr *vxattr; ··· 898 878 char *newval = NULL; 899 879 struct ceph_inode_xattr *xattr = NULL; 900 880 int required_blob_size; 901 - 902 - if (ceph_snap(inode) != CEPH_NOSNAP) 903 - return -EROFS; 904 881 905 882 if (!ceph_is_valid_xattr(name)) 906 883 return -EOPNOTSUPP; ··· 975 958 return err; 976 959 } 977 960 961 + int ceph_setxattr(struct dentry *dentry, const char *name, 962 + const void *value, size_t size, int flags) 963 + { 964 + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 965 + return -EROFS; 966 + 967 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 968 + return generic_setxattr(dentry, name, value, size, flags); 969 + 970 + return __ceph_setxattr(dentry, name, value, size, flags); 971 + } 972 + 978 973 static int ceph_send_removexattr(struct dentry *dentry, const char *name) 979 974 { 980 975 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); ··· 1013 984 return err; 1014 985 } 1015 986 1016 - int ceph_removexattr(struct dentry *dentry, const char *name) 987 + int __ceph_removexattr(struct dentry *dentry, const char *name) 1017 988 { 1018 989 struct inode *inode = dentry->d_inode; 1019 990 struct ceph_vxattr *vxattr; ··· 1022 993 int err; 1023 994 int required_blob_size; 1024 995 int dirty; 1025 - 1026 - if (ceph_snap(inode) != CEPH_NOSNAP) 1027 - return -EROFS; 1028 996 1029 997 if (!ceph_is_valid_xattr(name)) 1030 998 return -EOPNOTSUPP; ··· 1079 1053 return err; 1080 1054 } 1081 1055 1056 + int ceph_removexattr(struct dentry *dentry, const char *name) 1057 + { 1058 + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 1059 + return -EROFS; 1060 + 1061 + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 
1062 + return generic_removexattr(dentry, name); 1063 + 1064 + return __ceph_removexattr(dentry, name); 1065 + }
-1
include/linux/ceph/buffer.h
··· 17 17 struct kref kref; 18 18 struct kvec vec; 19 19 size_t alloc_len; 20 - bool is_vmalloc; 21 20 }; 22 21 23 22 extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+68 -33
include/linux/ceph/ceph_features.h
··· 4 4 /* 5 5 * feature bits 6 6 */ 7 - #define CEPH_FEATURE_UID (1<<0) 8 - #define CEPH_FEATURE_NOSRCADDR (1<<1) 9 - #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) 10 - #define CEPH_FEATURE_FLOCK (1<<3) 11 - #define CEPH_FEATURE_SUBSCRIBE2 (1<<4) 12 - #define CEPH_FEATURE_MONNAMES (1<<5) 13 - #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 14 - #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 15 - #define CEPH_FEATURE_OBJECTLOCATOR (1<<8) 16 - #define CEPH_FEATURE_PGID64 (1<<9) 17 - #define CEPH_FEATURE_INCSUBOSDMAP (1<<10) 18 - #define CEPH_FEATURE_PGPOOL3 (1<<11) 19 - #define CEPH_FEATURE_OSDREPLYMUX (1<<12) 20 - #define CEPH_FEATURE_OSDENC (1<<13) 21 - #define CEPH_FEATURE_OMAP (1<<14) 22 - #define CEPH_FEATURE_MONENC (1<<15) 23 - #define CEPH_FEATURE_QUERY_T (1<<16) 24 - #define CEPH_FEATURE_INDEP_PG_MAP (1<<17) 25 - #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) 26 - #define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) 27 - #define CEPH_FEATURE_MON_NULLROUTE (1<<20) 28 - #define CEPH_FEATURE_MON_GV (1<<21) 29 - #define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) 30 - #define CEPH_FEATURE_MSG_AUTH (1<<23) 31 - #define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) 32 - #define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) 33 - #define CEPH_FEATURE_CREATEPOOLID (1<<26) 34 - #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) 35 - #define CEPH_FEATURE_OSD_HBMSGS (1<<28) 36 - #define CEPH_FEATURE_MDSENC (1<<29) 37 - #define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) 7 + #define CEPH_FEATURE_UID (1ULL<<0) 8 + #define CEPH_FEATURE_NOSRCADDR (1ULL<<1) 9 + #define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) 10 + #define CEPH_FEATURE_FLOCK (1ULL<<3) 11 + #define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) 12 + #define CEPH_FEATURE_MONNAMES (1ULL<<5) 13 + #define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) 14 + #define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) 15 + #define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) 16 + #define CEPH_FEATURE_PGID64 (1ULL<<9) 17 + #define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) 18 + #define CEPH_FEATURE_PGPOOL3 (1ULL<<11) 19 + 
#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) 20 + #define CEPH_FEATURE_OSDENC (1ULL<<13) 21 + #define CEPH_FEATURE_OMAP (1ULL<<14) 22 + #define CEPH_FEATURE_MONENC (1ULL<<15) 23 + #define CEPH_FEATURE_QUERY_T (1ULL<<16) 24 + #define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) 25 + #define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) 26 + #define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) 27 + #define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) 28 + #define CEPH_FEATURE_MON_GV (1ULL<<21) 29 + #define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) 30 + #define CEPH_FEATURE_MSG_AUTH (1ULL<<23) 31 + #define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) 32 + #define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) 33 + #define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) 34 + #define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) 35 + #define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) 36 + #define CEPH_FEATURE_MDSENC (1ULL<<29) 37 + #define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) 38 + #define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) 39 + #define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) 40 + #define CEPH_FEATURE_MON_SCRUB (1ULL<<33) 41 + #define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) 42 + #define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) 43 + #define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 44 + #define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 45 + #define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 46 + 47 + /* 48 + * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 49 + * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 50 + * to mean 33 bit ~0, and introduce a helper below to do the 51 + * translation. 
52 + * 53 + * This was introduced by ceph.git commit 54 + * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 55 + * and fixed by ceph.git commit 56 + * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c 57 + */ 58 + #define CEPH_FEATURE_RESERVED (1ULL<<63) 59 + 60 + static inline u64 ceph_sanitize_features(u64 features) 61 + { 62 + if (features & CEPH_FEATURE_RESERVED) { 63 + /* everything through OSD_SNAPMAPPER */ 64 + return 0x1ffffffffull; 65 + } else { 66 + return features; 67 + } 68 + } 38 69 39 70 /* 40 71 * Features supported. 41 72 */ 42 - #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 73 + #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 43 74 (CEPH_FEATURE_NOSRCADDR | \ 44 75 CEPH_FEATURE_RECONNECT_SEQ | \ 45 76 CEPH_FEATURE_PGID64 | \ ··· 79 48 CEPH_FEATURE_CRUSH_TUNABLES | \ 80 49 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 81 50 CEPH_FEATURE_REPLY_CREATE_INODE | \ 82 - CEPH_FEATURE_OSDHASHPSPOOL) 51 + CEPH_FEATURE_OSDHASHPSPOOL | \ 52 + CEPH_FEATURE_OSD_CACHEPOOL | \ 53 + CEPH_FEATURE_CRUSH_V2 | \ 54 + CEPH_FEATURE_EXPORT_PEER) 83 55 84 56 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 85 57 (CEPH_FEATURE_NOSRCADDR | \ ··· 90 56 CEPH_FEATURE_PGID64 | \ 91 57 CEPH_FEATURE_PGPOOL3 | \ 92 58 CEPH_FEATURE_OSDENC) 59 + 93 60 #endif
+35 -1
include/linux/ceph/ceph_fs.h
··· 53 53 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ 54 54 } __attribute__ ((packed)); 55 55 56 + #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) 57 + #define ceph_file_layout_stripe_count(l) \ 58 + ((__s32)le32_to_cpu((l).fl_stripe_count)) 59 + #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) 60 + #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) 61 + #define ceph_file_layout_object_su(l) \ 62 + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) 63 + #define ceph_file_layout_pg_pool(l) \ 64 + ((__s32)le32_to_cpu((l).fl_pg_pool)) 65 + 66 + static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) 67 + { 68 + return le32_to_cpu(l->fl_stripe_unit) * 69 + le32_to_cpu(l->fl_stripe_count); 70 + } 71 + 72 + /* "period" == bytes before i start on a new set of objects */ 73 + static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) 74 + { 75 + return le32_to_cpu(l->fl_object_size) * 76 + le32_to_cpu(l->fl_stripe_count); 77 + } 78 + 56 79 #define CEPH_MIN_STRIPE_UNIT 65536 57 80 58 81 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); ··· 305 282 CEPH_SESSION_RENEWCAPS, 306 283 CEPH_SESSION_STALE, 307 284 CEPH_SESSION_RECALL_STATE, 285 + CEPH_SESSION_FLUSHMSG, 286 + CEPH_SESSION_FLUSHMSG_ACK, 308 287 }; 309 288 310 289 extern const char *ceph_session_op_name(int op); ··· 482 457 __u8 flags; /* CEPH_CAP_FLAG_* */ 483 458 } __attribute__ ((packed)); 484 459 485 - #define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ 460 + #define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ 461 + #define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ 486 462 487 463 /* inode record, for bundling with mds reply */ 488 464 struct ceph_mds_reply_inode { ··· 682 656 struct ceph_timespec mtime, atime, ctime; 683 657 struct ceph_file_layout layout; 684 658 __le32 time_warp_seq; 659 + } __attribute__ ((packed)); 660 + 
661 + struct ceph_mds_cap_peer { 662 + __le64 cap_id; 663 + __le32 seq; 664 + __le32 mseq; 665 + __le32 mds; 666 + __u8 flags; 685 667 } __attribute__ ((packed)); 686 668 687 669 /* cap release msg head */
+11 -8
include/linux/ceph/libceph.h
··· 122 122 123 123 int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); 124 124 125 - u32 supported_features; 126 - u32 required_features; 125 + u64 supported_features; 126 + u64 required_features; 127 127 128 128 struct ceph_messenger msgr; /* messenger instance */ 129 129 struct ceph_mon_client monc; ··· 173 173 (off >> PAGE_CACHE_SHIFT); 174 174 } 175 175 176 + extern struct kmem_cache *ceph_inode_cachep; 177 + extern struct kmem_cache *ceph_cap_cachep; 178 + extern struct kmem_cache *ceph_dentry_cachep; 179 + extern struct kmem_cache *ceph_file_cachep; 180 + 176 181 /* ceph_common.c */ 177 182 extern bool libceph_compatible(void *data); 178 183 179 184 extern const char *ceph_msg_type_name(int type); 180 185 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 181 - extern struct kmem_cache *ceph_inode_cachep; 182 - extern struct kmem_cache *ceph_cap_cachep; 183 - extern struct kmem_cache *ceph_dentry_cachep; 184 - extern struct kmem_cache *ceph_file_cachep; 186 + extern void *ceph_kvmalloc(size_t size, gfp_t flags); 187 + extern void ceph_kvfree(const void *ptr); 185 188 186 189 extern struct ceph_options *ceph_parse_options(char *options, 187 190 const char *dev_name, const char *dev_name_end, ··· 195 192 struct ceph_client *client); 196 193 extern struct ceph_client *ceph_create_client(struct ceph_options *opt, 197 194 void *private, 198 - unsigned supported_features, 199 - unsigned required_features); 195 + u64 supported_features, 196 + u64 required_features); 200 197 extern u64 ceph_client_id(struct ceph_client *client); 201 198 extern void ceph_destroy_client(struct ceph_client *client); 202 199 extern int __ceph_open_session(struct ceph_client *client,
+6 -7
include/linux/ceph/messenger.h
··· 60 60 u32 global_seq; 61 61 spinlock_t global_seq_lock; 62 62 63 - u32 supported_features; 64 - u32 required_features; 63 + u64 supported_features; 64 + u64 required_features; 65 65 }; 66 66 67 67 enum ceph_msg_data_type { ··· 154 154 struct list_head list_head; /* links for connection lists */ 155 155 156 156 struct kref kref; 157 - bool front_is_vmalloc; 158 157 bool more_to_follow; 159 158 bool needs_out_seq; 160 - int front_max; 159 + int front_alloc_len; 161 160 unsigned long ack_stamp; /* tx: when we were acked */ 162 161 163 162 struct ceph_msgpool *pool; ··· 191 192 192 193 struct ceph_entity_name peer_name; /* peer name */ 193 194 194 - unsigned peer_features; 195 + u64 peer_features; 195 196 u32 connect_seq; /* identify the most recent connection 196 197 attempt for this connection, client */ 197 198 u32 peer_global_seq; /* peer's global seq for this connection */ ··· 255 256 256 257 extern void ceph_messenger_init(struct ceph_messenger *msgr, 257 258 struct ceph_entity_addr *myaddr, 258 - u32 supported_features, 259 - u32 required_features, 259 + u64 supported_features, 260 + u64 required_features, 260 261 bool nocrc); 261 262 262 263 extern void ceph_con_init(struct ceph_connection *con, void *private,
+10 -9
include/linux/ceph/osd_client.h
··· 12 12 #include <linux/ceph/auth.h> 13 13 #include <linux/ceph/pagelist.h> 14 14 15 - /* 16 - * Maximum object name size 17 - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) 18 - */ 19 - #define MAX_OBJ_NAME_SIZE 100 20 - 21 15 struct ceph_msg; 22 16 struct ceph_snap_context; 23 17 struct ceph_osd_request; ··· 132 138 __le64 *r_request_pool; 133 139 void *r_request_pgid; 134 140 __le32 *r_request_attempts; 141 + bool r_paused; 135 142 struct ceph_eversion *r_request_reassert_version; 136 143 137 144 int r_result; ··· 153 158 struct inode *r_inode; /* for use by callbacks */ 154 159 void *r_priv; /* ditto */ 155 160 156 - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 157 - int r_oid_len; 161 + struct ceph_object_locator r_base_oloc; 162 + struct ceph_object_id r_base_oid; 163 + struct ceph_object_locator r_target_oloc; 164 + struct ceph_object_id r_target_oid; 165 + 158 166 u64 r_snapid; 159 167 unsigned long r_stamp; /* send OR check time */ 160 168 161 - struct ceph_file_layout r_file_layout; 162 169 struct ceph_snap_context *r_snapc; /* snap context for writes */ 170 + }; 171 + 172 + struct ceph_request_redirect { 173 + struct ceph_object_locator oloc; 163 174 }; 164 175 165 176 struct ceph_osd_event {
+41 -25
include/linux/ceph/osdmap.h
··· 35 35 u8 object_hash; 36 36 u32 pg_num, pgp_num; 37 37 int pg_num_mask, pgp_num_mask; 38 + s64 read_tier; 39 + s64 write_tier; /* wins for read+write ops */ 38 40 u64 flags; 39 41 char *name; 40 42 }; 41 43 42 44 struct ceph_object_locator { 43 - uint64_t pool; 44 - char *key; 45 + s64 pool; 46 + }; 47 + 48 + /* 49 + * Maximum supported by kernel client object name length 50 + * 51 + * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) 52 + */ 53 + #define CEPH_MAX_OID_NAME_LEN 100 54 + 55 + struct ceph_object_id { 56 + char name[CEPH_MAX_OID_NAME_LEN]; 57 + int name_len; 45 58 }; 46 59 47 60 struct ceph_pg_mapping { ··· 86 73 struct crush_map *crush; 87 74 }; 88 75 89 - /* 90 - * file layout helpers 91 - */ 92 - #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) 93 - #define ceph_file_layout_stripe_count(l) \ 94 - ((__s32)le32_to_cpu((l).fl_stripe_count)) 95 - #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) 96 - #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) 97 - #define ceph_file_layout_object_su(l) \ 98 - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) 99 - #define ceph_file_layout_pg_pool(l) \ 100 - ((__s32)le32_to_cpu((l).fl_pg_pool)) 101 - 102 - static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) 76 + static inline void ceph_oid_set_name(struct ceph_object_id *oid, 77 + const char *name) 103 78 { 104 - return le32_to_cpu(l->fl_stripe_unit) * 105 - le32_to_cpu(l->fl_stripe_count); 79 + int len; 80 + 81 + len = strlen(name); 82 + if (len > sizeof(oid->name)) { 83 + WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", 84 + name, len, sizeof(oid->name)); 85 + len = sizeof(oid->name); 86 + } 87 + 88 + memcpy(oid->name, name, len); 89 + oid->name_len = len; 106 90 } 107 91 108 - /* "period" == bytes before i start on a new set of objects */ 109 - static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) 92 + 
static inline void ceph_oid_copy(struct ceph_object_id *dest, 93 + struct ceph_object_id *src) 110 94 { 111 - return le32_to_cpu(l->fl_object_size) * 112 - le32_to_cpu(l->fl_stripe_count); 95 + BUG_ON(src->name_len > sizeof(dest->name)); 96 + memcpy(dest->name, src->name, src->name_len); 97 + dest->name_len = src->name_len; 113 98 } 114 - 115 99 116 100 static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 117 101 { ··· 165 155 u64 *bno, u64 *oxoff, u64 *oxlen); 166 156 167 157 /* calculate mapping of object to a placement group */ 168 - extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, 169 - struct ceph_osdmap *osdmap, uint64_t pool); 158 + extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 159 + struct ceph_object_locator *oloc, 160 + struct ceph_object_id *oid, 161 + struct ceph_pg *pg_out); 162 + 170 163 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 171 164 struct ceph_pg pgid, 172 165 int *acting); 173 166 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 174 167 struct ceph_pg pgid); 168 + 169 + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 170 + u64 id); 175 171 176 172 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); 177 173 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+4
include/linux/ceph/rados.h
··· 344 344 CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ 345 345 CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ 346 346 CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ 347 + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ 348 + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 349 + CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 350 + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 347 351 }; 348 352 349 353 enum {
+15 -5
include/linux/crush/crush.h
··· 19 19 20 20 #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ 21 21 22 - 23 22 #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ 24 - #define CRUSH_MAX_SET 10 /* max size of a mapping result */ 25 23 24 + 25 + #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ 26 + #define CRUSH_ITEM_NONE 0x7fffffff /* no result */ 26 27 27 28 /* 28 29 * CRUSH uses user-defined "rules" to describe how inputs should be ··· 44 43 /* arg2 = type */ 45 44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ 46 45 CRUSH_RULE_EMIT = 4, /* no args */ 47 - CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, 48 - CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, 46 + CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, 47 + CRUSH_RULE_CHOOSELEAF_INDEP = 7, 48 + 49 + CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ 50 + CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 51 + CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 52 + CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 49 53 }; 50 54 51 55 /* ··· 168 162 __u32 choose_local_fallback_tries; 169 163 /* choose attempts before giving up */ 170 164 __u32 choose_total_tries; 171 - /* attempt chooseleaf inner descent once; on failure retry outer descent */ 165 + /* attempt chooseleaf inner descent once for firstn mode; on 166 + * reject retry outer descent. Note that this does *not* 167 + * apply to a collision: in that case we will retry as we used 168 + * to. */ 172 169 __u32 chooseleaf_descend_once; 173 170 }; 174 171 ··· 183 174 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); 184 175 extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); 185 176 extern void crush_destroy_bucket(struct crush_bucket *b); 177 + extern void crush_destroy_rule(struct crush_rule *r); 186 178 extern void crush_destroy(struct crush_map *map); 187 179 188 180 static inline int crush_calc_tree_node(int i)
+2 -1
include/linux/crush/mapper.h
··· 14 14 extern int crush_do_rule(const struct crush_map *map, 15 15 int ruleno, 16 16 int x, int *result, int result_max, 17 - const __u32 *weights); 17 + const __u32 *weights, int weight_max, 18 + int *scratch); 18 19 19 20 #endif
+6 -16
net/ceph/buffer.c
··· 6 6 7 7 #include <linux/ceph/buffer.h> 8 8 #include <linux/ceph/decode.h> 9 + #include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ 9 10 10 11 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 11 12 { ··· 16 15 if (!b) 17 16 return NULL; 18 17 19 - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); 20 - if (b->vec.iov_base) { 21 - b->is_vmalloc = false; 22 - } else { 23 - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); 24 - if (!b->vec.iov_base) { 25 - kfree(b); 26 - return NULL; 27 - } 28 - b->is_vmalloc = true; 18 + b->vec.iov_base = ceph_kvmalloc(len, gfp); 19 + if (!b->vec.iov_base) { 20 + kfree(b); 21 + return NULL; 29 22 } 30 23 31 24 kref_init(&b->kref); ··· 35 40 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); 36 41 37 42 dout("buffer_release %p\n", b); 38 - if (b->vec.iov_base) { 39 - if (b->is_vmalloc) 40 - vfree(b->vec.iov_base); 41 - else 42 - kfree(b->vec.iov_base); 43 - } 43 + ceph_kvfree(b->vec.iov_base); 44 44 kfree(b); 45 45 } 46 46 EXPORT_SYMBOL(ceph_buffer_release);
+22 -2
net/ceph/ceph_common.c
··· 15 15 #include <linux/slab.h> 16 16 #include <linux/statfs.h> 17 17 #include <linux/string.h> 18 + #include <linux/vmalloc.h> 18 19 #include <linux/nsproxy.h> 19 20 #include <net/net_namespace.h> 20 21 ··· 170 169 return -1; 171 170 } 172 171 EXPORT_SYMBOL(ceph_compare_options); 172 + 173 + void *ceph_kvmalloc(size_t size, gfp_t flags) 174 + { 175 + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 176 + void *ptr = kmalloc(size, flags | __GFP_NOWARN); 177 + if (ptr) 178 + return ptr; 179 + } 180 + 181 + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); 182 + } 183 + 184 + void ceph_kvfree(const void *ptr) 185 + { 186 + if (is_vmalloc_addr(ptr)) 187 + vfree(ptr); 188 + else 189 + kfree(ptr); 190 + } 173 191 174 192 175 193 static int parse_fsid(const char *str, struct ceph_fsid *fsid) ··· 481 461 * create a fresh client instance 482 462 */ 483 463 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 484 - unsigned int supported_features, 485 - unsigned int required_features) 464 + u64 supported_features, 465 + u64 required_features) 486 466 { 487 467 struct ceph_client *client; 488 468 struct ceph_entity_addr *myaddr = NULL;
+5 -2
net/ceph/crush/crush.c
··· 116 116 if (map->rules) { 117 117 __u32 b; 118 118 for (b = 0; b < map->max_rules; b++) 119 - kfree(map->rules[b]); 119 + crush_destroy_rule(map->rules[b]); 120 120 kfree(map->rules); 121 121 } 122 122 123 123 kfree(map); 124 124 } 125 125 126 - 126 + void crush_destroy_rule(struct crush_rule *rule) 127 + { 128 + kfree(rule); 129 + }
+269 -67
net/ceph/crush/mapper.c
··· 189 189 static int bucket_tree_choose(struct crush_bucket_tree *bucket, 190 190 int x, int r) 191 191 { 192 - int n, l; 192 + int n; 193 193 __u32 w; 194 194 __u64 t; 195 195 ··· 197 197 n = bucket->num_nodes >> 1; 198 198 199 199 while (!terminal(n)) { 200 + int l; 200 201 /* pick point in [0, w) */ 201 202 w = bucket->node_weights[n]; 202 203 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, ··· 265 264 * true if device is marked "out" (failed, fully offloaded) 266 265 * of the cluster 267 266 */ 268 - static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) 267 + static int is_out(const struct crush_map *map, 268 + const __u32 *weight, int weight_max, 269 + int item, int x) 269 270 { 271 + if (item >= weight_max) 272 + return 1; 270 273 if (weight[item] >= 0x10000) 271 274 return 0; 272 275 if (weight[item] == 0) ··· 282 277 } 283 278 284 279 /** 285 - * crush_choose - choose numrep distinct items of given type 280 + * crush_choose_firstn - choose numrep distinct items of given type 286 281 * @map: the crush_map 287 282 * @bucket: the bucket we are choose an item from 288 283 * @x: crush input value ··· 290 285 * @type: the type of item to choose 291 286 * @out: pointer to output vector 292 287 * @outpos: our position in that vector 293 - * @firstn: true if choosing "first n" items, false if choosing "indep" 294 - * @recurse_to_leaf: true if we want one device under each item of given type 295 - * @descend_once: true if we should only try one descent before giving up 288 + * @tries: number of attempts to make 289 + * @recurse_tries: number of attempts to have recursive chooseleaf make 290 + * @local_tries: localized retries 291 + * @local_fallback_tries: localized fallback retries 292 + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 296 293 * @out2: second output vector for leaf items (if @recurse_to_leaf) 297 294 */ 298 - static int crush_choose(const struct crush_map 
*map, 299 - struct crush_bucket *bucket, 300 - const __u32 *weight, 301 - int x, int numrep, int type, 302 - int *out, int outpos, 303 - int firstn, int recurse_to_leaf, 304 - int descend_once, int *out2) 295 + static int crush_choose_firstn(const struct crush_map *map, 296 + struct crush_bucket *bucket, 297 + const __u32 *weight, int weight_max, 298 + int x, int numrep, int type, 299 + int *out, int outpos, 300 + unsigned int tries, 301 + unsigned int recurse_tries, 302 + unsigned int local_tries, 303 + unsigned int local_fallback_tries, 304 + int recurse_to_leaf, 305 + int *out2) 305 306 { 306 307 int rep; 307 308 unsigned int ftotal, flocal; ··· 336 325 collide = 0; 337 326 retry_bucket = 0; 338 327 r = rep; 339 - if (in->alg == CRUSH_BUCKET_UNIFORM) { 340 - /* be careful */ 341 - if (firstn || (__u32)numrep >= in->size) 342 - /* r' = r + f_total */ 343 - r += ftotal; 344 - else if (in->size % numrep == 0) 345 - /* r'=r+(n+1)*f_local */ 346 - r += (numrep+1) * 347 - (flocal+ftotal); 348 - else 349 - /* r' = r + n*f_local */ 350 - r += numrep * (flocal+ftotal); 351 - } else { 352 - if (firstn) 353 - /* r' = r + f_total */ 354 - r += ftotal; 355 - else 356 - /* r' = r + n*f_local */ 357 - r += numrep * (flocal+ftotal); 358 - } 328 + /* r' = r + f_total */ 329 + r += ftotal; 359 330 360 331 /* bucket choose */ 361 332 if (in->size == 0) { 362 333 reject = 1; 363 334 goto reject; 364 335 } 365 - if (map->choose_local_fallback_tries > 0 && 336 + if (local_fallback_tries > 0 && 366 337 flocal >= (in->size>>1) && 367 - flocal > map->choose_local_fallback_tries) 338 + flocal > local_fallback_tries) 368 339 item = bucket_perm_choose(in, x, r); 369 340 else 370 341 item = crush_bucket_choose(in, x, r); ··· 387 394 reject = 0; 388 395 if (!collide && recurse_to_leaf) { 389 396 if (item < 0) { 390 - if (crush_choose(map, 397 + if (crush_choose_firstn(map, 391 398 map->buckets[-1-item], 392 - weight, 399 + weight, weight_max, 393 400 x, outpos+1, 0, 394 401 out2, outpos, 395 
- firstn, 0, 396 - map->chooseleaf_descend_once, 402 + recurse_tries, 0, 403 + local_tries, 404 + local_fallback_tries, 405 + 0, 397 406 NULL) <= outpos) 398 407 /* didn't get leaf */ 399 408 reject = 1; ··· 409 414 /* out? */ 410 415 if (itemtype == 0) 411 416 reject = is_out(map, weight, 417 + weight_max, 412 418 item, x); 413 419 else 414 420 reject = 0; ··· 420 424 ftotal++; 421 425 flocal++; 422 426 423 - if (reject && descend_once) 424 - /* let outer call try again */ 425 - skip_rep = 1; 426 - else if (collide && flocal <= map->choose_local_tries) 427 + if (collide && flocal <= local_tries) 427 428 /* retry locally a few times */ 428 429 retry_bucket = 1; 429 - else if (map->choose_local_fallback_tries > 0 && 430 - flocal <= in->size + map->choose_local_fallback_tries) 430 + else if (local_fallback_tries > 0 && 431 + flocal <= in->size + local_fallback_tries) 431 432 /* exhaustive bucket search */ 432 433 retry_bucket = 1; 433 - else if (ftotal <= map->choose_total_tries) 434 + else if (ftotal <= tries) 434 435 /* then retry descent */ 435 436 retry_descent = 1; 436 437 else ··· 457 464 458 465 459 466 /** 467 + * crush_choose_indep: alternative breadth-first positionally stable mapping 468 + * 469 + */ 470 + static void crush_choose_indep(const struct crush_map *map, 471 + struct crush_bucket *bucket, 472 + const __u32 *weight, int weight_max, 473 + int x, int left, int numrep, int type, 474 + int *out, int outpos, 475 + unsigned int tries, 476 + unsigned int recurse_tries, 477 + int recurse_to_leaf, 478 + int *out2, 479 + int parent_r) 480 + { 481 + struct crush_bucket *in = bucket; 482 + int endpos = outpos + left; 483 + int rep; 484 + unsigned int ftotal; 485 + int r; 486 + int i; 487 + int item = 0; 488 + int itemtype; 489 + int collide; 490 + 491 + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? 
"_LEAF" : "", 492 + bucket->id, x, outpos, numrep); 493 + 494 + /* initially my result is undefined */ 495 + for (rep = outpos; rep < endpos; rep++) { 496 + out[rep] = CRUSH_ITEM_UNDEF; 497 + if (out2) 498 + out2[rep] = CRUSH_ITEM_UNDEF; 499 + } 500 + 501 + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { 502 + for (rep = outpos; rep < endpos; rep++) { 503 + if (out[rep] != CRUSH_ITEM_UNDEF) 504 + continue; 505 + 506 + in = bucket; /* initial bucket */ 507 + 508 + /* choose through intervening buckets */ 509 + for (;;) { 510 + /* note: we base the choice on the position 511 + * even in the nested call. that means that 512 + * if the first layer chooses the same bucket 513 + * in a different position, we will tend to 514 + * choose a different item in that bucket. 515 + * this will involve more devices in data 516 + * movement and tend to distribute the load. 517 + */ 518 + r = rep + parent_r; 519 + 520 + /* be careful */ 521 + if (in->alg == CRUSH_BUCKET_UNIFORM && 522 + in->size % numrep == 0) 523 + /* r'=r+(n+1)*f_total */ 524 + r += (numrep+1) * ftotal; 525 + else 526 + /* r' = r + n*f_total */ 527 + r += numrep * ftotal; 528 + 529 + /* bucket choose */ 530 + if (in->size == 0) { 531 + dprintk(" empty bucket\n"); 532 + break; 533 + } 534 + 535 + item = crush_bucket_choose(in, x, r); 536 + if (item >= map->max_devices) { 537 + dprintk(" bad item %d\n", item); 538 + out[rep] = CRUSH_ITEM_NONE; 539 + if (out2) 540 + out2[rep] = CRUSH_ITEM_NONE; 541 + left--; 542 + break; 543 + } 544 + 545 + /* desired type? */ 546 + if (item < 0) 547 + itemtype = map->buckets[-1-item]->type; 548 + else 549 + itemtype = 0; 550 + dprintk(" item %d type %d\n", item, itemtype); 551 + 552 + /* keep going? 
*/ 553 + if (itemtype != type) { 554 + if (item >= 0 || 555 + (-1-item) >= map->max_buckets) { 556 + dprintk(" bad item type %d\n", type); 557 + out[rep] = CRUSH_ITEM_NONE; 558 + if (out2) 559 + out2[rep] = 560 + CRUSH_ITEM_NONE; 561 + left--; 562 + break; 563 + } 564 + in = map->buckets[-1-item]; 565 + continue; 566 + } 567 + 568 + /* collision? */ 569 + collide = 0; 570 + for (i = outpos; i < endpos; i++) { 571 + if (out[i] == item) { 572 + collide = 1; 573 + break; 574 + } 575 + } 576 + if (collide) 577 + break; 578 + 579 + if (recurse_to_leaf) { 580 + if (item < 0) { 581 + crush_choose_indep(map, 582 + map->buckets[-1-item], 583 + weight, weight_max, 584 + x, 1, numrep, 0, 585 + out2, rep, 586 + recurse_tries, 0, 587 + 0, NULL, r); 588 + if (out2[rep] == CRUSH_ITEM_NONE) { 589 + /* placed nothing; no leaf */ 590 + break; 591 + } 592 + } else { 593 + /* we already have a leaf! */ 594 + out2[rep] = item; 595 + } 596 + } 597 + 598 + /* out? */ 599 + if (itemtype == 0 && 600 + is_out(map, weight, weight_max, item, x)) 601 + break; 602 + 603 + /* yay! 
*/ 604 + out[rep] = item; 605 + left--; 606 + break; 607 + } 608 + } 609 + } 610 + for (rep = outpos; rep < endpos; rep++) { 611 + if (out[rep] == CRUSH_ITEM_UNDEF) { 612 + out[rep] = CRUSH_ITEM_NONE; 613 + } 614 + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { 615 + out2[rep] = CRUSH_ITEM_NONE; 616 + } 617 + } 618 + } 619 + 620 + /** 460 621 * crush_do_rule - calculate a mapping with the given input and rule 461 622 * @map: the crush_map 462 623 * @ruleno: the rule id 463 624 * @x: hash input 464 625 * @result: pointer to result vector 465 626 * @result_max: maximum result size 627 + * @weight: weight vector (for map leaves) 628 + * @weight_max: size of weight vector 629 + * @scratch: scratch vector for private use; must be >= 3 * result_max 466 630 */ 467 631 int crush_do_rule(const struct crush_map *map, 468 632 int ruleno, int x, int *result, int result_max, 469 - const __u32 *weight) 633 + const __u32 *weight, int weight_max, 634 + int *scratch) 470 635 { 471 636 int result_len; 472 - int a[CRUSH_MAX_SET]; 473 - int b[CRUSH_MAX_SET]; 474 - int c[CRUSH_MAX_SET]; 637 + int *a = scratch; 638 + int *b = scratch + result_max; 639 + int *c = scratch + result_max*2; 475 640 int recurse_to_leaf; 476 641 int *w; 477 642 int wsize = 0; ··· 640 489 __u32 step; 641 490 int i, j; 642 491 int numrep; 643 - int firstn; 644 - const int descend_once = 0; 492 + int choose_tries = map->choose_total_tries; 493 + int choose_local_tries = map->choose_local_tries; 494 + int choose_local_fallback_tries = map->choose_local_fallback_tries; 495 + int choose_leaf_tries = 0; 645 496 646 497 if ((__u32)ruleno >= map->max_rules) { 647 498 dprintk(" bad ruleno %d\n", ruleno); ··· 656 503 o = b; 657 504 658 505 for (step = 0; step < rule->len; step++) { 506 + int firstn = 0; 659 507 struct crush_rule_step *curstep = &rule->steps[step]; 660 508 661 - firstn = 0; 662 509 switch (curstep->op) { 663 510 case CRUSH_RULE_TAKE: 664 511 w[0] = curstep->arg1; 665 512 wsize = 1; 666 513 break; 667 514 
668 - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 515 + case CRUSH_RULE_SET_CHOOSE_TRIES: 516 + if (curstep->arg1 > 0) 517 + choose_tries = curstep->arg1; 518 + break; 519 + 520 + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: 521 + if (curstep->arg1 > 0) 522 + choose_leaf_tries = curstep->arg1; 523 + break; 524 + 525 + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 526 + if (curstep->arg1 > 0) 527 + choose_local_tries = curstep->arg1; 528 + break; 529 + 530 + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 531 + if (curstep->arg1 > 0) 532 + choose_local_fallback_tries = curstep->arg1; 533 + break; 534 + 535 + case CRUSH_RULE_CHOOSELEAF_FIRSTN: 669 536 case CRUSH_RULE_CHOOSE_FIRSTN: 670 537 firstn = 1; 671 538 /* fall through */ 672 - case CRUSH_RULE_CHOOSE_LEAF_INDEP: 539 + case CRUSH_RULE_CHOOSELEAF_INDEP: 673 540 case CRUSH_RULE_CHOOSE_INDEP: 674 541 if (wsize == 0) 675 542 break; 676 543 677 544 recurse_to_leaf = 678 545 curstep->op == 679 - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 546 + CRUSH_RULE_CHOOSELEAF_FIRSTN || 680 547 curstep->op == 681 - CRUSH_RULE_CHOOSE_LEAF_INDEP; 548 + CRUSH_RULE_CHOOSELEAF_INDEP; 682 549 683 550 /* reset output */ 684 551 osize = 0; ··· 716 543 continue; 717 544 } 718 545 j = 0; 719 - osize += crush_choose(map, 720 - map->buckets[-1-w[i]], 721 - weight, 722 - x, numrep, 723 - curstep->arg2, 724 - o+osize, j, 725 - firstn, 726 - recurse_to_leaf, 727 - descend_once, c+osize); 546 + if (firstn) { 547 + int recurse_tries; 548 + if (choose_leaf_tries) 549 + recurse_tries = 550 + choose_leaf_tries; 551 + else if (map->chooseleaf_descend_once) 552 + recurse_tries = 1; 553 + else 554 + recurse_tries = choose_tries; 555 + osize += crush_choose_firstn( 556 + map, 557 + map->buckets[-1-w[i]], 558 + weight, weight_max, 559 + x, numrep, 560 + curstep->arg2, 561 + o+osize, j, 562 + choose_tries, 563 + recurse_tries, 564 + choose_local_tries, 565 + choose_local_fallback_tries, 566 + recurse_to_leaf, 567 + c+osize); 568 + } else { 569 + crush_choose_indep( 570 + map, 571 + 
map->buckets[-1-w[i]], 572 + weight, weight_max, 573 + x, numrep, numrep, 574 + curstep->arg2, 575 + o+osize, j, 576 + choose_tries, 577 + choose_leaf_tries ? 578 + choose_leaf_tries : 1, 579 + recurse_to_leaf, 580 + c+osize, 581 + 0); 582 + osize += numrep; 583 + } 728 584 } 729 585 730 586 if (recurse_to_leaf) 731 587 /* copy final _leaf_ values to output set */ 732 588 memcpy(o, c, osize*sizeof(*o)); 733 589 734 - /* swap t and w arrays */ 590 + /* swap o and w arrays */ 735 591 tmp = o; 736 592 o = w; 737 593 w = tmp;
+2 -1
net/ceph/debugfs.c
··· 132 132 req->r_osd ? req->r_osd->o_osd : -1, 133 133 req->r_pgid.pool, req->r_pgid.seed); 134 134 135 - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); 135 + seq_printf(s, "%.*s", req->r_base_oid.name_len, 136 + req->r_base_oid.name); 136 137 137 138 if (req->r_reassert_version.epoch) 138 139 seq_printf(s, "\t%u'%llu",
+13 -19
net/ceph/messenger.c
··· 15 15 #include <linux/dns_resolver.h> 16 16 #include <net/tcp.h> 17 17 18 + #include <linux/ceph/ceph_features.h> 18 19 #include <linux/ceph/libceph.h> 19 20 #include <linux/ceph/messenger.h> 20 21 #include <linux/ceph/decode.h> ··· 1866 1865 port = (port * 10) + (*p - '0'); 1867 1866 p++; 1868 1867 } 1869 - if (port > 65535 || port == 0) 1868 + if (port == 0) 1869 + port = CEPH_MON_PORT; 1870 + else if (port > 65535) 1870 1871 goto bad; 1871 1872 } else { 1872 1873 port = CEPH_MON_PORT; ··· 1948 1945 { 1949 1946 u64 sup_feat = con->msgr->supported_features; 1950 1947 u64 req_feat = con->msgr->required_features; 1951 - u64 server_feat = le64_to_cpu(con->in_reply.features); 1948 + u64 server_feat = ceph_sanitize_features( 1949 + le64_to_cpu(con->in_reply.features)); 1952 1950 int ret; 1953 1951 1954 1952 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); ··· 2857 2853 */ 2858 2854 void ceph_messenger_init(struct ceph_messenger *msgr, 2859 2855 struct ceph_entity_addr *myaddr, 2860 - u32 supported_features, 2861 - u32 required_features, 2856 + u64 supported_features, 2857 + u64 required_features, 2862 2858 bool nocrc) 2863 2859 { 2864 2860 msgr->supported_features = supported_features; ··· 3130 3126 INIT_LIST_HEAD(&m->data); 3131 3127 3132 3128 /* front */ 3133 - m->front_max = front_len; 3134 3129 if (front_len) { 3135 - if (front_len > PAGE_CACHE_SIZE) { 3136 - m->front.iov_base = __vmalloc(front_len, flags, 3137 - PAGE_KERNEL); 3138 - m->front_is_vmalloc = true; 3139 - } else { 3140 - m->front.iov_base = kmalloc(front_len, flags); 3141 - } 3130 + m->front.iov_base = ceph_kvmalloc(front_len, flags); 3142 3131 if (m->front.iov_base == NULL) { 3143 3132 dout("ceph_msg_new can't allocate %d bytes\n", 3144 3133 front_len); ··· 3140 3143 } else { 3141 3144 m->front.iov_base = NULL; 3142 3145 } 3143 - m->front.iov_len = front_len; 3146 + m->front_alloc_len = m->front.iov_len = front_len; 3144 3147 3145 3148 dout("ceph_msg_new %p front %d\n", m, 
front_len); 3146 3149 return m; ··· 3253 3256 void ceph_msg_kfree(struct ceph_msg *m) 3254 3257 { 3255 3258 dout("msg_kfree %p\n", m); 3256 - if (m->front_is_vmalloc) 3257 - vfree(m->front.iov_base); 3258 - else 3259 - kfree(m->front.iov_base); 3259 + ceph_kvfree(m->front.iov_base); 3260 3260 kmem_cache_free(ceph_msg_cache, m); 3261 3261 } 3262 3262 ··· 3295 3301 3296 3302 void ceph_msg_dump(struct ceph_msg *msg) 3297 3303 { 3298 - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, 3299 - msg->front_max, msg->data_length); 3304 + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, 3305 + msg->front_alloc_len, msg->data_length); 3300 3306 print_hex_dump(KERN_DEBUG, "header: ", 3301 3307 DUMP_PREFIX_OFFSET, 16, 1, 3302 3308 &msg->hdr, sizeof(msg->hdr), true);
+4 -4
net/ceph/mon_client.c
··· 152 152 /* initiatiate authentication handshake */ 153 153 ret = ceph_auth_build_hello(monc->auth, 154 154 monc->m_auth->front.iov_base, 155 - monc->m_auth->front_max); 155 + monc->m_auth->front_alloc_len); 156 156 __send_prepared_auth_request(monc, ret); 157 157 } else { 158 158 dout("open_session mon%d already open\n", monc->cur_mon); ··· 196 196 int num; 197 197 198 198 p = msg->front.iov_base; 199 - end = p + msg->front_max; 199 + end = p + msg->front_alloc_len; 200 200 201 201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 202 202 ceph_encode_32(&p, num); ··· 897 897 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 898 898 msg->front.iov_len, 899 899 monc->m_auth->front.iov_base, 900 - monc->m_auth->front_max); 900 + monc->m_auth->front_alloc_len); 901 901 if (ret < 0) { 902 902 monc->client->auth_err = ret; 903 903 wake_up_all(&monc->client->auth_wq); ··· 939 939 return 0; 940 940 941 941 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, 942 - monc->m_auth->front_max); 942 + monc->m_auth->front_alloc_len); 943 943 if (ret <= 0) 944 944 return ret; /* either an error, or no need to authenticate */ 945 945 __send_prepared_auth_request(monc, ret);
+255 -28
net/ceph/osd_client.c
··· 338 338 msg_size = 4 + 4 + 8 + 8 + 4+8; 339 339 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 340 340 msg_size += 1 + 8 + 4 + 4; /* pg_t */ 341 - msg_size += 4 + MAX_OBJ_NAME_SIZE; 341 + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 342 342 msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); 343 343 msg_size += 8; /* snapid */ 344 344 msg_size += 8; /* snap_seq */ ··· 367 367 INIT_LIST_HEAD(&req->r_linger_osd); 368 368 INIT_LIST_HEAD(&req->r_req_lru_item); 369 369 INIT_LIST_HEAD(&req->r_osd_item); 370 + 371 + req->r_base_oloc.pool = -1; 372 + req->r_target_oloc.pool = -1; 370 373 371 374 /* create reply message */ 372 375 if (use_mempool) ··· 764 761 if (num_ops > 1) 765 762 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 766 763 767 - req->r_file_layout = *layout; /* keep a copy */ 764 + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 768 765 769 - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", 770 - vino.ino, objnum); 771 - req->r_oid_len = strlen(req->r_oid); 766 + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 767 + "%llx.%08llx", vino.ino, objnum); 768 + req->r_base_oid.name_len = strlen(req->r_base_oid.name); 772 769 773 770 return req; 774 771 } ··· 1047 1044 !ceph_con_opened(&osd->o_con)) { 1048 1045 struct ceph_osd_request *req; 1049 1046 1050 - dout(" osd addr hasn't changed and connection never opened," 1051 - " letting msgr retry"); 1047 + dout("osd addr hasn't changed and connection never opened, " 1048 + "letting msgr retry\n"); 1052 1049 /* touch each r_stamp for handle_timeout()'s benfit */ 1053 1050 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1054 1051 req->r_stamp = jiffies; ··· 1235 1232 EXPORT_SYMBOL(ceph_osdc_set_request_linger); 1236 1233 1237 1234 /* 1235 + * Returns whether a request should be blocked from being sent 1236 + * based on the current osdmap and osd_client settings. 1237 + * 1238 + * Caller should hold map_sem for read. 
1239 + */ 1240 + static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1241 + struct ceph_osd_request *req) 1242 + { 1243 + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1244 + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1245 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1246 + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1247 + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1248 + } 1249 + 1250 + /* 1251 + * Calculate mapping of a request to a PG. Takes tiering into account. 1252 + */ 1253 + static int __calc_request_pg(struct ceph_osdmap *osdmap, 1254 + struct ceph_osd_request *req, 1255 + struct ceph_pg *pg_out) 1256 + { 1257 + bool need_check_tiering; 1258 + 1259 + need_check_tiering = false; 1260 + if (req->r_target_oloc.pool == -1) { 1261 + req->r_target_oloc = req->r_base_oloc; /* struct */ 1262 + need_check_tiering = true; 1263 + } 1264 + if (req->r_target_oid.name_len == 0) { 1265 + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 1266 + need_check_tiering = true; 1267 + } 1268 + 1269 + if (need_check_tiering && 1270 + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 1271 + struct ceph_pg_pool_info *pi; 1272 + 1273 + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 1274 + if (pi) { 1275 + if ((req->r_flags & CEPH_OSD_FLAG_READ) && 1276 + pi->read_tier >= 0) 1277 + req->r_target_oloc.pool = pi->read_tier; 1278 + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 1279 + pi->write_tier >= 0) 1280 + req->r_target_oloc.pool = pi->write_tier; 1281 + } 1282 + /* !pi is caught in ceph_oloc_oid_to_pg() */ 1283 + } 1284 + 1285 + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 1286 + &req->r_target_oid, pg_out); 1287 + } 1288 + 1289 + /* 1238 1290 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct 1239 1291 * (as needed), and set the request r_osd appropriately. If there is 1240 1292 * no up osd, set r_osd to NULL. 
Move the request to the appropriate list ··· 1306 1248 int acting[CEPH_PG_MAX_SIZE]; 1307 1249 int o = -1, num = 0; 1308 1250 int err; 1251 + bool was_paused; 1309 1252 1310 1253 dout("map_request %p tid %lld\n", req, req->r_tid); 1311 - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, 1312 - ceph_file_layout_pg_pool(req->r_file_layout)); 1254 + 1255 + err = __calc_request_pg(osdc->osdmap, req, &pgid); 1313 1256 if (err) { 1314 1257 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1315 1258 return err; ··· 1323 1264 num = err; 1324 1265 } 1325 1266 1267 + was_paused = req->r_paused; 1268 + req->r_paused = __req_should_be_paused(osdc, req); 1269 + if (was_paused && !req->r_paused) 1270 + force_resend = 1; 1271 + 1326 1272 if ((!force_resend && 1327 1273 req->r_osd && req->r_osd->o_osd == o && 1328 1274 req->r_sent >= req->r_osd->o_incarnation && 1329 1275 req->r_num_pg_osds == num && 1330 1276 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || 1331 - (req->r_osd == NULL && o == -1)) 1277 + (req->r_osd == NULL && o == -1) || 1278 + req->r_paused) 1332 1279 return 0; /* no change */ 1333 1280 1334 1281 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", ··· 1396 1331 /* fill in message content that changes each time we send it */ 1397 1332 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 1398 1333 put_unaligned_le32(req->r_flags, req->r_request_flags); 1399 - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); 1334 + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); 1400 1335 p = req->r_request_pgid; 1401 1336 ceph_encode_64(&p, req->r_pgid.pool); 1402 1337 ceph_encode_32(&p, req->r_pgid.seed); ··· 1497 1432 round_jiffies_relative(delay)); 1498 1433 } 1499 1434 1435 + static int ceph_oloc_decode(void **p, void *end, 1436 + struct ceph_object_locator *oloc) 1437 + { 1438 + u8 struct_v, struct_cv; 1439 + u32 len; 1440 + void *struct_end; 1441 + int ret = 0; 1442 + 1443 + ceph_decode_need(p, 
end, 1 + 1 + 4, e_inval); 1444 + struct_v = ceph_decode_8(p); 1445 + struct_cv = ceph_decode_8(p); 1446 + if (struct_v < 3) { 1447 + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", 1448 + struct_v, struct_cv); 1449 + goto e_inval; 1450 + } 1451 + if (struct_cv > 6) { 1452 + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", 1453 + struct_v, struct_cv); 1454 + goto e_inval; 1455 + } 1456 + len = ceph_decode_32(p); 1457 + ceph_decode_need(p, end, len, e_inval); 1458 + struct_end = *p + len; 1459 + 1460 + oloc->pool = ceph_decode_64(p); 1461 + *p += 4; /* skip preferred */ 1462 + 1463 + len = ceph_decode_32(p); 1464 + if (len > 0) { 1465 + pr_warn("ceph_object_locator::key is set\n"); 1466 + goto e_inval; 1467 + } 1468 + 1469 + if (struct_v >= 5) { 1470 + len = ceph_decode_32(p); 1471 + if (len > 0) { 1472 + pr_warn("ceph_object_locator::nspace is set\n"); 1473 + goto e_inval; 1474 + } 1475 + } 1476 + 1477 + if (struct_v >= 6) { 1478 + s64 hash = ceph_decode_64(p); 1479 + if (hash != -1) { 1480 + pr_warn("ceph_object_locator::hash is set\n"); 1481 + goto e_inval; 1482 + } 1483 + } 1484 + 1485 + /* skip the rest */ 1486 + *p = struct_end; 1487 + out: 1488 + return ret; 1489 + 1490 + e_inval: 1491 + ret = -EINVAL; 1492 + goto out; 1493 + } 1494 + 1495 + static int ceph_redirect_decode(void **p, void *end, 1496 + struct ceph_request_redirect *redir) 1497 + { 1498 + u8 struct_v, struct_cv; 1499 + u32 len; 1500 + void *struct_end; 1501 + int ret; 1502 + 1503 + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); 1504 + struct_v = ceph_decode_8(p); 1505 + struct_cv = ceph_decode_8(p); 1506 + if (struct_cv > 1) { 1507 + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", 1508 + struct_v, struct_cv); 1509 + goto e_inval; 1510 + } 1511 + len = ceph_decode_32(p); 1512 + ceph_decode_need(p, end, len, e_inval); 1513 + struct_end = *p + len; 1514 + 1515 + ret = ceph_oloc_decode(p, end, &redir->oloc); 1516 + if (ret) 1517 + goto out; 1518 + 1519 + len = ceph_decode_32(p); 
1520 + if (len > 0) { 1521 + pr_warn("ceph_request_redirect::object_name is set\n"); 1522 + goto e_inval; 1523 + } 1524 + 1525 + len = ceph_decode_32(p); 1526 + *p += len; /* skip osd_instructions */ 1527 + 1528 + /* skip the rest */ 1529 + *p = struct_end; 1530 + out: 1531 + return ret; 1532 + 1533 + e_inval: 1534 + ret = -EINVAL; 1535 + goto out; 1536 + } 1537 + 1500 1538 static void complete_request(struct ceph_osd_request *req) 1501 1539 { 1502 1540 complete_all(&req->r_safe_completion); /* fsync waiter */ ··· 1614 1446 { 1615 1447 void *p, *end; 1616 1448 struct ceph_osd_request *req; 1449 + struct ceph_request_redirect redir; 1617 1450 u64 tid; 1618 1451 int object_len; 1619 1452 unsigned int numops; ··· 1694 1525 for (i = 0; i < numops; i++) 1695 1526 req->r_reply_op_result[i] = ceph_decode_32(&p); 1696 1527 1528 + if (le16_to_cpu(msg->hdr.version) >= 6) { 1529 + p += 8 + 4; /* skip replay_version */ 1530 + p += 8; /* skip user_version */ 1531 + 1532 + err = ceph_redirect_decode(&p, end, &redir); 1533 + if (err) 1534 + goto bad_put; 1535 + } else { 1536 + redir.oloc.pool = -1; 1537 + } 1538 + 1539 + if (redir.oloc.pool != -1) { 1540 + dout("redirect pool %lld\n", redir.oloc.pool); 1541 + 1542 + __unregister_request(osdc, req); 1543 + mutex_unlock(&osdc->request_mutex); 1544 + 1545 + req->r_target_oloc = redir.oloc; /* struct */ 1546 + 1547 + /* 1548 + * Start redirect requests with nofail=true. If 1549 + * mapping fails, request will end up on the notarget 1550 + * list, waiting for the new osdmap (which can take 1551 + * a while), even though the original request mapped 1552 + * successfully. In the future we might want to follow 1553 + * original request's nofail setting here. 
1554 + */ 1555 + err = ceph_osdc_start_request(osdc, req, true); 1556 + BUG_ON(err); 1557 + 1558 + goto done; 1559 + } 1560 + 1697 1561 already_completed = req->r_got_reply; 1698 - 1699 1562 if (!req->r_got_reply) { 1700 - 1701 1563 req->r_result = result; 1702 1564 dout("handle_reply result %d bytes %d\n", req->r_result, 1703 1565 bytes); ··· 1781 1581 return; 1782 1582 1783 1583 bad_put: 1584 + req->r_result = -EIO; 1585 + __unregister_request(osdc, req); 1586 + if (req->r_callback) 1587 + req->r_callback(req, msg); 1588 + else 1589 + complete_all(&req->r_completion); 1590 + complete_request(req); 1784 1591 ceph_osdc_put_request(req); 1785 1592 bad_mutex: 1786 1593 mutex_unlock(&osdc->request_mutex); ··· 1820 1613 * 1821 1614 * Caller should hold map_sem for read. 1822 1615 */ 1823 - static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1616 + static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 1617 + bool force_resend_writes) 1824 1618 { 1825 1619 struct ceph_osd_request *req, *nreq; 1826 1620 struct rb_node *p; 1827 1621 int needmap = 0; 1828 1622 int err; 1623 + bool force_resend_req; 1829 1624 1830 - dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); 1625 + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 1626 + force_resend_writes ? 
" (force resend writes)" : ""); 1831 1627 mutex_lock(&osdc->request_mutex); 1832 1628 for (p = rb_first(&osdc->requests); p; ) { 1833 1629 req = rb_entry(p, struct ceph_osd_request, r_node); ··· 1855 1645 continue; 1856 1646 } 1857 1647 1858 - err = __map_request(osdc, req, force_resend); 1648 + force_resend_req = force_resend || 1649 + (force_resend_writes && 1650 + req->r_flags & CEPH_OSD_FLAG_WRITE); 1651 + err = __map_request(osdc, req, force_resend_req); 1859 1652 if (err < 0) 1860 1653 continue; /* error */ 1861 1654 if (req->r_osd == NULL) { ··· 1878 1665 r_linger_item) { 1879 1666 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1880 1667 1881 - err = __map_request(osdc, req, force_resend); 1668 + err = __map_request(osdc, req, 1669 + force_resend || force_resend_writes); 1882 1670 dout("__map_request returned %d\n", err); 1883 1671 if (err == 0) 1884 1672 continue; /* no change and no osd was specified */ ··· 1921 1707 struct ceph_osdmap *newmap = NULL, *oldmap; 1922 1708 int err; 1923 1709 struct ceph_fsid fsid; 1710 + bool was_full; 1924 1711 1925 1712 dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); 1926 1713 p = msg->front.iov_base; ··· 1934 1719 return; 1935 1720 1936 1721 down_write(&osdc->map_sem); 1722 + 1723 + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1937 1724 1938 1725 /* incremental maps */ 1939 1726 ceph_decode_32_safe(&p, end, nr_maps, bad); ··· 1961 1744 ceph_osdmap_destroy(osdc->osdmap); 1962 1745 osdc->osdmap = newmap; 1963 1746 } 1964 - kick_requests(osdc, 0); 1747 + was_full = was_full || 1748 + ceph_osdmap_flag(osdc->osdmap, 1749 + CEPH_OSDMAP_FULL); 1750 + kick_requests(osdc, 0, was_full); 1965 1751 } else { 1966 1752 dout("ignoring incremental map %u len %d\n", 1967 1753 epoch, maplen); ··· 2007 1787 skipped_map = 1; 2008 1788 ceph_osdmap_destroy(oldmap); 2009 1789 } 2010 - kick_requests(osdc, skipped_map); 1790 + was_full = was_full || 1791 + ceph_osdmap_flag(osdc->osdmap, 1792 + CEPH_OSDMAP_FULL); 1793 + kick_requests(osdc, skipped_map, was_full); 2011 1794 } 2012 1795 p += maplen; 2013 1796 nr_maps--; ··· 2027 1804 * we find out when we are no longer full and stop returning 2028 1805 * ENOSPC. 
2029 1806 */ 2030 - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 1807 + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 1808 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 1809 + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 2031 1810 ceph_monc_request_next_osdmap(&osdc->client->monc); 2032 1811 2033 1812 mutex_lock(&osdc->request_mutex); ··· 2293 2068 ceph_encode_32(&p, -1); /* preferred */ 2294 2069 2295 2070 /* oid */ 2296 - ceph_encode_32(&p, req->r_oid_len); 2297 - memcpy(p, req->r_oid, req->r_oid_len); 2298 - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); 2299 - p += req->r_oid_len; 2071 + ceph_encode_32(&p, req->r_base_oid.name_len); 2072 + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 2073 + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 2074 + req->r_base_oid.name, req->r_base_oid.name_len); 2075 + p += req->r_base_oid.name_len; 2300 2076 2301 2077 /* ops--can imply data */ 2302 2078 ceph_encode_16(&p, (u16)req->r_num_ops); ··· 2680 2454 struct ceph_osd_client *osdc = osd->o_osdc; 2681 2455 struct ceph_msg *m; 2682 2456 struct ceph_osd_request *req; 2683 - int front = le32_to_cpu(hdr->front_len); 2457 + int front_len = le32_to_cpu(hdr->front_len); 2684 2458 int data_len = le32_to_cpu(hdr->data_len); 2685 2459 u64 tid; 2686 2460 ··· 2700 2474 req->r_reply, req->r_reply->con); 2701 2475 ceph_msg_revoke_incoming(req->r_reply); 2702 2476 2703 - if (front > req->r_reply->front.iov_len) { 2477 + if (front_len > req->r_reply->front_alloc_len) { 2704 2478 pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", 2705 - front, (int)req->r_reply->front.iov_len, 2479 + front_len, req->r_reply->front_alloc_len, 2706 2480 (unsigned int)con->peer_name.type, 2707 2481 le64_to_cpu(con->peer_name.num)); 2708 - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); 2482 + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 2483 + false); 2709 2484 if (!m) 2710 2485 goto 
out; 2711 2486 ceph_msg_put(req->r_reply);
+60 -18
net/ceph/osdmap.c
··· 464 464 return NULL; 465 465 } 466 466 467 + struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) 468 + { 469 + return __lookup_pg_pool(&map->pg_pools, id); 470 + } 471 + 467 472 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) 468 473 { 469 474 struct ceph_pg_pool_info *pi; ··· 519 514 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); 520 515 return -EINVAL; 521 516 } 522 - if (cv > 7) { 523 - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); 517 + if (cv > 9) { 518 + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); 524 519 return -EINVAL; 525 520 } 526 521 len = ceph_decode_32(p); ··· 548 543 *p += len; 549 544 } 550 545 551 - /* skip removed snaps */ 546 + /* skip removed_snaps */ 552 547 num = ceph_decode_32(p); 553 548 *p += num * (8 + 8); 554 549 555 550 *p += 8; /* skip auid */ 556 551 pi->flags = ceph_decode_64(p); 552 + *p += 4; /* skip crash_replay_interval */ 553 + 554 + if (ev >= 7) 555 + *p += 1; /* skip min_size */ 556 + 557 + if (ev >= 8) 558 + *p += 8 + 8; /* skip quota_max_* */ 559 + 560 + if (ev >= 9) { 561 + /* skip tiers */ 562 + num = ceph_decode_32(p); 563 + *p += num * 8; 564 + 565 + *p += 8; /* skip tier_of */ 566 + *p += 1; /* skip cache_mode */ 567 + 568 + pi->read_tier = ceph_decode_64(p); 569 + pi->write_tier = ceph_decode_64(p); 570 + } else { 571 + pi->read_tier = -1; 572 + pi->write_tier = -1; 573 + } 557 574 558 575 /* ignore the rest */ 559 576 ··· 1117 1090 EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1118 1091 1119 1092 /* 1120 - * calculate an object layout (i.e. pgid) from an oid, 1121 - * file_layout, and osdmap 1093 + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1094 + * called with target's (oloc, oid), since tiering isn't taken into 1095 + * account. 
1122 1096 */ 1123 - int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, 1124 - struct ceph_osdmap *osdmap, uint64_t pool) 1097 + int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1098 + struct ceph_object_locator *oloc, 1099 + struct ceph_object_id *oid, 1100 + struct ceph_pg *pg_out) 1125 1101 { 1126 - struct ceph_pg_pool_info *pool_info; 1102 + struct ceph_pg_pool_info *pi; 1127 1103 1128 - BUG_ON(!osdmap); 1129 - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); 1130 - if (!pool_info) 1104 + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1105 + if (!pi) 1131 1106 return -EIO; 1132 - pg->pool = pool; 1133 - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); 1134 1107 1135 - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); 1108 + pg_out->pool = oloc->pool; 1109 + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1110 + oid->name_len); 1111 + 1112 + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1113 + pg_out->pool, pg_out->seed); 1136 1114 return 0; 1137 1115 } 1138 - EXPORT_SYMBOL(ceph_calc_ceph_pg); 1116 + EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1117 + 1118 + static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, 1119 + int *result, int result_max, 1120 + const __u32 *weight, int weight_max) 1121 + { 1122 + int scratch[result_max * 3]; 1123 + 1124 + return crush_do_rule(map, ruleno, x, result, result_max, 1125 + weight, weight_max, scratch); 1126 + } 1139 1127 1140 1128 /* 1141 1129 * Calculate raw osd vector for the given pgid. 
Return pointer to osd ··· 1205 1163 pool->pgp_num_mask) + 1206 1164 (unsigned)pgid.pool; 1207 1165 } 1208 - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1209 - min_t(int, pool->size, *num), 1210 - osdmap->osd_weight); 1166 + r = crush_do_rule_ary(osdmap->crush, ruleno, pps, 1167 + osds, min_t(int, pool->size, *num), 1168 + osdmap->osd_weight, osdmap->max_osd); 1211 1169 if (r < 0) { 1212 1170 pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1213 1171 " size %d\n", r, pgid.pool, pool->crush_ruleset,