
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph fixes from Sage Weil:
"Yes, this is a much larger pull than I would like after -rc1. There
are a few things included:

- a few fixes for leaks and incorrect assertions
- a few patches fixing behavior when mapped images are resized
- handling for cloned/layered images that are flattened out from
underneath the client

The last bit was non-trivial, and there is some code movement and
associated cleanup mixed in. This was ready and was meant to go in
last week but I missed the boat on Friday. My only excuse is that I
was waiting for an all clear from the testing and there were many
other shiny things to distract me.

Strictly speaking, handling the flatten case isn't a regression and
could wait, so if you like we can try to pull the series apart, but
Alex and I would much prefer to have it all in as it is a case real
users will hit with 3.10."
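
The parent-teardown handling described above leans on a pair of saturating reference counters that the series adds to drivers/block/rbd.c, atomic_inc_return_safe() and atomic_dec_return_safe(); they appear in the diff below. What follows is a minimal userspace sketch of the same idea, assuming C11 stdatomic; the return conventions are simplified (-1 on a refused step) and do not match the kernel helpers exactly.

#include <stdatomic.h>
#include <limits.h>
#include <stdio.h>

/*
 * Userspace analogue of the saturating counters this series adds to
 * drivers/block/rbd.c.  The kernel versions are built on atomic_t;
 * this sketch uses C11 atomics and simplified return values.
 */

/* Add one unless the counter is already 0 (object going away) or at
 * INT_MAX (would overflow); return the new value, or -1 if refused. */
static int inc_return_safe(atomic_int *v)
{
        int old = atomic_load(v);

        do {
                if (old == 0 || old == INT_MAX)
                        return -1;
        } while (!atomic_compare_exchange_weak(v, &old, old + 1));

        return old + 1;
}

/* Subtract one unless the counter is already 0 (would underflow);
 * return the new value, or -1 if refused. */
static int dec_return_safe(atomic_int *v)
{
        int old = atomic_load(v);

        do {
                if (old == 0)
                        return -1;
        } while (!atomic_compare_exchange_weak(v, &old, old - 1));

        return old - 1;
}

int main(void)
{
        atomic_int refs = 1;

        printf("%d\n", inc_return_safe(&refs));  /* 2 */
        printf("%d\n", dec_return_safe(&refs));  /* 1 */
        printf("%d\n", dec_return_safe(&refs));  /* 0 */
        printf("%d\n", inc_return_safe(&refs));  /* -1: already at zero */
        return 0;
}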

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (33 commits)
rbd: re-submit flattened write request (part 2)
rbd: re-submit write request for flattened clone
rbd: re-submit read request for flattened clone
rbd: detect when clone image is flattened
rbd: reference count parent requests
rbd: define parent image request routines
rbd: define rbd_dev_unparent()
rbd: don't release write request until necessary
rbd: get parent info on refresh
rbd: ignore zero-overlap parent
rbd: support reading parent page data for writes
rbd: fix parent request size assumption
libceph: init sent and completed when starting
rbd: kill rbd_img_request_get()
rbd: only set up watch for mapped images
rbd: set mapping read-only flag in rbd_add()
rbd: support reading parent page data
rbd: fix an incorrect assertion condition
rbd: define rbd_dev_v2_header_info()
rbd: get rid of trivial v1 header wrappers
...
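
The "re-submit" commits at the top of the list share one pattern: when a request that was routed toward the parent image completes and the parent overlap has meanwhile dropped to zero (the clone was flattened out from underneath the client), the original request is sent straight back to the OSDs as if the image had no parent. A rough standalone sketch of that control flow follows; every name in it is an invented placeholder, not rbd.c code.

#include <stdio.h>

/*
 * Rough sketch only.  All types and helpers here are invented
 * placeholders: the point is that a parent-directed completion checks
 * the overlap, and if the image was flattened in the meantime it
 * simply re-submits the original request directly.
 */
struct toy_request { const char *name; };

static void submit_to_osd(struct toy_request *req)
{
        printf("submitting %s directly to the OSD\n", req->name);
}

static void finish_with_copyup(struct toy_request *req)
{
        printf("building copyup + write for %s\n", req->name);
}

/* Completion handler for the parent read that backs a copyup. */
static void parent_read_done(struct toy_request *orig,
                             unsigned long long parent_overlap)
{
        if (parent_overlap == 0) {
                /* Flattened while the parent read was in flight. */
                submit_to_osd(orig);
                return;
        }
        finish_with_copyup(orig);
}

int main(void)
{
        struct toy_request write_req = { "write request" };

        parent_read_done(&write_req, 0);        /* flattened: re-submit */
        parent_read_done(&write_req, 4 << 20);  /* normal: copyup path */
        return 0;
}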

Total: +558 -392

drivers/block/rbd.c (+554 -391)
··· 55 55 #define SECTOR_SHIFT 9 56 56 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 57 57 58 + /* 59 + * Increment the given counter and return its updated value. 60 + * If the counter is already 0 it will not be incremented. 61 + * If the counter is already at its maximum value returns 62 + * -EINVAL without updating it. 63 + */ 64 + static int atomic_inc_return_safe(atomic_t *v) 65 + { 66 + unsigned int counter; 67 + 68 + counter = (unsigned int)__atomic_add_unless(v, 1, 0); 69 + if (counter <= (unsigned int)INT_MAX) 70 + return (int)counter; 71 + 72 + atomic_dec(v); 73 + 74 + return -EINVAL; 75 + } 76 + 77 + /* Decrement the counter. Return the resulting value, or -EINVAL */ 78 + static int atomic_dec_return_safe(atomic_t *v) 79 + { 80 + int counter; 81 + 82 + counter = atomic_dec_return(v); 83 + if (counter >= 0) 84 + return counter; 85 + 86 + atomic_inc(v); 87 + 88 + return -EINVAL; 89 + } 90 + 58 91 #define RBD_DRV_NAME "rbd" 59 92 #define RBD_DRV_NAME_LONG "rbd (rados block device)" 60 93 ··· 133 100 * block device image metadata (in-memory version) 134 101 */ 135 102 struct rbd_image_header { 136 - /* These four fields never change for a given rbd image */ 103 + /* These six fields never change for a given rbd image */ 137 104 char *object_prefix; 138 - u64 features; 139 105 __u8 obj_order; 140 106 __u8 crypt_type; 141 107 __u8 comp_type; 108 + u64 stripe_unit; 109 + u64 stripe_count; 110 + u64 features; /* Might be changeable someday? */ 142 111 143 112 /* The remaining fields need to be updated occasionally */ 144 113 u64 image_size; 145 114 struct ceph_snap_context *snapc; 146 - char *snap_names; 147 - u64 *snap_sizes; 148 - 149 - u64 stripe_unit; 150 - u64 stripe_count; 115 + char *snap_names; /* format 1 only */ 116 + u64 *snap_sizes; /* format 1 only */ 151 117 }; 152 118 153 119 /* ··· 257 225 }; 258 226 }; 259 227 struct page **copyup_pages; 228 + u32 copyup_page_count; 260 229 261 230 struct ceph_osd_request *osd_req; 262 231 ··· 290 257 struct rbd_obj_request *obj_request; /* obj req initiator */ 291 258 }; 292 259 struct page **copyup_pages; 260 + u32 copyup_page_count; 293 261 spinlock_t completion_lock;/* protects next_completion */ 294 262 u32 next_completion; 295 263 rbd_img_callback_t callback; ··· 345 311 346 312 struct rbd_spec *parent_spec; 347 313 u64 parent_overlap; 314 + atomic_t parent_ref; 348 315 struct rbd_device *parent; 349 316 350 317 /* protects updating the header */ ··· 394 359 size_t count); 395 360 static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 396 361 size_t count); 397 - static int rbd_dev_image_probe(struct rbd_device *rbd_dev); 362 + static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 363 + static void rbd_spec_put(struct rbd_spec *spec); 398 364 399 365 static struct bus_attribute rbd_bus_attrs[] = { 400 366 __ATTR(add, S_IWUSR, NULL, rbd_add), ··· 462 426 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 463 427 464 428 static int rbd_dev_refresh(struct rbd_device *rbd_dev); 465 - static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); 429 + static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 430 + static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 466 431 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 467 432 u64 snap_id); 468 433 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ··· 763 726 } 764 727 765 728 /* 766 - * Create a new header structure, translate header format from the on-disk 767 - * header. 
729 + * Fill an rbd image header with information from the given format 1 730 + * on-disk header. 768 731 */ 769 - static int rbd_header_from_disk(struct rbd_image_header *header, 732 + static int rbd_header_from_disk(struct rbd_device *rbd_dev, 770 733 struct rbd_image_header_ondisk *ondisk) 771 734 { 735 + struct rbd_image_header *header = &rbd_dev->header; 736 + bool first_time = header->object_prefix == NULL; 737 + struct ceph_snap_context *snapc; 738 + char *object_prefix = NULL; 739 + char *snap_names = NULL; 740 + u64 *snap_sizes = NULL; 772 741 u32 snap_count; 773 - size_t len; 774 742 size_t size; 743 + int ret = -ENOMEM; 775 744 u32 i; 776 745 777 - memset(header, 0, sizeof (*header)); 746 + /* Allocate this now to avoid having to handle failure below */ 747 + 748 + if (first_time) { 749 + size_t len; 750 + 751 + len = strnlen(ondisk->object_prefix, 752 + sizeof (ondisk->object_prefix)); 753 + object_prefix = kmalloc(len + 1, GFP_KERNEL); 754 + if (!object_prefix) 755 + return -ENOMEM; 756 + memcpy(object_prefix, ondisk->object_prefix, len); 757 + object_prefix[len] = '\0'; 758 + } 759 + 760 + /* Allocate the snapshot context and fill it in */ 778 761 779 762 snap_count = le32_to_cpu(ondisk->snap_count); 780 - 781 - len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 782 - header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 783 - if (!header->object_prefix) 784 - return -ENOMEM; 785 - memcpy(header->object_prefix, ondisk->object_prefix, len); 786 - header->object_prefix[len] = '\0'; 787 - 763 + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 764 + if (!snapc) 765 + goto out_err; 766 + snapc->seq = le64_to_cpu(ondisk->snap_seq); 788 767 if (snap_count) { 768 + struct rbd_image_snap_ondisk *snaps; 789 769 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 790 770 791 - /* Save a copy of the snapshot names */ 771 + /* We'll keep a copy of the snapshot names... */ 792 772 793 - if (snap_names_len > (u64) SIZE_MAX) 794 - return -EIO; 795 - header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 796 - if (!header->snap_names) 773 + if (snap_names_len > (u64)SIZE_MAX) 774 + goto out_2big; 775 + snap_names = kmalloc(snap_names_len, GFP_KERNEL); 776 + if (!snap_names) 797 777 goto out_err; 778 + 779 + /* ...as well as the array of their sizes. */ 780 + 781 + size = snap_count * sizeof (*header->snap_sizes); 782 + snap_sizes = kmalloc(size, GFP_KERNEL); 783 + if (!snap_sizes) 784 + goto out_err; 785 + 798 786 /* 799 - * Note that rbd_dev_v1_header_read() guarantees 800 - * the ondisk buffer we're working with has 787 + * Copy the names, and fill in each snapshot's id 788 + * and size. 789 + * 790 + * Note that rbd_dev_v1_header_info() guarantees the 791 + * ondisk buffer we're working with has 801 792 * snap_names_len bytes beyond the end of the 802 793 * snapshot id array, this memcpy() is safe. 
803 794 */ 804 - memcpy(header->snap_names, &ondisk->snaps[snap_count], 805 - snap_names_len); 806 - 807 - /* Record each snapshot's size */ 808 - 809 - size = snap_count * sizeof (*header->snap_sizes); 810 - header->snap_sizes = kmalloc(size, GFP_KERNEL); 811 - if (!header->snap_sizes) 812 - goto out_err; 813 - for (i = 0; i < snap_count; i++) 814 - header->snap_sizes[i] = 815 - le64_to_cpu(ondisk->snaps[i].image_size); 816 - } else { 817 - header->snap_names = NULL; 818 - header->snap_sizes = NULL; 795 + memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 796 + snaps = ondisk->snaps; 797 + for (i = 0; i < snap_count; i++) { 798 + snapc->snaps[i] = le64_to_cpu(snaps[i].id); 799 + snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 800 + } 819 801 } 820 802 821 - header->features = 0; /* No features support in v1 images */ 822 - header->obj_order = ondisk->options.order; 823 - header->crypt_type = ondisk->options.crypt_type; 824 - header->comp_type = ondisk->options.comp_type; 803 + /* We won't fail any more, fill in the header */ 825 804 826 - /* Allocate and fill in the snapshot context */ 805 + down_write(&rbd_dev->header_rwsem); 806 + if (first_time) { 807 + header->object_prefix = object_prefix; 808 + header->obj_order = ondisk->options.order; 809 + header->crypt_type = ondisk->options.crypt_type; 810 + header->comp_type = ondisk->options.comp_type; 811 + /* The rest aren't used for format 1 images */ 812 + header->stripe_unit = 0; 813 + header->stripe_count = 0; 814 + header->features = 0; 815 + } else { 816 + ceph_put_snap_context(header->snapc); 817 + kfree(header->snap_names); 818 + kfree(header->snap_sizes); 819 + } 820 + 821 + /* The remaining fields always get updated (when we refresh) */ 827 822 828 823 header->image_size = le64_to_cpu(ondisk->image_size); 824 + header->snapc = snapc; 825 + header->snap_names = snap_names; 826 + header->snap_sizes = snap_sizes; 829 827 830 - header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 831 - if (!header->snapc) 832 - goto out_err; 833 - header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 834 - for (i = 0; i < snap_count; i++) 835 - header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); 828 + /* Make sure mapping size is consistent with header info */ 829 + 830 + if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 831 + if (rbd_dev->mapping.size != header->image_size) 832 + rbd_dev->mapping.size = header->image_size; 833 + 834 + up_write(&rbd_dev->header_rwsem); 836 835 837 836 return 0; 838 - 837 + out_2big: 838 + ret = -EIO; 839 839 out_err: 840 - kfree(header->snap_sizes); 841 - header->snap_sizes = NULL; 842 - kfree(header->snap_names); 843 - header->snap_names = NULL; 844 - kfree(header->object_prefix); 845 - header->object_prefix = NULL; 840 + kfree(snap_sizes); 841 + kfree(snap_names); 842 + ceph_put_snap_context(snapc); 843 + kfree(object_prefix); 846 844 847 - return -ENOMEM; 845 + return ret; 848 846 } 849 847 850 848 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) ··· 1006 934 1007 935 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1008 936 { 1009 - const char *snap_name = rbd_dev->spec->snap_name; 1010 - u64 snap_id; 937 + u64 snap_id = rbd_dev->spec->snap_id; 1011 938 u64 size = 0; 1012 939 u64 features = 0; 1013 940 int ret; 1014 - 1015 - if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { 1016 - snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); 1017 - if (snap_id == CEPH_NOSNAP) 1018 - return -ENOENT; 1019 - } else { 1020 - snap_id = CEPH_NOSNAP; 1021 
- } 1022 941 1023 942 ret = rbd_snap_size(rbd_dev, snap_id, &size); 1024 943 if (ret) ··· 1021 958 rbd_dev->mapping.size = size; 1022 959 rbd_dev->mapping.features = features; 1023 960 1024 - /* If we are mapping a snapshot it must be marked read-only */ 1025 - 1026 - if (snap_id != CEPH_NOSNAP) 1027 - rbd_dev->mapping.read_only = true; 1028 - 1029 961 return 0; 1030 962 } 1031 963 ··· 1028 970 { 1029 971 rbd_dev->mapping.size = 0; 1030 972 rbd_dev->mapping.features = 0; 1031 - rbd_dev->mapping.read_only = true; 1032 - } 1033 - 1034 - static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) 1035 - { 1036 - rbd_dev->mapping.size = 0; 1037 - rbd_dev->mapping.features = 0; 1038 - rbd_dev->mapping.read_only = true; 1039 973 } 1040 974 1041 975 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) ··· 1392 1342 kref_put(&obj_request->kref, rbd_obj_request_destroy); 1393 1343 } 1394 1344 1395 - static void rbd_img_request_get(struct rbd_img_request *img_request) 1396 - { 1397 - dout("%s: img %p (was %d)\n", __func__, img_request, 1398 - atomic_read(&img_request->kref.refcount)); 1399 - kref_get(&img_request->kref); 1400 - } 1401 - 1345 + static bool img_request_child_test(struct rbd_img_request *img_request); 1346 + static void rbd_parent_request_destroy(struct kref *kref); 1402 1347 static void rbd_img_request_destroy(struct kref *kref); 1403 1348 static void rbd_img_request_put(struct rbd_img_request *img_request) 1404 1349 { 1405 1350 rbd_assert(img_request != NULL); 1406 1351 dout("%s: img %p (was %d)\n", __func__, img_request, 1407 1352 atomic_read(&img_request->kref.refcount)); 1408 - kref_put(&img_request->kref, rbd_img_request_destroy); 1353 + if (img_request_child_test(img_request)) 1354 + kref_put(&img_request->kref, rbd_parent_request_destroy); 1355 + else 1356 + kref_put(&img_request->kref, rbd_img_request_destroy); 1409 1357 } 1410 1358 1411 1359 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, ··· 1520 1472 smp_mb(); 1521 1473 } 1522 1474 1475 + static void img_request_child_clear(struct rbd_img_request *img_request) 1476 + { 1477 + clear_bit(IMG_REQ_CHILD, &img_request->flags); 1478 + smp_mb(); 1479 + } 1480 + 1523 1481 static bool img_request_child_test(struct rbd_img_request *img_request) 1524 1482 { 1525 1483 smp_mb(); ··· 1535 1481 static void img_request_layered_set(struct rbd_img_request *img_request) 1536 1482 { 1537 1483 set_bit(IMG_REQ_LAYERED, &img_request->flags); 1484 + smp_mb(); 1485 + } 1486 + 1487 + static void img_request_layered_clear(struct rbd_img_request *img_request) 1488 + { 1489 + clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1538 1490 smp_mb(); 1539 1491 } 1540 1492 ··· 1887 1827 kmem_cache_free(rbd_obj_request_cache, obj_request); 1888 1828 } 1889 1829 1830 + /* It's OK to call this for a device with no parent */ 1831 + 1832 + static void rbd_spec_put(struct rbd_spec *spec); 1833 + static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1834 + { 1835 + rbd_dev_remove_parent(rbd_dev); 1836 + rbd_spec_put(rbd_dev->parent_spec); 1837 + rbd_dev->parent_spec = NULL; 1838 + rbd_dev->parent_overlap = 0; 1839 + } 1840 + 1841 + /* 1842 + * Parent image reference counting is used to determine when an 1843 + * image's parent fields can be safely torn down--after there are no 1844 + * more in-flight requests to the parent image. When the last 1845 + * reference is dropped, cleaning them up is safe. 
1846 + */ 1847 + static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1848 + { 1849 + int counter; 1850 + 1851 + if (!rbd_dev->parent_spec) 1852 + return; 1853 + 1854 + counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1855 + if (counter > 0) 1856 + return; 1857 + 1858 + /* Last reference; clean up parent data structures */ 1859 + 1860 + if (!counter) 1861 + rbd_dev_unparent(rbd_dev); 1862 + else 1863 + rbd_warn(rbd_dev, "parent reference underflow\n"); 1864 + } 1865 + 1866 + /* 1867 + * If an image has a non-zero parent overlap, get a reference to its 1868 + * parent. 1869 + * 1870 + * We must get the reference before checking for the overlap to 1871 + * coordinate properly with zeroing the parent overlap in 1872 + * rbd_dev_v2_parent_info() when an image gets flattened. We 1873 + * drop it again if there is no overlap. 1874 + * 1875 + * Returns true if the rbd device has a parent with a non-zero 1876 + * overlap and a reference for it was successfully taken, or 1877 + * false otherwise. 1878 + */ 1879 + static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1880 + { 1881 + int counter; 1882 + 1883 + if (!rbd_dev->parent_spec) 1884 + return false; 1885 + 1886 + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1887 + if (counter > 0 && rbd_dev->parent_overlap) 1888 + return true; 1889 + 1890 + /* Image was flattened, but parent is not yet torn down */ 1891 + 1892 + if (counter < 0) 1893 + rbd_warn(rbd_dev, "parent reference overflow\n"); 1894 + 1895 + return false; 1896 + } 1897 + 1890 1898 /* 1891 1899 * Caller is responsible for filling in the list of object requests 1892 1900 * that comprises the image request, and the Linux request pointer ··· 1963 1835 static struct rbd_img_request *rbd_img_request_create( 1964 1836 struct rbd_device *rbd_dev, 1965 1837 u64 offset, u64 length, 1966 - bool write_request, 1967 - bool child_request) 1838 + bool write_request) 1968 1839 { 1969 1840 struct rbd_img_request *img_request; 1970 1841 ··· 1988 1861 } else { 1989 1862 img_request->snap_id = rbd_dev->spec->snap_id; 1990 1863 } 1991 - if (child_request) 1992 - img_request_child_set(img_request); 1993 - if (rbd_dev->parent_spec) 1864 + if (rbd_dev_parent_get(rbd_dev)) 1994 1865 img_request_layered_set(img_request); 1995 1866 spin_lock_init(&img_request->completion_lock); 1996 1867 img_request->next_completion = 0; ··· 1997 1872 img_request->obj_request_count = 0; 1998 1873 INIT_LIST_HEAD(&img_request->obj_requests); 1999 1874 kref_init(&img_request->kref); 2000 - 2001 - rbd_img_request_get(img_request); /* Avoid a warning */ 2002 - rbd_img_request_put(img_request); /* TEMPORARY */ 2003 1875 2004 1876 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 2005 1877 write_request ? 
"write" : "read", offset, length, ··· 2019 1897 rbd_img_obj_request_del(img_request, obj_request); 2020 1898 rbd_assert(img_request->obj_request_count == 0); 2021 1899 1900 + if (img_request_layered_test(img_request)) { 1901 + img_request_layered_clear(img_request); 1902 + rbd_dev_parent_put(img_request->rbd_dev); 1903 + } 1904 + 2022 1905 if (img_request_write_test(img_request)) 2023 1906 ceph_put_snap_context(img_request->snapc); 2024 1907 2025 - if (img_request_child_test(img_request)) 2026 - rbd_obj_request_put(img_request->obj_request); 2027 - 2028 1908 kmem_cache_free(rbd_img_request_cache, img_request); 1909 + } 1910 + 1911 + static struct rbd_img_request *rbd_parent_request_create( 1912 + struct rbd_obj_request *obj_request, 1913 + u64 img_offset, u64 length) 1914 + { 1915 + struct rbd_img_request *parent_request; 1916 + struct rbd_device *rbd_dev; 1917 + 1918 + rbd_assert(obj_request->img_request); 1919 + rbd_dev = obj_request->img_request->rbd_dev; 1920 + 1921 + parent_request = rbd_img_request_create(rbd_dev->parent, 1922 + img_offset, length, false); 1923 + if (!parent_request) 1924 + return NULL; 1925 + 1926 + img_request_child_set(parent_request); 1927 + rbd_obj_request_get(obj_request); 1928 + parent_request->obj_request = obj_request; 1929 + 1930 + return parent_request; 1931 + } 1932 + 1933 + static void rbd_parent_request_destroy(struct kref *kref) 1934 + { 1935 + struct rbd_img_request *parent_request; 1936 + struct rbd_obj_request *orig_request; 1937 + 1938 + parent_request = container_of(kref, struct rbd_img_request, kref); 1939 + orig_request = parent_request->obj_request; 1940 + 1941 + parent_request->obj_request = NULL; 1942 + rbd_obj_request_put(orig_request); 1943 + img_request_child_clear(parent_request); 1944 + 1945 + rbd_img_request_destroy(kref); 2029 1946 } 2030 1947 2031 1948 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) ··· 2275 2114 { 2276 2115 struct rbd_img_request *img_request; 2277 2116 struct rbd_device *rbd_dev; 2278 - u64 length; 2117 + struct page **pages; 2279 2118 u32 page_count; 2280 2119 2281 2120 rbd_assert(obj_request->type == OBJ_REQUEST_BIO); ··· 2285 2124 2286 2125 rbd_dev = img_request->rbd_dev; 2287 2126 rbd_assert(rbd_dev); 2288 - length = (u64)1 << rbd_dev->header.obj_order; 2289 - page_count = (u32)calc_pages_for(0, length); 2290 2127 2291 - rbd_assert(obj_request->copyup_pages); 2292 - ceph_release_page_vector(obj_request->copyup_pages, page_count); 2128 + pages = obj_request->copyup_pages; 2129 + rbd_assert(pages != NULL); 2293 2130 obj_request->copyup_pages = NULL; 2131 + page_count = obj_request->copyup_page_count; 2132 + rbd_assert(page_count); 2133 + obj_request->copyup_page_count = 0; 2134 + ceph_release_page_vector(pages, page_count); 2294 2135 2295 2136 /* 2296 2137 * We want the transfer count to reflect the size of the ··· 2316 2153 struct ceph_osd_client *osdc; 2317 2154 struct rbd_device *rbd_dev; 2318 2155 struct page **pages; 2319 - int result; 2320 - u64 obj_size; 2321 - u64 xferred; 2156 + u32 page_count; 2157 + int img_result; 2158 + u64 parent_length; 2159 + u64 offset; 2160 + u64 length; 2322 2161 2323 2162 rbd_assert(img_request_child_test(img_request)); 2324 2163 ··· 2329 2164 pages = img_request->copyup_pages; 2330 2165 rbd_assert(pages != NULL); 2331 2166 img_request->copyup_pages = NULL; 2167 + page_count = img_request->copyup_page_count; 2168 + rbd_assert(page_count); 2169 + img_request->copyup_page_count = 0; 2332 2170 2333 2171 orig_request = img_request->obj_request; 2334 2172 
rbd_assert(orig_request != NULL); 2335 - rbd_assert(orig_request->type == OBJ_REQUEST_BIO); 2336 - result = img_request->result; 2337 - obj_size = img_request->length; 2338 - xferred = img_request->xferred; 2339 - 2340 - rbd_dev = img_request->rbd_dev; 2341 - rbd_assert(rbd_dev); 2342 - rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); 2343 - 2173 + rbd_assert(obj_request_type_valid(orig_request->type)); 2174 + img_result = img_request->result; 2175 + parent_length = img_request->length; 2176 + rbd_assert(parent_length == img_request->xferred); 2344 2177 rbd_img_request_put(img_request); 2345 2178 2346 - if (result) 2179 + rbd_assert(orig_request->img_request); 2180 + rbd_dev = orig_request->img_request->rbd_dev; 2181 + rbd_assert(rbd_dev); 2182 + 2183 + /* 2184 + * If the overlap has become 0 (most likely because the 2185 + * image has been flattened) we need to free the pages 2186 + * and re-submit the original write request. 2187 + */ 2188 + if (!rbd_dev->parent_overlap) { 2189 + struct ceph_osd_client *osdc; 2190 + 2191 + ceph_release_page_vector(pages, page_count); 2192 + osdc = &rbd_dev->rbd_client->client->osdc; 2193 + img_result = rbd_obj_request_submit(osdc, orig_request); 2194 + if (!img_result) 2195 + return; 2196 + } 2197 + 2198 + if (img_result) 2347 2199 goto out_err; 2348 2200 2349 - /* Allocate the new copyup osd request for the original request */ 2350 - 2351 - result = -ENOMEM; 2352 - rbd_assert(!orig_request->osd_req); 2201 + /* 2202 + * The original osd request is of no use to use any more. 2203 + * We need a new one that can hold the two ops in a copyup 2204 + * request. Allocate the new copyup osd request for the 2205 + * original request, and release the old one. 2206 + */ 2207 + img_result = -ENOMEM; 2353 2208 osd_req = rbd_osd_req_create_copyup(orig_request); 2354 2209 if (!osd_req) 2355 2210 goto out_err; 2211 + rbd_osd_req_destroy(orig_request->osd_req); 2356 2212 orig_request->osd_req = osd_req; 2357 2213 orig_request->copyup_pages = pages; 2214 + orig_request->copyup_page_count = page_count; 2358 2215 2359 2216 /* Initialize the copyup op */ 2360 2217 2361 2218 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2362 - osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, 2219 + osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 2363 2220 false, false); 2364 2221 2365 2222 /* Then the original write request op */ 2366 2223 2224 + offset = orig_request->offset; 2225 + length = orig_request->length; 2367 2226 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 2368 - orig_request->offset, 2369 - orig_request->length, 0, 0); 2370 - osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, 2371 - orig_request->length); 2227 + offset, length, 0, 0); 2228 + if (orig_request->type == OBJ_REQUEST_BIO) 2229 + osd_req_op_extent_osd_data_bio(osd_req, 1, 2230 + orig_request->bio_list, length); 2231 + else 2232 + osd_req_op_extent_osd_data_pages(osd_req, 1, 2233 + orig_request->pages, length, 2234 + offset & ~PAGE_MASK, false, false); 2372 2235 2373 2236 rbd_osd_req_format_write(orig_request); 2374 2237 ··· 2404 2211 2405 2212 orig_request->callback = rbd_img_obj_copyup_callback; 2406 2213 osdc = &rbd_dev->rbd_client->client->osdc; 2407 - result = rbd_obj_request_submit(osdc, orig_request); 2408 - if (!result) 2214 + img_result = rbd_obj_request_submit(osdc, orig_request); 2215 + if (!img_result) 2409 2216 return; 2410 2217 out_err: 2411 2218 /* Record the error code and complete the request */ 2412 2219 
2413 - orig_request->result = result; 2220 + orig_request->result = img_result; 2414 2221 orig_request->xferred = 0; 2415 2222 obj_request_done_set(orig_request); 2416 2223 rbd_obj_request_complete(orig_request); ··· 2442 2249 int result; 2443 2250 2444 2251 rbd_assert(obj_request_img_data_test(obj_request)); 2445 - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 2252 + rbd_assert(obj_request_type_valid(obj_request->type)); 2446 2253 2447 2254 img_request = obj_request->img_request; 2448 2255 rbd_assert(img_request != NULL); 2449 2256 rbd_dev = img_request->rbd_dev; 2450 2257 rbd_assert(rbd_dev->parent != NULL); 2451 - 2452 - /* 2453 - * First things first. The original osd request is of no 2454 - * use to use any more, we'll need a new one that can hold 2455 - * the two ops in a copyup request. We'll get that later, 2456 - * but for now we can release the old one. 2457 - */ 2458 - rbd_osd_req_destroy(obj_request->osd_req); 2459 - obj_request->osd_req = NULL; 2460 2258 2461 2259 /* 2462 2260 * Determine the byte range covered by the object in the ··· 2479 2295 } 2480 2296 2481 2297 result = -ENOMEM; 2482 - parent_request = rbd_img_request_create(rbd_dev->parent, 2483 - img_offset, length, 2484 - false, true); 2298 + parent_request = rbd_parent_request_create(obj_request, 2299 + img_offset, length); 2485 2300 if (!parent_request) 2486 2301 goto out_err; 2487 - rbd_obj_request_get(obj_request); 2488 - parent_request->obj_request = obj_request; 2489 2302 2490 2303 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 2491 2304 if (result) 2492 2305 goto out_err; 2493 2306 parent_request->copyup_pages = pages; 2307 + parent_request->copyup_page_count = page_count; 2494 2308 2495 2309 parent_request->callback = rbd_img_obj_parent_read_full_callback; 2496 2310 result = rbd_img_request_submit(parent_request); ··· 2496 2314 return 0; 2497 2315 2498 2316 parent_request->copyup_pages = NULL; 2317 + parent_request->copyup_page_count = 0; 2499 2318 parent_request->obj_request = NULL; 2500 2319 rbd_obj_request_put(obj_request); 2501 2320 out_err: ··· 2514 2331 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2515 2332 { 2516 2333 struct rbd_obj_request *orig_request; 2334 + struct rbd_device *rbd_dev; 2517 2335 int result; 2518 2336 2519 2337 rbd_assert(!obj_request_img_data_test(obj_request)); ··· 2537 2353 obj_request->xferred, obj_request->length); 2538 2354 rbd_obj_request_put(obj_request); 2539 2355 2540 - rbd_assert(orig_request); 2541 - rbd_assert(orig_request->img_request); 2356 + /* 2357 + * If the overlap has become 0 (most likely because the 2358 + * image has been flattened) we need to free the pages 2359 + * and re-submit the original write request. 
2360 + */ 2361 + rbd_dev = orig_request->img_request->rbd_dev; 2362 + if (!rbd_dev->parent_overlap) { 2363 + struct ceph_osd_client *osdc; 2364 + 2365 + rbd_obj_request_put(orig_request); 2366 + osdc = &rbd_dev->rbd_client->client->osdc; 2367 + result = rbd_obj_request_submit(osdc, orig_request); 2368 + if (!result) 2369 + return; 2370 + } 2542 2371 2543 2372 /* 2544 2373 * Our only purpose here is to determine whether the object ··· 2709 2512 struct rbd_obj_request *obj_request; 2710 2513 struct rbd_device *rbd_dev; 2711 2514 u64 obj_end; 2515 + u64 img_xferred; 2516 + int img_result; 2712 2517 2713 2518 rbd_assert(img_request_child_test(img_request)); 2714 2519 2520 + /* First get what we need from the image request and release it */ 2521 + 2715 2522 obj_request = img_request->obj_request; 2523 + img_xferred = img_request->xferred; 2524 + img_result = img_request->result; 2525 + rbd_img_request_put(img_request); 2526 + 2527 + /* 2528 + * If the overlap has become 0 (most likely because the 2529 + * image has been flattened) we need to re-submit the 2530 + * original request. 2531 + */ 2716 2532 rbd_assert(obj_request); 2717 2533 rbd_assert(obj_request->img_request); 2534 + rbd_dev = obj_request->img_request->rbd_dev; 2535 + if (!rbd_dev->parent_overlap) { 2536 + struct ceph_osd_client *osdc; 2718 2537 2719 - obj_request->result = img_request->result; 2538 + osdc = &rbd_dev->rbd_client->client->osdc; 2539 + img_result = rbd_obj_request_submit(osdc, obj_request); 2540 + if (!img_result) 2541 + return; 2542 + } 2543 + 2544 + obj_request->result = img_result; 2720 2545 if (obj_request->result) 2721 2546 goto out; 2722 2547 ··· 2751 2532 */ 2752 2533 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2753 2534 obj_end = obj_request->img_offset + obj_request->length; 2754 - rbd_dev = obj_request->img_request->rbd_dev; 2755 2535 if (obj_end > rbd_dev->parent_overlap) { 2756 2536 u64 xferred = 0; 2757 2537 ··· 2758 2540 xferred = rbd_dev->parent_overlap - 2759 2541 obj_request->img_offset; 2760 2542 2761 - obj_request->xferred = min(img_request->xferred, xferred); 2543 + obj_request->xferred = min(img_xferred, xferred); 2762 2544 } else { 2763 - obj_request->xferred = img_request->xferred; 2545 + obj_request->xferred = img_xferred; 2764 2546 } 2765 2547 out: 2766 - rbd_img_request_put(img_request); 2767 2548 rbd_img_obj_request_read_callback(obj_request); 2768 2549 rbd_obj_request_complete(obj_request); 2769 2550 } 2770 2551 2771 2552 static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 2772 2553 { 2773 - struct rbd_device *rbd_dev; 2774 2554 struct rbd_img_request *img_request; 2775 2555 int result; 2776 2556 2777 2557 rbd_assert(obj_request_img_data_test(obj_request)); 2778 2558 rbd_assert(obj_request->img_request != NULL); 2779 2559 rbd_assert(obj_request->result == (s32) -ENOENT); 2780 - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 2560 + rbd_assert(obj_request_type_valid(obj_request->type)); 2781 2561 2782 - rbd_dev = obj_request->img_request->rbd_dev; 2783 - rbd_assert(rbd_dev->parent != NULL); 2784 2562 /* rbd_read_finish(obj_request, obj_request->length); */ 2785 - img_request = rbd_img_request_create(rbd_dev->parent, 2563 + img_request = rbd_parent_request_create(obj_request, 2786 2564 obj_request->img_offset, 2787 - obj_request->length, 2788 - false, true); 2565 + obj_request->length); 2789 2566 result = -ENOMEM; 2790 2567 if (!img_request) 2791 2568 goto out_err; 2792 2569 2793 - rbd_obj_request_get(obj_request); 2794 - img_request->obj_request = 
obj_request; 2795 - 2796 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2797 - obj_request->bio_list); 2570 + if (obj_request->type == OBJ_REQUEST_BIO) 2571 + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2572 + obj_request->bio_list); 2573 + else 2574 + result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 2575 + obj_request->pages); 2798 2576 if (result) 2799 2577 goto out_err; 2800 2578 ··· 2840 2626 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2841 2627 { 2842 2628 struct rbd_device *rbd_dev = (struct rbd_device *)data; 2629 + int ret; 2843 2630 2844 2631 if (!rbd_dev) 2845 2632 return; ··· 2848 2633 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2849 2634 rbd_dev->header_name, (unsigned long long)notify_id, 2850 2635 (unsigned int)opcode); 2851 - (void)rbd_dev_refresh(rbd_dev); 2636 + ret = rbd_dev_refresh(rbd_dev); 2637 + if (ret) 2638 + rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); 2852 2639 2853 2640 rbd_obj_notify_ack(rbd_dev, notify_id); 2854 2641 } ··· 2859 2642 * Request sync osd watch/unwatch. The value of "start" determines 2860 2643 * whether a watch request is being initiated or torn down. 2861 2644 */ 2862 - static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 2645 + static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2863 2646 { 2864 2647 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2865 2648 struct rbd_obj_request *obj_request; ··· 2893 2676 rbd_dev->watch_request->osd_req); 2894 2677 2895 2678 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2896 - rbd_dev->watch_event->cookie, 0, start); 2679 + rbd_dev->watch_event->cookie, 0, start ? 1 : 0); 2897 2680 rbd_osd_req_format_write(obj_request); 2898 2681 2899 2682 ret = rbd_obj_request_submit(osdc, obj_request); ··· 3086 2869 goto end_request; /* Shouldn't happen */ 3087 2870 } 3088 2871 2872 + result = -EIO; 2873 + if (offset + length > rbd_dev->mapping.size) { 2874 + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 2875 + offset, length, rbd_dev->mapping.size); 2876 + goto end_request; 2877 + } 2878 + 3089 2879 result = -ENOMEM; 3090 2880 img_request = rbd_img_request_create(rbd_dev, offset, length, 3091 - write_request, false); 2881 + write_request); 3092 2882 if (!img_request) 3093 2883 goto end_request; 3094 2884 ··· 3246 3022 } 3247 3023 3248 3024 /* 3249 - * Read the complete header for the given rbd device. 3250 - * 3251 - * Returns a pointer to a dynamically-allocated buffer containing 3252 - * the complete and validated header. Caller can pass the address 3253 - * of a variable that will be filled in with the version of the 3254 - * header object at the time it was read. 3255 - * 3256 - * Returns a pointer-coded errno if a failure occurs. 3025 + * Read the complete header for the given rbd device. On successful 3026 + * return, the rbd_dev->header field will contain up-to-date 3027 + * information about the image. 
3257 3028 */ 3258 - static struct rbd_image_header_ondisk * 3259 - rbd_dev_v1_header_read(struct rbd_device *rbd_dev) 3029 + static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 3260 3030 { 3261 3031 struct rbd_image_header_ondisk *ondisk = NULL; 3262 3032 u32 snap_count = 0; ··· 3275 3057 size += names_size; 3276 3058 ondisk = kmalloc(size, GFP_KERNEL); 3277 3059 if (!ondisk) 3278 - return ERR_PTR(-ENOMEM); 3060 + return -ENOMEM; 3279 3061 3280 3062 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 3281 3063 0, size, ondisk); 3282 3064 if (ret < 0) 3283 - goto out_err; 3065 + goto out; 3284 3066 if ((size_t)ret < size) { 3285 3067 ret = -ENXIO; 3286 3068 rbd_warn(rbd_dev, "short header read (want %zd got %d)", 3287 3069 size, ret); 3288 - goto out_err; 3070 + goto out; 3289 3071 } 3290 3072 if (!rbd_dev_ondisk_valid(ondisk)) { 3291 3073 ret = -ENXIO; 3292 3074 rbd_warn(rbd_dev, "invalid header"); 3293 - goto out_err; 3075 + goto out; 3294 3076 } 3295 3077 3296 3078 names_size = le64_to_cpu(ondisk->snap_names_len); ··· 3298 3080 snap_count = le32_to_cpu(ondisk->snap_count); 3299 3081 } while (snap_count != want_count); 3300 3082 3301 - return ondisk; 3302 - 3303 - out_err: 3083 + ret = rbd_header_from_disk(rbd_dev, ondisk); 3084 + out: 3304 3085 kfree(ondisk); 3305 - 3306 - return ERR_PTR(ret); 3307 - } 3308 - 3309 - /* 3310 - * reload the ondisk the header 3311 - */ 3312 - static int rbd_read_header(struct rbd_device *rbd_dev, 3313 - struct rbd_image_header *header) 3314 - { 3315 - struct rbd_image_header_ondisk *ondisk; 3316 - int ret; 3317 - 3318 - ondisk = rbd_dev_v1_header_read(rbd_dev); 3319 - if (IS_ERR(ondisk)) 3320 - return PTR_ERR(ondisk); 3321 - ret = rbd_header_from_disk(header, ondisk); 3322 - kfree(ondisk); 3323 - 3324 - return ret; 3325 - } 3326 - 3327 - static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 3328 - { 3329 - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 3330 - return; 3331 - 3332 - if (rbd_dev->mapping.size != rbd_dev->header.image_size) { 3333 - sector_t size; 3334 - 3335 - rbd_dev->mapping.size = rbd_dev->header.image_size; 3336 - size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 3337 - dout("setting size to %llu sectors", (unsigned long long)size); 3338 - set_capacity(rbd_dev->disk, size); 3339 - } 3340 - } 3341 - 3342 - /* 3343 - * only read the first part of the ondisk header, without the snaps info 3344 - */ 3345 - static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) 3346 - { 3347 - int ret; 3348 - struct rbd_image_header h; 3349 - 3350 - ret = rbd_read_header(rbd_dev, &h); 3351 - if (ret < 0) 3352 - return ret; 3353 - 3354 - down_write(&rbd_dev->header_rwsem); 3355 - 3356 - /* Update image size, and check for resize of mapped image */ 3357 - rbd_dev->header.image_size = h.image_size; 3358 - rbd_update_mapping_size(rbd_dev); 3359 - 3360 - /* rbd_dev->header.object_prefix shouldn't change */ 3361 - kfree(rbd_dev->header.snap_sizes); 3362 - kfree(rbd_dev->header.snap_names); 3363 - /* osd requests may still refer to snapc */ 3364 - ceph_put_snap_context(rbd_dev->header.snapc); 3365 - 3366 - rbd_dev->header.image_size = h.image_size; 3367 - rbd_dev->header.snapc = h.snapc; 3368 - rbd_dev->header.snap_names = h.snap_names; 3369 - rbd_dev->header.snap_sizes = h.snap_sizes; 3370 - /* Free the extra copy of the object prefix */ 3371 - if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) 3372 - rbd_warn(rbd_dev, "object prefix changed (ignoring)"); 3373 - kfree(h.object_prefix); 3374 - 3375 - up_write(&rbd_dev->header_rwsem); 
3376 3086 3377 3087 return ret; 3378 3088 } ··· 3326 3180 3327 3181 static int rbd_dev_refresh(struct rbd_device *rbd_dev) 3328 3182 { 3329 - u64 image_size; 3183 + u64 mapping_size; 3330 3184 int ret; 3331 3185 3332 3186 rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3333 - image_size = rbd_dev->header.image_size; 3187 + mapping_size = rbd_dev->mapping.size; 3334 3188 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3335 3189 if (rbd_dev->image_format == 1) 3336 - ret = rbd_dev_v1_refresh(rbd_dev); 3190 + ret = rbd_dev_v1_header_info(rbd_dev); 3337 3191 else 3338 - ret = rbd_dev_v2_refresh(rbd_dev); 3192 + ret = rbd_dev_v2_header_info(rbd_dev); 3339 3193 3340 3194 /* If it's a mapped snapshot, validate its EXISTS flag */ 3341 3195 3342 3196 rbd_exists_validate(rbd_dev); 3343 3197 mutex_unlock(&ctl_mutex); 3344 - if (ret) 3345 - rbd_warn(rbd_dev, "got notification but failed to " 3346 - " update snaps: %d\n", ret); 3347 - if (image_size != rbd_dev->header.image_size) 3198 + if (mapping_size != rbd_dev->mapping.size) { 3199 + sector_t size; 3200 + 3201 + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 3202 + dout("setting size to %llu sectors", (unsigned long long)size); 3203 + set_capacity(rbd_dev->disk, size); 3348 3204 revalidate_disk(rbd_dev->disk); 3205 + } 3349 3206 3350 3207 return ret; 3351 3208 } ··· 3552 3403 int ret; 3553 3404 3554 3405 ret = rbd_dev_refresh(rbd_dev); 3406 + if (ret) 3407 + rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3555 3408 3556 3409 return ret < 0 ? ret : size; 3557 3410 } ··· 3652 3501 3653 3502 spin_lock_init(&rbd_dev->lock); 3654 3503 rbd_dev->flags = 0; 3504 + atomic_set(&rbd_dev->parent_ref, 0); 3655 3505 INIT_LIST_HEAD(&rbd_dev->node); 3656 3506 init_rwsem(&rbd_dev->header_rwsem); 3657 3507 ··· 3802 3650 __le64 snapid; 3803 3651 void *p; 3804 3652 void *end; 3653 + u64 pool_id; 3805 3654 char *image_id; 3806 3655 u64 overlap; 3807 3656 int ret; ··· 3833 3680 p = reply_buf; 3834 3681 end = reply_buf + ret; 3835 3682 ret = -ERANGE; 3836 - ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 3837 - if (parent_spec->pool_id == CEPH_NOPOOL) 3683 + ceph_decode_64_safe(&p, end, pool_id, out_err); 3684 + if (pool_id == CEPH_NOPOOL) { 3685 + /* 3686 + * Either the parent never existed, or we have 3687 + * record of it but the image got flattened so it no 3688 + * longer has a parent. When the parent of a 3689 + * layered image disappears we immediately set the 3690 + * overlap to 0. The effect of this is that all new 3691 + * requests will be treated as if the image had no 3692 + * parent. 3693 + */ 3694 + if (rbd_dev->parent_overlap) { 3695 + rbd_dev->parent_overlap = 0; 3696 + smp_mb(); 3697 + rbd_dev_parent_put(rbd_dev); 3698 + pr_info("%s: clone image has been flattened\n", 3699 + rbd_dev->disk->disk_name); 3700 + } 3701 + 3838 3702 goto out; /* No parent? No problem. 
*/ 3703 + } 3839 3704 3840 3705 /* The ceph file layout needs to fit pool id in 32 bits */ 3841 3706 3842 3707 ret = -EIO; 3843 - if (parent_spec->pool_id > (u64)U32_MAX) { 3708 + if (pool_id > (u64)U32_MAX) { 3844 3709 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 3845 - (unsigned long long)parent_spec->pool_id, U32_MAX); 3710 + (unsigned long long)pool_id, U32_MAX); 3846 3711 goto out_err; 3847 3712 } 3713 + parent_spec->pool_id = pool_id; 3848 3714 3849 3715 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3850 3716 if (IS_ERR(image_id)) { ··· 3874 3702 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 3875 3703 ceph_decode_64_safe(&p, end, overlap, out_err); 3876 3704 3877 - rbd_dev->parent_overlap = overlap; 3878 - rbd_dev->parent_spec = parent_spec; 3879 - parent_spec = NULL; /* rbd_dev now owns this */ 3705 + if (overlap) { 3706 + rbd_spec_put(rbd_dev->parent_spec); 3707 + rbd_dev->parent_spec = parent_spec; 3708 + parent_spec = NULL; /* rbd_dev now owns this */ 3709 + rbd_dev->parent_overlap = overlap; 3710 + } else { 3711 + rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n"); 3712 + } 3880 3713 out: 3881 3714 ret = 0; 3882 3715 out_err: ··· 4179 4002 for (i = 0; i < snap_count; i++) 4180 4003 snapc->snaps[i] = ceph_decode_64(&p); 4181 4004 4005 + ceph_put_snap_context(rbd_dev->header.snapc); 4182 4006 rbd_dev->header.snapc = snapc; 4183 4007 4184 4008 dout(" snap context seq = %llu, snap_count = %u\n", ··· 4231 4053 return snap_name; 4232 4054 } 4233 4055 4234 - static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) 4056 + static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4235 4057 { 4058 + bool first_time = rbd_dev->header.object_prefix == NULL; 4236 4059 int ret; 4237 4060 4238 4061 down_write(&rbd_dev->header_rwsem); 4239 4062 4063 + if (first_time) { 4064 + ret = rbd_dev_v2_header_onetime(rbd_dev); 4065 + if (ret) 4066 + goto out; 4067 + } 4068 + 4069 + /* 4070 + * If the image supports layering, get the parent info. We 4071 + * need to probe the first time regardless. Thereafter we 4072 + * only need to if there's a parent, to see if it has 4073 + * disappeared due to the mapped image getting flattened. 4074 + */ 4075 + if (rbd_dev->header.features & RBD_FEATURE_LAYERING && 4076 + (first_time || rbd_dev->parent_spec)) { 4077 + bool warn; 4078 + 4079 + ret = rbd_dev_v2_parent_info(rbd_dev); 4080 + if (ret) 4081 + goto out; 4082 + 4083 + /* 4084 + * Print a warning if this is the initial probe and 4085 + * the image has a parent. Don't print it if the 4086 + * image now being probed is itself a parent. We 4087 + * can tell at this point because we won't know its 4088 + * pool name yet (just its pool id). 
4089 + */ 4090 + warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; 4091 + if (first_time && warn) 4092 + rbd_warn(rbd_dev, "WARNING: kernel layering " 4093 + "is EXPERIMENTAL!"); 4094 + } 4095 + 4240 4096 ret = rbd_dev_v2_image_size(rbd_dev); 4241 4097 if (ret) 4242 4098 goto out; 4243 - rbd_update_mapping_size(rbd_dev); 4099 + 4100 + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 4101 + if (rbd_dev->mapping.size != rbd_dev->header.image_size) 4102 + rbd_dev->mapping.size = rbd_dev->header.image_size; 4244 4103 4245 4104 ret = rbd_dev_v2_snap_context(rbd_dev); 4246 4105 dout("rbd_dev_v2_snap_context returned %d\n", ret); 4247 - if (ret) 4248 - goto out; 4249 4106 out: 4250 4107 up_write(&rbd_dev->header_rwsem); 4251 4108 ··· 4703 4490 { 4704 4491 struct rbd_image_header *header; 4705 4492 4706 - rbd_dev_remove_parent(rbd_dev); 4707 - rbd_spec_put(rbd_dev->parent_spec); 4708 - rbd_dev->parent_spec = NULL; 4709 - rbd_dev->parent_overlap = 0; 4493 + /* Drop parent reference unless it's already been done (or none) */ 4494 + 4495 + if (rbd_dev->parent_overlap) 4496 + rbd_dev_parent_put(rbd_dev); 4710 4497 4711 4498 /* Free dynamic fields from the header, then zero it out */ 4712 4499 ··· 4718 4505 memset(header, 0, sizeof (*header)); 4719 4506 } 4720 4507 4721 - static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 4508 + static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 4722 4509 { 4723 4510 int ret; 4724 - 4725 - /* Populate rbd image metadata */ 4726 - 4727 - ret = rbd_read_header(rbd_dev, &rbd_dev->header); 4728 - if (ret < 0) 4729 - goto out_err; 4730 - 4731 - /* Version 1 images have no parent (no layering) */ 4732 - 4733 - rbd_dev->parent_spec = NULL; 4734 - rbd_dev->parent_overlap = 0; 4735 - 4736 - dout("discovered version 1 image, header name is %s\n", 4737 - rbd_dev->header_name); 4738 - 4739 - return 0; 4740 - 4741 - out_err: 4742 - kfree(rbd_dev->header_name); 4743 - rbd_dev->header_name = NULL; 4744 - kfree(rbd_dev->spec->image_id); 4745 - rbd_dev->spec->image_id = NULL; 4746 - 4747 - return ret; 4748 - } 4749 - 4750 - static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4751 - { 4752 - int ret; 4753 - 4754 - ret = rbd_dev_v2_image_size(rbd_dev); 4755 - if (ret) 4756 - goto out_err; 4757 - 4758 - /* Get the object prefix (a.k.a. block_name) for the image */ 4759 4511 4760 4512 ret = rbd_dev_v2_object_prefix(rbd_dev); 4761 4513 if (ret) 4762 4514 goto out_err; 4763 4515 4764 - /* Get the and check features for the image */ 4765 - 4516 + /* 4517 + * Get the and check features for the image. Currently the 4518 + * features are assumed to never change. 4519 + */ 4766 4520 ret = rbd_dev_v2_features(rbd_dev); 4767 4521 if (ret) 4768 4522 goto out_err; 4769 - 4770 - /* If the image supports layering, get the parent info */ 4771 - 4772 - if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 4773 - ret = rbd_dev_v2_parent_info(rbd_dev); 4774 - if (ret) 4775 - goto out_err; 4776 - 4777 - /* 4778 - * Don't print a warning for parent images. We can 4779 - * tell this point because we won't know its pool 4780 - * name yet (just its pool id). 
4781 - */ 4782 - if (rbd_dev->spec->pool_name) 4783 - rbd_warn(rbd_dev, "WARNING: kernel layering " 4784 - "is EXPERIMENTAL!"); 4785 - } 4786 4523 4787 4524 /* If the image supports fancy striping, get its parameters */ 4788 4525 ··· 4741 4578 if (ret < 0) 4742 4579 goto out_err; 4743 4580 } 4744 - 4745 - /* crypto and compression type aren't (yet) supported for v2 images */ 4746 - 4747 - rbd_dev->header.crypt_type = 0; 4748 - rbd_dev->header.comp_type = 0; 4749 - 4750 - /* Get the snapshot context, plus the header version */ 4751 - 4752 - ret = rbd_dev_v2_snap_context(rbd_dev); 4753 - if (ret) 4754 - goto out_err; 4755 - 4756 - dout("discovered version 2 image, header name is %s\n", 4757 - rbd_dev->header_name); 4581 + /* No support for crypto and compression type format 2 images */ 4758 4582 4759 4583 return 0; 4760 4584 out_err: 4761 - rbd_dev->parent_overlap = 0; 4762 - rbd_spec_put(rbd_dev->parent_spec); 4763 - rbd_dev->parent_spec = NULL; 4764 - kfree(rbd_dev->header_name); 4765 - rbd_dev->header_name = NULL; 4585 + rbd_dev->header.features = 0; 4766 4586 kfree(rbd_dev->header.object_prefix); 4767 4587 rbd_dev->header.object_prefix = NULL; 4768 4588 ··· 4774 4628 if (!parent) 4775 4629 goto out_err; 4776 4630 4777 - ret = rbd_dev_image_probe(parent); 4631 + ret = rbd_dev_image_probe(parent, false); 4778 4632 if (ret < 0) 4779 4633 goto out_err; 4780 4634 rbd_dev->parent = parent; 4635 + atomic_set(&rbd_dev->parent_ref, 1); 4781 4636 4782 4637 return 0; 4783 4638 out_err: 4784 4639 if (parent) { 4785 - rbd_spec_put(rbd_dev->parent_spec); 4640 + rbd_dev_unparent(rbd_dev); 4786 4641 kfree(rbd_dev->header_name); 4787 4642 rbd_dev_destroy(parent); 4788 4643 } else { ··· 4797 4650 static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 4798 4651 { 4799 4652 int ret; 4800 - 4801 - ret = rbd_dev_mapping_set(rbd_dev); 4802 - if (ret) 4803 - return ret; 4804 4653 4805 4654 /* generate unique id: find highest unique id, add one */ 4806 4655 rbd_dev_id_get(rbd_dev); ··· 4819 4676 if (ret) 4820 4677 goto err_out_blkdev; 4821 4678 4822 - ret = rbd_bus_add_dev(rbd_dev); 4679 + ret = rbd_dev_mapping_set(rbd_dev); 4823 4680 if (ret) 4824 4681 goto err_out_disk; 4682 + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4683 + 4684 + ret = rbd_bus_add_dev(rbd_dev); 4685 + if (ret) 4686 + goto err_out_mapping; 4825 4687 4826 4688 /* Everything's ready. Announce the disk to the world. */ 4827 4689 4828 - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4829 4690 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 4830 4691 add_disk(rbd_dev->disk); 4831 4692 ··· 4838 4691 4839 4692 return ret; 4840 4693 4694 + err_out_mapping: 4695 + rbd_dev_mapping_clear(rbd_dev); 4841 4696 err_out_disk: 4842 4697 rbd_free_disk(rbd_dev); 4843 4698 err_out_blkdev: ··· 4880 4731 4881 4732 static void rbd_dev_image_release(struct rbd_device *rbd_dev) 4882 4733 { 4883 - int ret; 4884 - 4885 4734 rbd_dev_unprobe(rbd_dev); 4886 - ret = rbd_dev_header_watch_sync(rbd_dev, 0); 4887 - if (ret) 4888 - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 4889 4735 kfree(rbd_dev->header_name); 4890 4736 rbd_dev->header_name = NULL; 4891 4737 rbd_dev->image_format = 0; ··· 4892 4748 4893 4749 /* 4894 4750 * Probe for the existence of the header object for the given rbd 4895 - * device. For format 2 images this includes determining the image 4896 - * id. 4751 + * device. 
If this image is the one being mapped (i.e., not a 4752 + * parent), initiate a watch on its header object before using that 4753 + * object to get detailed information about the rbd image. 4897 4754 */ 4898 - static int rbd_dev_image_probe(struct rbd_device *rbd_dev) 4755 + static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 4899 4756 { 4900 4757 int ret; 4901 4758 int tmp; ··· 4916 4771 if (ret) 4917 4772 goto err_out_format; 4918 4773 4919 - ret = rbd_dev_header_watch_sync(rbd_dev, 1); 4920 - if (ret) 4921 - goto out_header_name; 4774 + if (mapping) { 4775 + ret = rbd_dev_header_watch_sync(rbd_dev, true); 4776 + if (ret) 4777 + goto out_header_name; 4778 + } 4922 4779 4923 4780 if (rbd_dev->image_format == 1) 4924 - ret = rbd_dev_v1_probe(rbd_dev); 4781 + ret = rbd_dev_v1_header_info(rbd_dev); 4925 4782 else 4926 - ret = rbd_dev_v2_probe(rbd_dev); 4783 + ret = rbd_dev_v2_header_info(rbd_dev); 4927 4784 if (ret) 4928 4785 goto err_out_watch; 4929 4786 ··· 4934 4787 goto err_out_probe; 4935 4788 4936 4789 ret = rbd_dev_probe_parent(rbd_dev); 4937 - if (!ret) 4938 - return 0; 4790 + if (ret) 4791 + goto err_out_probe; 4939 4792 4793 + dout("discovered format %u image, header name is %s\n", 4794 + rbd_dev->image_format, rbd_dev->header_name); 4795 + 4796 + return 0; 4940 4797 err_out_probe: 4941 4798 rbd_dev_unprobe(rbd_dev); 4942 4799 err_out_watch: 4943 - tmp = rbd_dev_header_watch_sync(rbd_dev, 0); 4944 - if (tmp) 4945 - rbd_warn(rbd_dev, "unable to tear down watch request\n"); 4800 + if (mapping) { 4801 + tmp = rbd_dev_header_watch_sync(rbd_dev, false); 4802 + if (tmp) 4803 + rbd_warn(rbd_dev, "unable to tear down " 4804 + "watch request (%d)\n", tmp); 4805 + } 4946 4806 out_header_name: 4947 4807 kfree(rbd_dev->header_name); 4948 4808 rbd_dev->header_name = NULL; ··· 4973 4819 struct rbd_spec *spec = NULL; 4974 4820 struct rbd_client *rbdc; 4975 4821 struct ceph_osd_client *osdc; 4822 + bool read_only; 4976 4823 int rc = -ENOMEM; 4977 4824 4978 4825 if (!try_module_get(THIS_MODULE)) ··· 4983 4828 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4984 4829 if (rc < 0) 4985 4830 goto err_out_module; 4831 + read_only = rbd_opts->read_only; 4832 + kfree(rbd_opts); 4833 + rbd_opts = NULL; /* done with this */ 4986 4834 4987 4835 rbdc = rbd_get_client(ceph_opts); 4988 4836 if (IS_ERR(rbdc)) { ··· 5016 4858 rbdc = NULL; /* rbd_dev now owns this */ 5017 4859 spec = NULL; /* rbd_dev now owns this */ 5018 4860 5019 - rbd_dev->mapping.read_only = rbd_opts->read_only; 5020 - kfree(rbd_opts); 5021 - rbd_opts = NULL; /* done with this */ 5022 - 5023 - rc = rbd_dev_image_probe(rbd_dev); 4861 + rc = rbd_dev_image_probe(rbd_dev, true); 5024 4862 if (rc < 0) 5025 4863 goto err_out_rbd_dev; 4864 + 4865 + /* If we are mapping a snapshot it must be marked read-only */ 4866 + 4867 + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 4868 + read_only = true; 4869 + rbd_dev->mapping.read_only = read_only; 5026 4870 5027 4871 rc = rbd_dev_device_setup(rbd_dev); 5028 4872 if (!rc) ··· 5071 4911 5072 4912 rbd_free_disk(rbd_dev); 5073 4913 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5074 - rbd_dev_clear_mapping(rbd_dev); 4914 + rbd_dev_mapping_clear(rbd_dev); 5075 4915 unregister_blkdev(rbd_dev->major, rbd_dev->name); 5076 4916 rbd_dev->major = 0; 5077 4917 rbd_dev_id_put(rbd_dev); ··· 5138 4978 spin_unlock_irq(&rbd_dev->lock); 5139 4979 if (ret < 0) 5140 4980 goto done; 5141 - ret = count; 5142 4981 rbd_bus_del_dev(rbd_dev); 4982 + ret = rbd_dev_header_watch_sync(rbd_dev, false); 4983 
+ if (ret) 4984 + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 5143 4985 rbd_dev_image_release(rbd_dev); 5144 4986 module_put(THIS_MODULE); 4987 + ret = count; 5145 4988 done: 5146 4989 mutex_unlock(&ctl_mutex); 5147 4990
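
The parent_ref counting visible in the rbd.c diff above follows a take-the-reference-first, then-check-the-overlap ordering, so a concurrent flatten (which zeroes the overlap and drops the initial reference) cannot free the parent fields under an in-flight request; the parent is torn down only when the last reference goes away. Below is a simplified standalone model of that ordering; the struct and helper names are invented and the bookkeeping is much cruder than the kernel's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified standalone model.  Only the ordering rule is taken from
 * the patch: take a reference, then check the overlap, and tear the
 * parent down only when the last reference is dropped.
 */
struct toy_dev {
        atomic_int parent_ref;                  /* 1 while a parent is attached */
        _Atomic unsigned long long parent_overlap;
};

static void toy_unparent(struct toy_dev *d)
{
        (void)d;                                /* nothing to free in this toy */
        printf("parent data structures released\n");
}

static void toy_parent_put(struct toy_dev *d)
{
        if (atomic_fetch_sub(&d->parent_ref, 1) == 1)
                toy_unparent(d);                /* last reference dropped */
}

/* Take a reference; report whether the parent is still worth using. */
static bool toy_parent_get(struct toy_dev *d)
{
        atomic_fetch_add(&d->parent_ref, 1);
        if (atomic_load(&d->parent_overlap))
                return true;
        toy_parent_put(d);                      /* flattened: give it back */
        return false;
}

/* Flatten: new requests stop using the parent at once; the parent data
 * itself goes away only when in-flight users drop their references. */
static void toy_flatten(struct toy_dev *d)
{
        atomic_store(&d->parent_overlap, 0);
        toy_parent_put(d);                      /* drop the initial reference */
}

int main(void)
{
        struct toy_dev d = { .parent_ref = 1, .parent_overlap = 4ULL << 20 };

        bool held = toy_parent_get(&d);         /* request starts; overlap != 0 */
        toy_flatten(&d);                        /* image flattened underneath us */
        if (held)
                toy_parent_put(&d);             /* request done; parent now freed */
        return 0;
}
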
net/ceph/osd_client.c (+4 -1)
··· 1204 1204 mutex_lock(&osdc->request_mutex); 1205 1205 if (req->r_linger) { 1206 1206 __unregister_linger_request(osdc, req); 1207 + req->r_linger = 0; 1207 1208 ceph_osdc_put_request(req); 1208 1209 } 1209 1210 mutex_unlock(&osdc->request_mutex); ··· 2121 2120 down_read(&osdc->map_sem); 2122 2121 mutex_lock(&osdc->request_mutex); 2123 2122 __register_request(osdc, req); 2124 - WARN_ON(req->r_sent); 2123 + req->r_sent = 0; 2124 + req->r_got_reply = 0; 2125 + req->r_completed = 0; 2125 2126 rc = __map_request(osdc, req, 0); 2126 2127 if (rc < 0) { 2127 2128 if (nofail) {
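
The osd_client.c hunk pairs with the re-submission logic in rbd.c: a request may now legitimately be started more than once, so its per-send bookkeeping (r_sent, r_got_reply, r_completed) is reset on every start instead of being asserted clean via WARN_ON(req->r_sent), and the linger flag is cleared once the linger registration is dropped. A toy illustration of the reset-on-start principle, using an invented stand-in struct:

#include <stdio.h>

/*
 * Toy illustration only: the struct below is an invented stand-in, not
 * ceph_osd_request.  The point mirrored from the hunk above is that a
 * request which can be submitted more than once must have its per-send
 * bookkeeping cleared at every start, rather than asserted to be clean.
 */
struct toy_osd_request {
        int r_linger;
        int r_sent;
        int r_got_reply;
        int r_completed;
};

static void toy_start_request(struct toy_osd_request *req)
{
        req->r_sent = 0;        /* was: WARN_ON(req->r_sent) */
        req->r_got_reply = 0;
        req->r_completed = 0;
        /* ... map and send the request ... */
}

int main(void)
{
        struct toy_osd_request req = { .r_sent = 1 };   /* pretend it ran before */

        toy_start_request(&req);                        /* safe to start again */
        printf("r_sent after restart: %d\n", req.r_sent);
        return 0;
}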