Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
"There is a lot of refactoring and hardening of the libceph and rbd
code here from Ilya that fixes various smaller bugs, and a few more
important fixes with clone overlap. The main fix is a critical change
to the request_fn handling so that it no longer sleeps, a problem that
was exposed by the recent mutex changes (which will also go to the 3.16
stable series).

Yan Zheng has several fixes in here for CephFS fixing ACL handling,
time stamps, and request resends when the MDS restarts.

Finally, there are a few cleanups from Himangi Saraogi based on
Coccinelle"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (39 commits)
libceph: set last_piece in ceph_msg_data_pages_cursor_init() correctly
rbd: remove extra newlines from rbd_warn() messages
rbd: allocate img_request with GFP_NOIO instead GFP_ATOMIC
rbd: rework rbd_request_fn()
ceph: fix kick_requests()
ceph: fix append mode write
ceph: fix sizeof(struct tYpO *) typo
ceph: remove redundant memset(0)
rbd: take snap_id into account when reading in parent info
rbd: do not read in parent info before snap context
rbd: update mapping size only on refresh
rbd: harden rbd_dev_refresh() and callers a bit
rbd: split rbd_dev_spec_update() into two functions
rbd: remove unnecessary asserts in rbd_dev_image_probe()
rbd: introduce rbd_dev_header_info()
rbd: show the entire chain of parent images
ceph: replace comma with a semicolon
rbd: use rbd_segment_name_free() instead of kfree()
ceph: check zero length in ceph_sync_read()
ceph: reset r_resend_mds after receiving -ESTALE
...

+553 -438
+2 -2
Documentation/ABI/testing/sysfs-bus-rbd
··· 94 94 95 95 parent 96 96 97 - Information identifying the pool, image, and snapshot id for 98 - the parent image in a layered rbd image (format 2 only). 97 + Information identifying the chain of parent images in a layered rbd 98 + image. Entries are separated by empty lines.
+380 -339
drivers/block/rbd.c
··· 42 42 #include <linux/blkdev.h> 43 43 #include <linux/slab.h> 44 44 #include <linux/idr.h> 45 + #include <linux/workqueue.h> 45 46 46 47 #include "rbd_types.h" 47 48 ··· 333 332 334 333 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 335 334 335 + struct list_head rq_queue; /* incoming rq queue */ 336 336 spinlock_t lock; /* queue, flags, open_count */ 337 + struct workqueue_struct *rq_wq; 338 + struct work_struct rq_work; 337 339 338 340 struct rbd_image_header header; 339 341 unsigned long flags; /* possibly lock protected */ ··· 518 514 519 515 static int rbd_dev_refresh(struct rbd_device *rbd_dev); 520 516 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 521 - static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 517 + static int rbd_dev_header_info(struct rbd_device *rbd_dev); 518 + static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 522 519 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 523 520 u64 snap_id); 524 521 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ··· 976 971 header->snap_names = snap_names; 977 972 header->snap_sizes = snap_sizes; 978 973 979 - /* Make sure mapping size is consistent with header info */ 980 - 981 - if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 982 - if (rbd_dev->mapping.size != header->image_size) 983 - rbd_dev->mapping.size = header->image_size; 984 - 985 974 return 0; 986 975 out_2big: 987 976 ret = -EIO; ··· 1138 1139 rbd_dev->mapping.features = 0; 1139 1140 } 1140 1141 1142 + static void rbd_segment_name_free(const char *name) 1143 + { 1144 + /* The explicit cast here is needed to drop the const qualifier */ 1145 + 1146 + kmem_cache_free(rbd_segment_name_cache, (void *)name); 1147 + } 1148 + 1141 1149 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1142 1150 { 1143 1151 char *name; ··· 1164 1158 if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 1165 1159 pr_err("error formatting segment name for 
#%llu (%d)\n", 1166 1160 segment, ret); 1167 - kfree(name); 1161 + rbd_segment_name_free(name); 1168 1162 name = NULL; 1169 1163 } 1170 1164 1171 1165 return name; 1172 - } 1173 - 1174 - static void rbd_segment_name_free(const char *name) 1175 - { 1176 - /* The explicit cast here is needed to drop the const qualifier */ 1177 - 1178 - kmem_cache_free(rbd_segment_name_cache, (void *)name); 1179 1166 } 1180 1167 1181 1168 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) ··· 1370 1371 struct rbd_device *rbd_dev; 1371 1372 1372 1373 rbd_dev = obj_request->img_request->rbd_dev; 1373 - rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 1374 + rbd_warn(rbd_dev, "obj_request %p already marked img_data", 1374 1375 obj_request); 1375 1376 } 1376 1377 } ··· 1388 1389 1389 1390 if (obj_request_img_data_test(obj_request)) 1390 1391 rbd_dev = obj_request->img_request->rbd_dev; 1391 - rbd_warn(rbd_dev, "obj_request %p already marked done\n", 1392 + rbd_warn(rbd_dev, "obj_request %p already marked done", 1392 1393 obj_request); 1393 1394 } 1394 1395 } ··· 1526 1527 static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1527 1528 struct rbd_obj_request *obj_request) 1528 1529 { 1529 - dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 1530 - 1530 + dout("%s %p\n", __func__, obj_request); 1531 1531 return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1532 + } 1533 + 1534 + static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 1535 + { 1536 + dout("%s %p\n", __func__, obj_request); 1537 + ceph_osdc_cancel_request(obj_request->osd_req); 1538 + } 1539 + 1540 + /* 1541 + * Wait for an object request to complete. If interrupted, cancel the 1542 + * underlying osd request. 
1543 + */ 1544 + static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1545 + { 1546 + int ret; 1547 + 1548 + dout("%s %p\n", __func__, obj_request); 1549 + 1550 + ret = wait_for_completion_interruptible(&obj_request->completion); 1551 + if (ret < 0) { 1552 + dout("%s %p interrupted\n", __func__, obj_request); 1553 + rbd_obj_request_end(obj_request); 1554 + return ret; 1555 + } 1556 + 1557 + dout("%s %p done\n", __func__, obj_request); 1558 + return 0; 1532 1559 } 1533 1560 1534 1561 static void rbd_img_request_complete(struct rbd_img_request *img_request) ··· 1581 1556 img_request->callback(img_request); 1582 1557 else 1583 1558 rbd_img_request_put(img_request); 1584 - } 1585 - 1586 - /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1587 - 1588 - static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1589 - { 1590 - dout("%s: obj %p\n", __func__, obj_request); 1591 - 1592 - return wait_for_completion_interruptible(&obj_request->completion); 1593 1559 } 1594 1560 1595 1561 /* ··· 1779 1763 rbd_osd_trivial_callback(obj_request); 1780 1764 break; 1781 1765 default: 1782 - rbd_warn(NULL, "%s: unsupported op %hu\n", 1766 + rbd_warn(NULL, "%s: unsupported op %hu", 1783 1767 obj_request->object_name, (unsigned short) opcode); 1784 1768 break; 1785 1769 } ··· 2014 1998 if (!counter) 2015 1999 rbd_dev_unparent(rbd_dev); 2016 2000 else 2017 - rbd_warn(rbd_dev, "parent reference underflow\n"); 2001 + rbd_warn(rbd_dev, "parent reference underflow"); 2018 2002 } 2019 2003 2020 2004 /* ··· 2044 2028 /* Image was flattened, but parent is not yet torn down */ 2045 2029 2046 2030 if (counter < 0) 2047 - rbd_warn(rbd_dev, "parent reference overflow\n"); 2031 + rbd_warn(rbd_dev, "parent reference overflow"); 2048 2032 2049 2033 return false; 2050 2034 } ··· 2061 2045 { 2062 2046 struct rbd_img_request *img_request; 2063 2047 2064 - img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); 2048 + img_request = 
kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2065 2049 if (!img_request) 2066 2050 return NULL; 2067 2051 ··· 2177 2161 if (result) { 2178 2162 struct rbd_device *rbd_dev = img_request->rbd_dev; 2179 2163 2180 - rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 2164 + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 2181 2165 img_request_write_test(img_request) ? "write" : "read", 2182 2166 obj_request->length, obj_request->img_offset, 2183 2167 obj_request->offset); 2184 - rbd_warn(rbd_dev, " result %d xferred %x\n", 2168 + rbd_warn(rbd_dev, " result %d xferred %x", 2185 2169 result, xferred); 2186 2170 if (!img_request->result) 2187 2171 img_request->result = result; ··· 2962 2946 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2963 2947 rbd_dev->header_name, (unsigned long long)notify_id, 2964 2948 (unsigned int)opcode); 2949 + 2950 + /* 2951 + * Until adequate refresh error handling is in place, there is 2952 + * not much we can do here, except warn. 2953 + * 2954 + * See http://tracker.ceph.com/issues/5040 2955 + */ 2965 2956 ret = rbd_dev_refresh(rbd_dev); 2966 2957 if (ret) 2967 - rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); 2958 + rbd_warn(rbd_dev, "refresh failed: %d", ret); 2968 2959 2969 - rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2960 + ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2961 + if (ret) 2962 + rbd_warn(rbd_dev, "notify_ack ret %d", ret); 2963 + } 2964 + 2965 + /* 2966 + * Send a (un)watch request and wait for the ack. Return a request 2967 + * with a ref held on success or error. 
2968 + */ 2969 + static struct rbd_obj_request *rbd_obj_watch_request_helper( 2970 + struct rbd_device *rbd_dev, 2971 + bool watch) 2972 + { 2973 + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2974 + struct rbd_obj_request *obj_request; 2975 + int ret; 2976 + 2977 + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2978 + OBJ_REQUEST_NODATA); 2979 + if (!obj_request) 2980 + return ERR_PTR(-ENOMEM); 2981 + 2982 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2983 + obj_request); 2984 + if (!obj_request->osd_req) { 2985 + ret = -ENOMEM; 2986 + goto out; 2987 + } 2988 + 2989 + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2990 + rbd_dev->watch_event->cookie, 0, watch); 2991 + rbd_osd_req_format_write(obj_request); 2992 + 2993 + if (watch) 2994 + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 2995 + 2996 + ret = rbd_obj_request_submit(osdc, obj_request); 2997 + if (ret) 2998 + goto out; 2999 + 3000 + ret = rbd_obj_request_wait(obj_request); 3001 + if (ret) 3002 + goto out; 3003 + 3004 + ret = obj_request->result; 3005 + if (ret) { 3006 + if (watch) 3007 + rbd_obj_request_end(obj_request); 3008 + goto out; 3009 + } 3010 + 3011 + return obj_request; 3012 + 3013 + out: 3014 + rbd_obj_request_put(obj_request); 3015 + return ERR_PTR(ret); 2970 3016 } 2971 3017 2972 3018 /* ··· 3048 2970 if (ret < 0) 3049 2971 return ret; 3050 2972 3051 - rbd_assert(rbd_dev->watch_event); 3052 - 3053 - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3054 - OBJ_REQUEST_NODATA); 3055 - if (!obj_request) { 3056 - ret = -ENOMEM; 3057 - goto out_cancel; 2973 + obj_request = rbd_obj_watch_request_helper(rbd_dev, true); 2974 + if (IS_ERR(obj_request)) { 2975 + ceph_osdc_cancel_event(rbd_dev->watch_event); 2976 + rbd_dev->watch_event = NULL; 2977 + return PTR_ERR(obj_request); 3058 2978 } 3059 - 3060 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 3061 - obj_request); 3062 - if 
(!obj_request->osd_req) { 3063 - ret = -ENOMEM; 3064 - goto out_put; 3065 - } 3066 - 3067 - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 3068 - 3069 - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3070 - rbd_dev->watch_event->cookie, 0, 1); 3071 - rbd_osd_req_format_write(obj_request); 3072 - 3073 - ret = rbd_obj_request_submit(osdc, obj_request); 3074 - if (ret) 3075 - goto out_linger; 3076 - 3077 - ret = rbd_obj_request_wait(obj_request); 3078 - if (ret) 3079 - goto out_linger; 3080 - 3081 - ret = obj_request->result; 3082 - if (ret) 3083 - goto out_linger; 3084 2979 3085 2980 /* 3086 2981 * A watch request is set to linger, so the underlying osd 3087 2982 * request won't go away until we unregister it. We retain 3088 2983 * a pointer to the object request during that time (in 3089 - * rbd_dev->watch_request), so we'll keep a reference to 3090 - * it. We'll drop that reference (below) after we've 3091 - * unregistered it. 2984 + * rbd_dev->watch_request), so we'll keep a reference to it. 2985 + * We'll drop that reference after we've unregistered it in 2986 + * rbd_dev_header_unwatch_sync(). 3092 2987 */ 3093 2988 rbd_dev->watch_request = obj_request; 3094 2989 3095 2990 return 0; 3096 - 3097 - out_linger: 3098 - ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); 3099 - out_put: 3100 - rbd_obj_request_put(obj_request); 3101 - out_cancel: 3102 - ceph_osdc_cancel_event(rbd_dev->watch_event); 3103 - rbd_dev->watch_event = NULL; 3104 - 3105 - return ret; 3106 2991 } 3107 2992 3108 2993 /* 3109 2994 * Tear down a watch request, synchronously. 
3110 2995 */ 3111 - static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 2996 + static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3112 2997 { 3113 - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3114 2998 struct rbd_obj_request *obj_request; 3115 - int ret; 3116 2999 3117 3000 rbd_assert(rbd_dev->watch_event); 3118 3001 rbd_assert(rbd_dev->watch_request); 3119 3002 3120 - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3121 - OBJ_REQUEST_NODATA); 3122 - if (!obj_request) { 3123 - ret = -ENOMEM; 3124 - goto out_cancel; 3125 - } 3126 - 3127 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 3128 - obj_request); 3129 - if (!obj_request->osd_req) { 3130 - ret = -ENOMEM; 3131 - goto out_put; 3132 - } 3133 - 3134 - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3135 - rbd_dev->watch_event->cookie, 0, 0); 3136 - rbd_osd_req_format_write(obj_request); 3137 - 3138 - ret = rbd_obj_request_submit(osdc, obj_request); 3139 - if (ret) 3140 - goto out_put; 3141 - 3142 - ret = rbd_obj_request_wait(obj_request); 3143 - if (ret) 3144 - goto out_put; 3145 - 3146 - ret = obj_request->result; 3147 - if (ret) 3148 - goto out_put; 3149 - 3150 - /* We have successfully torn down the watch request */ 3151 - 3152 - ceph_osdc_unregister_linger_request(osdc, 3153 - rbd_dev->watch_request->osd_req); 3003 + rbd_obj_request_end(rbd_dev->watch_request); 3154 3004 rbd_obj_request_put(rbd_dev->watch_request); 3155 3005 rbd_dev->watch_request = NULL; 3156 3006 3157 - out_put: 3158 - rbd_obj_request_put(obj_request); 3159 - out_cancel: 3007 + obj_request = rbd_obj_watch_request_helper(rbd_dev, false); 3008 + if (!IS_ERR(obj_request)) 3009 + rbd_obj_request_put(obj_request); 3010 + else 3011 + rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", 3012 + PTR_ERR(obj_request)); 3013 + 3160 3014 ceph_osdc_cancel_event(rbd_dev->watch_event); 3161 3015 rbd_dev->watch_event = NULL; 3162 - 3163 - 
return ret; 3164 - } 3165 - 3166 - static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3167 - { 3168 - int ret; 3169 - 3170 - ret = __rbd_dev_header_unwatch_sync(rbd_dev); 3171 - if (ret) { 3172 - rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 3173 - ret); 3174 - } 3175 3016 } 3176 3017 3177 3018 /* ··· 3180 3183 return ret; 3181 3184 } 3182 3185 3186 + static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) 3187 + { 3188 + struct rbd_img_request *img_request; 3189 + u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3190 + u64 length = blk_rq_bytes(rq); 3191 + bool wr = rq_data_dir(rq) == WRITE; 3192 + int result; 3193 + 3194 + /* Ignore/skip any zero-length requests */ 3195 + 3196 + if (!length) { 3197 + dout("%s: zero-length request\n", __func__); 3198 + result = 0; 3199 + goto err_rq; 3200 + } 3201 + 3202 + /* Disallow writes to a read-only device */ 3203 + 3204 + if (wr) { 3205 + if (rbd_dev->mapping.read_only) { 3206 + result = -EROFS; 3207 + goto err_rq; 3208 + } 3209 + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3210 + } 3211 + 3212 + /* 3213 + * Quit early if the mapped snapshot no longer exists. It's 3214 + * still possible the snapshot will have disappeared by the 3215 + * time our request arrives at the osd, but there's no sense in 3216 + * sending it if we already know. 
3217 + */ 3218 + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3219 + dout("request for non-existent snapshot"); 3220 + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3221 + result = -ENXIO; 3222 + goto err_rq; 3223 + } 3224 + 3225 + if (offset && length > U64_MAX - offset + 1) { 3226 + rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3227 + length); 3228 + result = -EINVAL; 3229 + goto err_rq; /* Shouldn't happen */ 3230 + } 3231 + 3232 + if (offset + length > rbd_dev->mapping.size) { 3233 + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 3234 + length, rbd_dev->mapping.size); 3235 + result = -EIO; 3236 + goto err_rq; 3237 + } 3238 + 3239 + img_request = rbd_img_request_create(rbd_dev, offset, length, wr); 3240 + if (!img_request) { 3241 + result = -ENOMEM; 3242 + goto err_rq; 3243 + } 3244 + img_request->rq = rq; 3245 + 3246 + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); 3247 + if (result) 3248 + goto err_img_request; 3249 + 3250 + result = rbd_img_request_submit(img_request); 3251 + if (result) 3252 + goto err_img_request; 3253 + 3254 + return; 3255 + 3256 + err_img_request: 3257 + rbd_img_request_put(img_request); 3258 + err_rq: 3259 + if (result) 3260 + rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3261 + wr ? 
"write" : "read", length, offset, result); 3262 + blk_end_request_all(rq, result); 3263 + } 3264 + 3265 + static void rbd_request_workfn(struct work_struct *work) 3266 + { 3267 + struct rbd_device *rbd_dev = 3268 + container_of(work, struct rbd_device, rq_work); 3269 + struct request *rq, *next; 3270 + LIST_HEAD(requests); 3271 + 3272 + spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ 3273 + list_splice_init(&rbd_dev->rq_queue, &requests); 3274 + spin_unlock_irq(&rbd_dev->lock); 3275 + 3276 + list_for_each_entry_safe(rq, next, &requests, queuelist) { 3277 + list_del_init(&rq->queuelist); 3278 + rbd_handle_request(rbd_dev, rq); 3279 + } 3280 + } 3281 + 3282 + /* 3283 + * Called with q->queue_lock held and interrupts disabled, possibly on 3284 + * the way to schedule(). Do not sleep here! 3285 + */ 3183 3286 static void rbd_request_fn(struct request_queue *q) 3184 - __releases(q->queue_lock) __acquires(q->queue_lock) 3185 3287 { 3186 3288 struct rbd_device *rbd_dev = q->queuedata; 3187 3289 struct request *rq; 3188 - int result; 3290 + int queued = 0; 3291 + 3292 + rbd_assert(rbd_dev); 3189 3293 3190 3294 while ((rq = blk_fetch_request(q))) { 3191 - bool write_request = rq_data_dir(rq) == WRITE; 3192 - struct rbd_img_request *img_request; 3193 - u64 offset; 3194 - u64 length; 3195 - 3196 3295 /* Ignore any non-FS requests that filter through. 
*/ 3197 - 3198 3296 if (rq->cmd_type != REQ_TYPE_FS) { 3199 3297 dout("%s: non-fs request type %d\n", __func__, 3200 3298 (int) rq->cmd_type); ··· 3297 3205 continue; 3298 3206 } 3299 3207 3300 - /* Ignore/skip any zero-length requests */ 3301 - 3302 - offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 3303 - length = (u64) blk_rq_bytes(rq); 3304 - 3305 - if (!length) { 3306 - dout("%s: zero-length request\n", __func__); 3307 - __blk_end_request_all(rq, 0); 3308 - continue; 3309 - } 3310 - 3311 - spin_unlock_irq(q->queue_lock); 3312 - 3313 - /* Disallow writes to a read-only device */ 3314 - 3315 - if (write_request) { 3316 - result = -EROFS; 3317 - if (rbd_dev->mapping.read_only) 3318 - goto end_request; 3319 - rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3320 - } 3321 - 3322 - /* 3323 - * Quit early if the mapped snapshot no longer 3324 - * exists. It's still possible the snapshot will 3325 - * have disappeared by the time our request arrives 3326 - * at the osd, but there's no sense in sending it if 3327 - * we already know. 
3328 - */ 3329 - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3330 - dout("request for non-existent snapshot"); 3331 - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3332 - result = -ENXIO; 3333 - goto end_request; 3334 - } 3335 - 3336 - result = -EINVAL; 3337 - if (offset && length > U64_MAX - offset + 1) { 3338 - rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", 3339 - offset, length); 3340 - goto end_request; /* Shouldn't happen */ 3341 - } 3342 - 3343 - result = -EIO; 3344 - if (offset + length > rbd_dev->mapping.size) { 3345 - rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 3346 - offset, length, rbd_dev->mapping.size); 3347 - goto end_request; 3348 - } 3349 - 3350 - result = -ENOMEM; 3351 - img_request = rbd_img_request_create(rbd_dev, offset, length, 3352 - write_request); 3353 - if (!img_request) 3354 - goto end_request; 3355 - 3356 - img_request->rq = rq; 3357 - 3358 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3359 - rq->bio); 3360 - if (!result) 3361 - result = rbd_img_request_submit(img_request); 3362 - if (result) 3363 - rbd_img_request_put(img_request); 3364 - end_request: 3365 - spin_lock_irq(q->queue_lock); 3366 - if (result < 0) { 3367 - rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 3368 - write_request ? 
"write" : "read", 3369 - length, offset, result); 3370 - 3371 - __blk_end_request_all(rq, result); 3372 - } 3208 + list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); 3209 + queued++; 3373 3210 } 3211 + 3212 + if (queued) 3213 + queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work); 3374 3214 } 3375 3215 3376 3216 /* ··· 3541 3517 u64 mapping_size; 3542 3518 int ret; 3543 3519 3544 - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3545 3520 down_write(&rbd_dev->header_rwsem); 3546 3521 mapping_size = rbd_dev->mapping.size; 3547 - if (rbd_dev->image_format == 1) 3548 - ret = rbd_dev_v1_header_info(rbd_dev); 3549 - else 3550 - ret = rbd_dev_v2_header_info(rbd_dev); 3551 3522 3552 - /* If it's a mapped snapshot, validate its EXISTS flag */ 3523 + ret = rbd_dev_header_info(rbd_dev); 3524 + if (ret) 3525 + return ret; 3553 3526 3554 - rbd_exists_validate(rbd_dev); 3555 - up_write(&rbd_dev->header_rwsem); 3556 - 3557 - if (mapping_size != rbd_dev->mapping.size) { 3558 - rbd_dev_update_size(rbd_dev); 3527 + /* 3528 + * If there is a parent, see if it has disappeared due to the 3529 + * mapped image getting flattened. 3530 + */ 3531 + if (rbd_dev->parent) { 3532 + ret = rbd_dev_v2_parent_info(rbd_dev); 3533 + if (ret) 3534 + return ret; 3559 3535 } 3560 3536 3561 - return ret; 3537 + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 3538 + if (rbd_dev->mapping.size != rbd_dev->header.image_size) 3539 + rbd_dev->mapping.size = rbd_dev->header.image_size; 3540 + } else { 3541 + /* validate mapped snapshot's EXISTS flag */ 3542 + rbd_exists_validate(rbd_dev); 3543 + } 3544 + 3545 + up_write(&rbd_dev->header_rwsem); 3546 + 3547 + if (mapping_size != rbd_dev->mapping.size) 3548 + rbd_dev_update_size(rbd_dev); 3549 + 3550 + return 0; 3562 3551 } 3563 3552 3564 3553 static int rbd_init_disk(struct rbd_device *rbd_dev) ··· 3733 3696 } 3734 3697 3735 3698 /* 3736 - * For an rbd v2 image, shows the pool id, image id, and snapshot id 3737 - * for the parent image. 
If there is no parent, simply shows 3738 - * "(no parent image)". 3699 + * For a v2 image, shows the chain of parent images, separated by empty 3700 + * lines. For v1 images or if there is no parent, shows "(no parent 3701 + * image)". 3739 3702 */ 3740 3703 static ssize_t rbd_parent_show(struct device *dev, 3741 - struct device_attribute *attr, 3742 - char *buf) 3704 + struct device_attribute *attr, 3705 + char *buf) 3743 3706 { 3744 3707 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3745 - struct rbd_spec *spec = rbd_dev->parent_spec; 3746 - int count; 3747 - char *bufp = buf; 3708 + ssize_t count = 0; 3748 3709 3749 - if (!spec) 3710 + if (!rbd_dev->parent) 3750 3711 return sprintf(buf, "(no parent image)\n"); 3751 3712 3752 - count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 3753 - (unsigned long long) spec->pool_id, spec->pool_name); 3754 - if (count < 0) 3755 - return count; 3756 - bufp += count; 3713 + for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3714 + struct rbd_spec *spec = rbd_dev->parent_spec; 3757 3715 3758 - count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 3759 - spec->image_name ? spec->image_name : "(unknown)"); 3760 - if (count < 0) 3761 - return count; 3762 - bufp += count; 3716 + count += sprintf(&buf[count], "%s" 3717 + "pool_id %llu\npool_name %s\n" 3718 + "image_id %s\nimage_name %s\n" 3719 + "snap_id %llu\nsnap_name %s\n" 3720 + "overlap %llu\n", 3721 + !count ? "" : "\n", /* first? 
*/ 3722 + spec->pool_id, spec->pool_name, 3723 + spec->image_id, spec->image_name ?: "(unknown)", 3724 + spec->snap_id, spec->snap_name, 3725 + rbd_dev->parent_overlap); 3726 + } 3763 3727 3764 - count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 3765 - (unsigned long long) spec->snap_id, spec->snap_name); 3766 - if (count < 0) 3767 - return count; 3768 - bufp += count; 3769 - 3770 - count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 3771 - if (count < 0) 3772 - return count; 3773 - bufp += count; 3774 - 3775 - return (ssize_t) (bufp - buf); 3728 + return count; 3776 3729 } 3777 3730 3778 3731 static ssize_t rbd_image_refresh(struct device *dev, ··· 3775 3748 3776 3749 ret = rbd_dev_refresh(rbd_dev); 3777 3750 if (ret) 3778 - rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3751 + return ret; 3779 3752 3780 - return ret < 0 ? ret : size; 3753 + return size; 3781 3754 } 3782 3755 3783 3756 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); ··· 3849 3822 spec = kzalloc(sizeof (*spec), GFP_KERNEL); 3850 3823 if (!spec) 3851 3824 return NULL; 3825 + 3826 + spec->pool_id = CEPH_NOPOOL; 3827 + spec->snap_id = CEPH_NOSNAP; 3852 3828 kref_init(&spec->kref); 3853 3829 3854 3830 return spec; ··· 3878 3848 return NULL; 3879 3849 3880 3850 spin_lock_init(&rbd_dev->lock); 3851 + INIT_LIST_HEAD(&rbd_dev->rq_queue); 3852 + INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); 3881 3853 rbd_dev->flags = 0; 3882 3854 atomic_set(&rbd_dev->parent_ref, 0); 3883 3855 INIT_LIST_HEAD(&rbd_dev->node); ··· 4053 4021 goto out_err; 4054 4022 } 4055 4023 4056 - snapid = cpu_to_le64(CEPH_NOSNAP); 4024 + snapid = cpu_to_le64(rbd_dev->spec->snap_id); 4057 4025 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4058 4026 "rbd", "get_parent", 4059 4027 &snapid, sizeof (snapid), ··· 4091 4059 4092 4060 ret = -EIO; 4093 4061 if (pool_id > (u64)U32_MAX) { 4094 - rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 4062 + rbd_warn(NULL, "parent pool id too 
large (%llu > %u)", 4095 4063 (unsigned long long)pool_id, U32_MAX); 4096 4064 goto out_err; 4097 4065 } ··· 4115 4083 parent_spec->snap_id = snap_id; 4116 4084 rbd_dev->parent_spec = parent_spec; 4117 4085 parent_spec = NULL; /* rbd_dev now owns this */ 4086 + } else { 4087 + kfree(image_id); 4118 4088 } 4119 4089 4120 4090 /* ··· 4144 4110 * overlap is zero we just pretend there was 4145 4111 * no parent image. 4146 4112 */ 4147 - rbd_warn(rbd_dev, "ignoring parent of " 4148 - "clone with overlap 0\n"); 4113 + rbd_warn(rbd_dev, "ignoring parent with overlap 0"); 4149 4114 } 4150 4115 } 4151 4116 out: ··· 4312 4279 } 4313 4280 4314 4281 /* 4315 - * When an rbd image has a parent image, it is identified by the 4316 - * pool, image, and snapshot ids (not names). This function fills 4317 - * in the names for those ids. (It's OK if we can't figure out the 4318 - * name for an image id, but the pool and snapshot ids should always 4319 - * exist and have names.) All names in an rbd spec are dynamically 4320 - * allocated. 4321 - * 4322 - * When an image being mapped (not a parent) is probed, we have the 4323 - * pool name and pool id, image name and image id, and the snapshot 4324 - * name. The only thing we're missing is the snapshot id. 4282 + * An image being mapped will have everything but the snap id. 
4325 4283 */ 4326 - static int rbd_dev_spec_update(struct rbd_device *rbd_dev) 4284 + static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 4285 + { 4286 + struct rbd_spec *spec = rbd_dev->spec; 4287 + 4288 + rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 4289 + rbd_assert(spec->image_id && spec->image_name); 4290 + rbd_assert(spec->snap_name); 4291 + 4292 + if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 4293 + u64 snap_id; 4294 + 4295 + snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 4296 + if (snap_id == CEPH_NOSNAP) 4297 + return -ENOENT; 4298 + 4299 + spec->snap_id = snap_id; 4300 + } else { 4301 + spec->snap_id = CEPH_NOSNAP; 4302 + } 4303 + 4304 + return 0; 4305 + } 4306 + 4307 + /* 4308 + * A parent image will have all ids but none of the names. 4309 + * 4310 + * All names in an rbd spec are dynamically allocated. It's OK if we 4311 + * can't figure out the name for an image id. 4312 + */ 4313 + static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 4327 4314 { 4328 4315 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4329 4316 struct rbd_spec *spec = rbd_dev->spec; ··· 4352 4299 const char *snap_name; 4353 4300 int ret; 4354 4301 4355 - /* 4356 - * An image being mapped will have the pool name (etc.), but 4357 - * we need to look up the snapshot id. 
4358 - */ 4359 - if (spec->pool_name) { 4360 - if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 4361 - u64 snap_id; 4362 - 4363 - snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 4364 - if (snap_id == CEPH_NOSNAP) 4365 - return -ENOENT; 4366 - spec->snap_id = snap_id; 4367 - } else { 4368 - spec->snap_id = CEPH_NOSNAP; 4369 - } 4370 - 4371 - return 0; 4372 - } 4302 + rbd_assert(spec->pool_id != CEPH_NOPOOL); 4303 + rbd_assert(spec->image_id); 4304 + rbd_assert(spec->snap_id != CEPH_NOSNAP); 4373 4305 4374 4306 /* Get the pool name; we have to make our own copy of this */ 4375 4307 ··· 4373 4335 if (!image_name) 4374 4336 rbd_warn(rbd_dev, "unable to get image name"); 4375 4337 4376 - /* Look up the snapshot name, and make a copy */ 4338 + /* Fetch the snapshot name */ 4377 4339 4378 4340 snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4379 4341 if (IS_ERR(snap_name)) { ··· 4386 4348 spec->snap_name = snap_name; 4387 4349 4388 4350 return 0; 4351 + 4389 4352 out_err: 4390 4353 kfree(image_name); 4391 4354 kfree(pool_name); 4392 - 4393 4355 return ret; 4394 4356 } 4395 4357 ··· 4521 4483 return ret; 4522 4484 } 4523 4485 4524 - /* 4525 - * If the image supports layering, get the parent info. We 4526 - * need to probe the first time regardless. Thereafter we 4527 - * only need to if there's a parent, to see if it has 4528 - * disappeared due to the mapped image getting flattened. 4529 - */ 4530 - if (rbd_dev->header.features & RBD_FEATURE_LAYERING && 4531 - (first_time || rbd_dev->parent_spec)) { 4532 - bool warn; 4533 - 4534 - ret = rbd_dev_v2_parent_info(rbd_dev); 4535 - if (ret) 4536 - return ret; 4537 - 4538 - /* 4539 - * Print a warning if this is the initial probe and 4540 - * the image has a parent. Don't print it if the 4541 - * image now being probed is itself a parent. We 4542 - * can tell at this point because we won't know its 4543 - * pool name yet (just its pool id). 
4544 - */ 4545 - warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; 4546 - if (first_time && warn) 4547 - rbd_warn(rbd_dev, "WARNING: kernel layering " 4548 - "is EXPERIMENTAL!"); 4549 - } 4550 - 4551 - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 4552 - if (rbd_dev->mapping.size != rbd_dev->header.image_size) 4553 - rbd_dev->mapping.size = rbd_dev->header.image_size; 4554 - 4555 4486 ret = rbd_dev_v2_snap_context(rbd_dev); 4556 4487 dout("rbd_dev_v2_snap_context returned %d\n", ret); 4557 4488 4558 4489 return ret; 4490 + } 4491 + 4492 + static int rbd_dev_header_info(struct rbd_device *rbd_dev) 4493 + { 4494 + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4495 + 4496 + if (rbd_dev->image_format == 1) 4497 + return rbd_dev_v1_header_info(rbd_dev); 4498 + 4499 + return rbd_dev_v2_header_info(rbd_dev); 4559 4500 } 4560 4501 4561 4502 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) ··· 5083 5066 ret = rbd_dev_mapping_set(rbd_dev); 5084 5067 if (ret) 5085 5068 goto err_out_disk; 5069 + 5086 5070 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 5087 5071 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5088 5072 5073 + rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0); 5074 + if (!rbd_dev->rq_wq) 5075 + goto err_out_mapping; 5076 + 5089 5077 ret = rbd_bus_add_dev(rbd_dev); 5090 5078 if (ret) 5091 - goto err_out_mapping; 5079 + goto err_out_workqueue; 5092 5080 5093 5081 /* Everything's ready. Announce the disk to the world. 
*/ 5094 5082 ··· 5105 5083 5106 5084 return ret; 5107 5085 5086 + err_out_workqueue: 5087 + destroy_workqueue(rbd_dev->rq_wq); 5088 + rbd_dev->rq_wq = NULL; 5108 5089 err_out_mapping: 5109 5090 rbd_dev_mapping_clear(rbd_dev); 5110 5091 err_out_disk: ··· 5180 5155 ret = rbd_dev_image_id(rbd_dev); 5181 5156 if (ret) 5182 5157 return ret; 5183 - rbd_assert(rbd_dev->spec->image_id); 5184 - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5185 5158 5186 5159 ret = rbd_dev_header_name(rbd_dev); 5187 5160 if (ret) ··· 5191 5168 goto out_header_name; 5192 5169 } 5193 5170 5194 - if (rbd_dev->image_format == 1) 5195 - ret = rbd_dev_v1_header_info(rbd_dev); 5196 - else 5197 - ret = rbd_dev_v2_header_info(rbd_dev); 5171 + ret = rbd_dev_header_info(rbd_dev); 5198 5172 if (ret) 5199 5173 goto err_out_watch; 5200 5174 5201 - ret = rbd_dev_spec_update(rbd_dev); 5175 + /* 5176 + * If this image is the one being mapped, we have pool name and 5177 + * id, image name and id, and snap name - need to fill snap id. 5178 + * Otherwise this is a parent image, identified by pool, image 5179 + * and snap ids - need to fill in names for those ids. 5180 + */ 5181 + if (mapping) 5182 + ret = rbd_spec_fill_snap_id(rbd_dev); 5183 + else 5184 + ret = rbd_spec_fill_names(rbd_dev); 5202 5185 if (ret) 5203 5186 goto err_out_probe; 5187 + 5188 + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5189 + ret = rbd_dev_v2_parent_info(rbd_dev); 5190 + if (ret) 5191 + goto err_out_probe; 5192 + 5193 + /* 5194 + * Need to warn users if this image is the one being 5195 + * mapped and has a parent. 
5196 + */ 5197 + if (mapping && rbd_dev->parent_spec) 5198 + rbd_warn(rbd_dev, 5199 + "WARNING: kernel layering is EXPERIMENTAL!"); 5200 + } 5204 5201 5205 5202 ret = rbd_dev_probe_parent(rbd_dev); 5206 5203 if (ret) ··· 5228 5185 5229 5186 dout("discovered format %u image, header name is %s\n", 5230 5187 rbd_dev->image_format, rbd_dev->header_name); 5231 - 5232 5188 return 0; 5189 + 5233 5190 err_out_probe: 5234 5191 rbd_dev_unprobe(rbd_dev); 5235 5192 err_out_watch: ··· 5242 5199 rbd_dev->image_format = 0; 5243 5200 kfree(rbd_dev->spec->image_id); 5244 5201 rbd_dev->spec->image_id = NULL; 5245 - 5246 - dout("probe failed, returning %d\n", ret); 5247 - 5248 5202 return ret; 5249 5203 } 5250 5204 ··· 5283 5243 /* The ceph file layout needs to fit pool id in 32 bits */ 5284 5244 5285 5245 if (spec->pool_id > (u64)U32_MAX) { 5286 - rbd_warn(NULL, "pool id too large (%llu > %u)\n", 5246 + rbd_warn(NULL, "pool id too large (%llu > %u)", 5287 5247 (unsigned long long)spec->pool_id, U32_MAX); 5288 5248 rc = -EIO; 5289 5249 goto err_out_client; ··· 5354 5314 { 5355 5315 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5356 5316 5317 + destroy_workqueue(rbd_dev->rq_wq); 5357 5318 rbd_free_disk(rbd_dev); 5358 5319 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5359 5320 rbd_dev_mapping_clear(rbd_dev);
+12 -2
fs/ceph/acl.c
··· 172 172 int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 173 173 { 174 174 struct posix_acl *default_acl, *acl; 175 + umode_t new_mode = inode->i_mode; 175 176 int error; 176 177 177 - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); 178 + error = posix_acl_create(dir, &new_mode, &default_acl, &acl); 178 179 if (error) 179 180 return error; 180 181 181 - if (!default_acl && !acl) 182 + if (!default_acl && !acl) { 182 183 cache_no_acl(inode); 184 + if (new_mode != inode->i_mode) { 185 + struct iattr newattrs = { 186 + .ia_mode = new_mode, 187 + .ia_valid = ATTR_MODE, 188 + }; 189 + error = ceph_setattr(dentry, &newattrs); 190 + } 191 + return error; 192 + } 183 193 184 194 if (default_acl) { 185 195 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+1 -1
fs/ceph/caps.c
··· 3277 3277 rel->ino = cpu_to_le64(ceph_ino(inode)); 3278 3278 rel->cap_id = cpu_to_le64(cap->cap_id); 3279 3279 rel->seq = cpu_to_le32(cap->seq); 3280 - rel->issue_seq = cpu_to_le32(cap->issue_seq), 3280 + rel->issue_seq = cpu_to_le32(cap->issue_seq); 3281 3281 rel->mseq = cpu_to_le32(cap->mseq); 3282 3282 rel->caps = cpu_to_le32(cap->implemented); 3283 3283 rel->wanted = cpu_to_le32(cap->mds_wanted);
+16 -8
fs/ceph/file.c
··· 423 423 dout("sync_read on file %p %llu~%u %s\n", file, off, 424 424 (unsigned)len, 425 425 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 426 + 427 + if (!len) 428 + return 0; 426 429 /* 427 430 * flush any page cache pages in this range. this 428 431 * will make concurrent normal and sync io slow, ··· 473 470 size_t left = ret; 474 471 475 472 while (left) { 476 - int copy = min_t(size_t, PAGE_SIZE, left); 477 - l = copy_page_to_iter(pages[k++], 0, copy, i); 473 + size_t page_off = off & ~PAGE_MASK; 474 + size_t copy = min_t(size_t, 475 + PAGE_SIZE - page_off, left); 476 + l = copy_page_to_iter(pages[k++], page_off, 477 + copy, i); 478 478 off += l; 479 479 left -= l; 480 480 if (l < copy) ··· 537 531 * objects, rollback on failure, etc.) 538 532 */ 539 533 static ssize_t 540 - ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) 534 + ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) 541 535 { 542 536 struct file *file = iocb->ki_filp; 543 537 struct inode *inode = file_inode(file); ··· 553 547 int check_caps = 0; 554 548 int ret; 555 549 struct timespec mtime = CURRENT_TIME; 556 - loff_t pos = iocb->ki_pos; 557 550 size_t count = iov_iter_count(from); 558 551 559 552 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) ··· 651 646 * correct atomic write, we should e.g. take write locks on all 652 647 * objects, rollback on failure, etc.) 
653 648 */ 654 - static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) 649 + static ssize_t 650 + ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) 655 651 { 656 652 struct file *file = iocb->ki_filp; 657 653 struct inode *inode = file_inode(file); ··· 669 663 int check_caps = 0; 670 664 int ret; 671 665 struct timespec mtime = CURRENT_TIME; 672 - loff_t pos = iocb->ki_pos; 673 666 size_t count = iov_iter_count(from); 674 667 675 668 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) ··· 923 918 /* we might need to revert back to that point */ 924 919 data = *from; 925 920 if (file->f_flags & O_DIRECT) 926 - written = ceph_sync_direct_write(iocb, &data); 921 + written = ceph_sync_direct_write(iocb, &data, pos); 927 922 else 928 - written = ceph_sync_write(iocb, &data); 923 + written = ceph_sync_write(iocb, &data, pos); 929 924 if (written == -EOLDSNAPC) { 930 925 dout("aio_write %p %llx.%llx %llu~%u" 931 926 "got EOLDSNAPC, retrying\n", ··· 1181 1176 int ret = 0; 1182 1177 loff_t endoff = 0; 1183 1178 loff_t size; 1179 + 1180 + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 1181 + return -EOPNOTSUPP; 1184 1182 1185 1183 if (!S_ISREG(inode->i_mode)) 1186 1184 return -EOPNOTSUPP;
+12 -4
fs/ceph/mds_client.c
··· 1904 1904 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1905 1905 1906 1906 if (req->r_got_unsafe) { 1907 + void *p; 1907 1908 /* 1908 1909 * Replay. Do not regenerate message (and rebuild 1909 1910 * paths, etc.); just use the original message. ··· 1925 1924 1926 1925 /* remove cap/dentry releases from message */ 1927 1926 rhead->num_releases = 0; 1928 - msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); 1929 - msg->front.iov_len = req->r_request_release_offset; 1927 + 1928 + /* time stamp */ 1929 + p = msg->front.iov_base + req->r_request_release_offset; 1930 + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1931 + 1932 + msg->front.iov_len = p - msg->front.iov_base; 1933 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1930 1934 return 0; 1931 1935 } 1932 1936 ··· 2067 2061 static void kick_requests(struct ceph_mds_client *mdsc, int mds) 2068 2062 { 2069 2063 struct ceph_mds_request *req; 2070 - struct rb_node *p; 2064 + struct rb_node *p = rb_first(&mdsc->request_tree); 2071 2065 2072 2066 dout("kick_requests mds%d\n", mds); 2073 - for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { 2067 + while (p) { 2074 2068 req = rb_entry(p, struct ceph_mds_request, r_node); 2069 + p = rb_next(p); 2075 2070 if (req->r_got_unsafe) 2076 2071 continue; 2077 2072 if (req->r_session && ··· 2255 2248 */ 2256 2249 if (result == -ESTALE) { 2257 2250 dout("got ESTALE on request %llu", req->r_tid); 2251 + req->r_resend_mds = -1; 2258 2252 if (req->r_direct_mode != USE_AUTH_MDS) { 2259 2253 dout("not using auth, setting for that now"); 2260 2254 req->r_direct_mode = USE_AUTH_MDS;
+2 -2
fs/ceph/xattr.c
··· 592 592 xattr_version = ci->i_xattrs.version; 593 593 spin_unlock(&ci->i_ceph_lock); 594 594 595 - xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 595 + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), 596 596 GFP_NOFS); 597 597 err = -ENOMEM; 598 598 if (!xattrs) 599 599 goto bad_lock; 600 - memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 600 + 601 601 for (i = 0; i < numattr; i++) { 602 602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 603 603 GFP_NOFS);
+2 -12
include/linux/ceph/messenger.h
··· 285 285 286 286 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 287 287 bool can_fail); 288 - extern void ceph_msg_kfree(struct ceph_msg *m); 289 288 290 - 291 - static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 292 - { 293 - kref_get(&msg->kref); 294 - return msg; 295 - } 296 - extern void ceph_msg_last_put(struct kref *kref); 297 - static inline void ceph_msg_put(struct ceph_msg *msg) 298 - { 299 - kref_put(&msg->kref, ceph_msg_last_put); 300 - } 289 + extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg); 290 + extern void ceph_msg_put(struct ceph_msg *msg); 301 291 302 292 extern void ceph_msg_dump(struct ceph_msg *msg); 303 293
+4 -12
include/linux/ceph/osd_client.h
··· 117 117 struct list_head r_req_lru_item; 118 118 struct list_head r_osd_item; 119 119 struct list_head r_linger_item; 120 - struct list_head r_linger_osd; 120 + struct list_head r_linger_osd_item; 121 121 struct ceph_osd *r_osd; 122 122 struct ceph_pg r_pgid; 123 123 int r_pg_osds[CEPH_PG_MAX_SIZE]; ··· 325 325 326 326 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 327 327 struct ceph_osd_request *req); 328 - extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 329 - struct ceph_osd_request *req); 330 328 331 - static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 332 - { 333 - kref_get(&req->r_kref); 334 - } 335 - extern void ceph_osdc_release_request(struct kref *kref); 336 - static inline void ceph_osdc_put_request(struct ceph_osd_request *req) 337 - { 338 - kref_put(&req->r_kref, ceph_osdc_release_request); 339 - } 329 + extern void ceph_osdc_get_request(struct ceph_osd_request *req); 330 + extern void ceph_osdc_put_request(struct ceph_osd_request *req); 340 331 341 332 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 342 333 struct ceph_osd_request *req, 343 334 bool nofail); 335 + extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 344 336 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 345 337 struct ceph_osd_request *req); 346 338 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+33 -14
net/ceph/messenger.c
··· 174 174 #define SKIP_BUF_SIZE 1024 175 175 176 176 static void queue_con(struct ceph_connection *con); 177 + static void cancel_con(struct ceph_connection *con); 177 178 static void con_work(struct work_struct *); 178 179 static void con_fault(struct ceph_connection *con); 179 180 ··· 681 680 682 681 reset_connection(con); 683 682 con->peer_global_seq = 0; 684 - cancel_delayed_work(&con->work); 683 + cancel_con(con); 685 684 con_close_socket(con); 686 685 mutex_unlock(&con->mutex); 687 686 } ··· 901 900 BUG_ON(page_count > (int)USHRT_MAX); 902 901 cursor->page_count = (unsigned short)page_count; 903 902 BUG_ON(length > SIZE_MAX - cursor->page_offset); 904 - cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; 903 + cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; 905 904 } 906 905 907 906 static struct page * ··· 2668 2667 { 2669 2668 if (!con->ops->get(con)) { 2670 2669 dout("%s %p ref count 0\n", __func__, con); 2671 - 2672 2670 return -ENOENT; 2673 2671 } 2674 2672 2675 2673 if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { 2676 2674 dout("%s %p - already queued\n", __func__, con); 2677 2675 con->ops->put(con); 2678 - 2679 2676 return -EBUSY; 2680 2677 } 2681 2678 2682 2679 dout("%s %p %lu\n", __func__, con, delay); 2683 - 2684 2680 return 0; 2685 2681 } 2686 2682 2687 2683 static void queue_con(struct ceph_connection *con) 2688 2684 { 2689 2685 (void) queue_con_delay(con, 0); 2686 + } 2687 + 2688 + static void cancel_con(struct ceph_connection *con) 2689 + { 2690 + if (cancel_delayed_work(&con->work)) { 2691 + dout("%s %p\n", __func__, con); 2692 + con->ops->put(con); 2693 + } 2690 2694 } 2691 2695 2692 2696 static bool con_sock_closed(struct ceph_connection *con) ··· 3275 3269 /* 3276 3270 * Free a generically kmalloc'd message. 
3277 3271 */ 3278 - void ceph_msg_kfree(struct ceph_msg *m) 3272 + static void ceph_msg_free(struct ceph_msg *m) 3279 3273 { 3280 - dout("msg_kfree %p\n", m); 3274 + dout("%s %p\n", __func__, m); 3281 3275 ceph_kvfree(m->front.iov_base); 3282 3276 kmem_cache_free(ceph_msg_cache, m); 3283 3277 } 3284 3278 3285 - /* 3286 - * Drop a msg ref. Destroy as needed. 3287 - */ 3288 - void ceph_msg_last_put(struct kref *kref) 3279 + static void ceph_msg_release(struct kref *kref) 3289 3280 { 3290 3281 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); 3291 3282 LIST_HEAD(data); 3292 3283 struct list_head *links; 3293 3284 struct list_head *next; 3294 3285 3295 - dout("ceph_msg_put last one on %p\n", m); 3286 + dout("%s %p\n", __func__, m); 3296 3287 WARN_ON(!list_empty(&m->list_head)); 3297 3288 3298 3289 /* drop middle, data, if any */ ··· 3311 3308 if (m->pool) 3312 3309 ceph_msgpool_put(m->pool, m); 3313 3310 else 3314 - ceph_msg_kfree(m); 3311 + ceph_msg_free(m); 3315 3312 } 3316 - EXPORT_SYMBOL(ceph_msg_last_put); 3313 + 3314 + struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 3315 + { 3316 + dout("%s %p (was %d)\n", __func__, msg, 3317 + atomic_read(&msg->kref.refcount)); 3318 + kref_get(&msg->kref); 3319 + return msg; 3320 + } 3321 + EXPORT_SYMBOL(ceph_msg_get); 3322 + 3323 + void ceph_msg_put(struct ceph_msg *msg) 3324 + { 3325 + dout("%s %p (was %d)\n", __func__, msg, 3326 + atomic_read(&msg->kref.refcount)); 3327 + kref_put(&msg->kref, ceph_msg_release); 3328 + } 3329 + EXPORT_SYMBOL(ceph_msg_put); 3317 3330 3318 3331 void ceph_msg_dump(struct ceph_msg *msg) 3319 3332 {
+89 -42
net/ceph/osd_client.c
··· 297 297 /* 298 298 * requests 299 299 */ 300 - void ceph_osdc_release_request(struct kref *kref) 300 + static void ceph_osdc_release_request(struct kref *kref) 301 301 { 302 - struct ceph_osd_request *req; 302 + struct ceph_osd_request *req = container_of(kref, 303 + struct ceph_osd_request, r_kref); 303 304 unsigned int which; 304 305 305 - req = container_of(kref, struct ceph_osd_request, r_kref); 306 + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 307 + req->r_request, req->r_reply); 308 + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 309 + WARN_ON(!list_empty(&req->r_req_lru_item)); 310 + WARN_ON(!list_empty(&req->r_osd_item)); 311 + WARN_ON(!list_empty(&req->r_linger_item)); 312 + WARN_ON(!list_empty(&req->r_linger_osd_item)); 313 + WARN_ON(req->r_osd); 314 + 306 315 if (req->r_request) 307 316 ceph_msg_put(req->r_request); 308 317 if (req->r_reply) { ··· 329 320 kmem_cache_free(ceph_osd_request_cache, req); 330 321 331 322 } 332 - EXPORT_SYMBOL(ceph_osdc_release_request); 323 + 324 + void ceph_osdc_get_request(struct ceph_osd_request *req) 325 + { 326 + dout("%s %p (was %d)\n", __func__, req, 327 + atomic_read(&req->r_kref.refcount)); 328 + kref_get(&req->r_kref); 329 + } 330 + EXPORT_SYMBOL(ceph_osdc_get_request); 331 + 332 + void ceph_osdc_put_request(struct ceph_osd_request *req) 333 + { 334 + dout("%s %p (was %d)\n", __func__, req, 335 + atomic_read(&req->r_kref.refcount)); 336 + kref_put(&req->r_kref, ceph_osdc_release_request); 337 + } 338 + EXPORT_SYMBOL(ceph_osdc_put_request); 333 339 334 340 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 335 341 struct ceph_snap_context *snapc, ··· 388 364 RB_CLEAR_NODE(&req->r_node); 389 365 INIT_LIST_HEAD(&req->r_unsafe_item); 390 366 INIT_LIST_HEAD(&req->r_linger_item); 391 - INIT_LIST_HEAD(&req->r_linger_osd); 367 + INIT_LIST_HEAD(&req->r_linger_osd_item); 392 368 INIT_LIST_HEAD(&req->r_req_lru_item); 393 369 INIT_LIST_HEAD(&req->r_osd_item); 394 370 ··· 940 916 * list at 
the end to keep things in tid order. 941 917 */ 942 918 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 943 - r_linger_osd) { 919 + r_linger_osd_item) { 944 920 /* 945 921 * reregister request prior to unregistering linger so 946 922 * that r_osd is preserved. ··· 1032 1008 { 1033 1009 dout("__remove_osd %p\n", osd); 1034 1010 BUG_ON(!list_empty(&osd->o_requests)); 1011 + BUG_ON(!list_empty(&osd->o_linger_requests)); 1012 + 1035 1013 rb_erase(&osd->o_node, &osdc->osds); 1036 1014 list_del_init(&osd->o_osd_lru); 1037 1015 ceph_con_close(&osd->o_con); ··· 1055 1029 static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1056 1030 struct ceph_osd *osd) 1057 1031 { 1058 - dout("__move_osd_to_lru %p\n", osd); 1032 + dout("%s %p\n", __func__, osd); 1059 1033 BUG_ON(!list_empty(&osd->o_osd_lru)); 1034 + 1060 1035 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1061 1036 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; 1037 + } 1038 + 1039 + static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1040 + struct ceph_osd *osd) 1041 + { 1042 + dout("%s %p\n", __func__, osd); 1043 + 1044 + if (list_empty(&osd->o_requests) && 1045 + list_empty(&osd->o_linger_requests)) 1046 + __move_osd_to_lru(osdc, osd); 1062 1047 } 1063 1048 1064 1049 static void __remove_osd_from_lru(struct ceph_osd *osd) ··· 1212 1175 1213 1176 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1214 1177 rb_erase(&req->r_node, &osdc->requests); 1178 + RB_CLEAR_NODE(&req->r_node); 1215 1179 osdc->num_requests--; 1216 1180 1217 1181 if (req->r_osd) { ··· 1220 1182 ceph_msg_revoke(req->r_request); 1221 1183 1222 1184 list_del_init(&req->r_osd_item); 1223 - if (list_empty(&req->r_osd->o_requests) && 1224 - list_empty(&req->r_osd->o_linger_requests)) { 1225 - dout("moving osd to %p lru\n", req->r_osd); 1226 - __move_osd_to_lru(osdc, req->r_osd); 1227 - } 1228 - if (list_empty(&req->r_linger_item)) 1185 + maybe_move_osd_to_lru(osdc, req->r_osd); 1186 + if 
(list_empty(&req->r_linger_osd_item)) 1229 1187 req->r_osd = NULL; 1230 1188 } 1231 1189 ··· 1248 1214 static void __register_linger_request(struct ceph_osd_client *osdc, 1249 1215 struct ceph_osd_request *req) 1250 1216 { 1251 - dout("__register_linger_request %p\n", req); 1217 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1218 + WARN_ON(!req->r_linger); 1219 + 1252 1220 ceph_osdc_get_request(req); 1253 1221 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1254 1222 if (req->r_osd) 1255 - list_add_tail(&req->r_linger_osd, 1223 + list_add_tail(&req->r_linger_osd_item, 1256 1224 &req->r_osd->o_linger_requests); 1257 1225 } 1258 1226 1259 1227 static void __unregister_linger_request(struct ceph_osd_client *osdc, 1260 1228 struct ceph_osd_request *req) 1261 1229 { 1262 - dout("__unregister_linger_request %p\n", req); 1263 - list_del_init(&req->r_linger_item); 1264 - if (req->r_osd) { 1265 - list_del_init(&req->r_linger_osd); 1230 + WARN_ON(!req->r_linger); 1266 1231 1267 - if (list_empty(&req->r_osd->o_requests) && 1268 - list_empty(&req->r_osd->o_linger_requests)) { 1269 - dout("moving osd to %p lru\n", req->r_osd); 1270 - __move_osd_to_lru(osdc, req->r_osd); 1271 - } 1232 + if (list_empty(&req->r_linger_item)) { 1233 + dout("%s %p tid %llu not registered\n", __func__, req, 1234 + req->r_tid); 1235 + return; 1236 + } 1237 + 1238 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1239 + list_del_init(&req->r_linger_item); 1240 + 1241 + if (req->r_osd) { 1242 + list_del_init(&req->r_linger_osd_item); 1243 + maybe_move_osd_to_lru(osdc, req->r_osd); 1272 1244 if (list_empty(&req->r_osd_item)) 1273 1245 req->r_osd = NULL; 1274 1246 } 1275 1247 ceph_osdc_put_request(req); 1276 1248 } 1277 - 1278 - void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 1279 - struct ceph_osd_request *req) 1280 - { 1281 - mutex_lock(&osdc->request_mutex); 1282 - if (req->r_linger) { 1283 - req->r_linger = 0; 1284 - __unregister_linger_request(osdc, req); 
1285 - } 1286 - mutex_unlock(&osdc->request_mutex); 1287 - } 1288 - EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); 1289 1249 1290 1250 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1291 1251 struct ceph_osd_request *req) ··· 2458 2430 EXPORT_SYMBOL(ceph_osdc_start_request); 2459 2431 2460 2432 /* 2433 + * Unregister a registered request. The request is not completed (i.e. 2434 + * no callbacks or wakeups) - higher layers are supposed to know what 2435 + * they are canceling. 2436 + */ 2437 + void ceph_osdc_cancel_request(struct ceph_osd_request *req) 2438 + { 2439 + struct ceph_osd_client *osdc = req->r_osdc; 2440 + 2441 + mutex_lock(&osdc->request_mutex); 2442 + if (req->r_linger) 2443 + __unregister_linger_request(osdc, req); 2444 + __unregister_request(osdc, req); 2445 + mutex_unlock(&osdc->request_mutex); 2446 + 2447 + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 2448 + } 2449 + EXPORT_SYMBOL(ceph_osdc_cancel_request); 2450 + 2451 + /* 2461 2452 * wait for a request to complete 2462 2453 */ 2463 2454 int ceph_osdc_wait_request(struct ceph_osd_client *osdc, ··· 2484 2437 { 2485 2438 int rc; 2486 2439 2440 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 2441 + 2487 2442 rc = wait_for_completion_interruptible(&req->r_completion); 2488 2443 if (rc < 0) { 2489 - mutex_lock(&osdc->request_mutex); 2490 - __cancel_request(req); 2491 - __unregister_request(osdc, req); 2492 - mutex_unlock(&osdc->request_mutex); 2444 + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 2445 + ceph_osdc_cancel_request(req); 2493 2446 complete_request(req); 2494 - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); 2495 2447 return rc; 2496 2448 } 2497 2449 2498 - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); 2450 + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 2451 + req->r_result); 2499 2452 return req->r_result; 2500 2453 } 2501 2454 EXPORT_SYMBOL(ceph_osdc_wait_request);