Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

+2 -2

Documentation/ABI/testing/sysfs-bus-rbd

··· 94 94 95 95 parent 96 96 97 - Information identifying the pool, image, and snapshot id for 98 - the parent image in a layered rbd image (format 2 only). 97 + Information identifying the chain of parent images in a layered rbd 98 + image. Entries are separated by empty lines.

+380 -339

drivers/block/rbd.c

··· 42 42 #include <linux/blkdev.h> 43 43 #include <linux/slab.h> 44 44 #include <linux/idr.h> 45 + #include <linux/workqueue.h> 45 46 46 47 #include "rbd_types.h" 47 48 ··· 333 332 334 333 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 335 334 335 + struct list_head rq_queue; /* incoming rq queue */ 336 336 spinlock_t lock; /* queue, flags, open_count */ 337 + struct workqueue_struct *rq_wq; 338 + struct work_struct rq_work; 337 339 338 340 struct rbd_image_header header; 339 341 unsigned long flags; /* possibly lock protected */ ··· 518 514 519 515 static int rbd_dev_refresh(struct rbd_device *rbd_dev); 520 516 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 521 - static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 517 + static int rbd_dev_header_info(struct rbd_device *rbd_dev); 518 + static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 522 519 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 523 520 u64 snap_id); 524 521 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ··· 976 971 header->snap_names = snap_names; 977 972 header->snap_sizes = snap_sizes; 978 973 979 - /* Make sure mapping size is consistent with header info */ 980 - 981 - if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 982 - if (rbd_dev->mapping.size != header->image_size) 983 - rbd_dev->mapping.size = header->image_size; 984 - 985 974 return 0; 986 975 out_2big: 987 976 ret = -EIO; ··· 1138 1139 rbd_dev->mapping.features = 0; 1139 1140 } 1140 1141 1142 + static void rbd_segment_name_free(const char *name) 1143 + { 1144 + /* The explicit cast here is needed to drop the const qualifier */ 1145 + 1146 + kmem_cache_free(rbd_segment_name_cache, (void *)name); 1147 + } 1148 + 1141 1149 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1142 1150 { 1143 1151 char *name; ··· 1164 1158 if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 1165 1159 pr_err("error formatting segment name for #%llu (%d)\n", 1166 1160 segment, ret); 1167 - kfree(name); 1161 + rbd_segment_name_free(name); 1168 1162 name = NULL; 1169 1163 } 1170 1164 1171 1165 return name; 1172 - } 1173 - 1174 - static void rbd_segment_name_free(const char *name) 1175 - { 1176 - /* The explicit cast here is needed to drop the const qualifier */ 1177 - 1178 - kmem_cache_free(rbd_segment_name_cache, (void *)name); 1179 1166 } 1180 1167 1181 1168 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) ··· 1370 1371 struct rbd_device *rbd_dev; 1371 1372 1372 1373 rbd_dev = obj_request->img_request->rbd_dev; 1373 - rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 1374 + rbd_warn(rbd_dev, "obj_request %p already marked img_data", 1374 1375 obj_request); 1375 1376 } 1376 1377 } ··· 1388 1389 1389 1390 if (obj_request_img_data_test(obj_request)) 1390 1391 rbd_dev = obj_request->img_request->rbd_dev; 1391 - rbd_warn(rbd_dev, "obj_request %p already marked done\n", 1392 + rbd_warn(rbd_dev, "obj_request %p already marked done", 1392 1393 obj_request); 1393 1394 } 1394 1395 } ··· 1526 1527 static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1527 1528 struct rbd_obj_request *obj_request) 1528 1529 { 1529 - dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 1530 - 1530 + dout("%s %p\n", __func__, obj_request); 1531 1531 return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1532 + } 1533 + 1534 + static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 1535 + { 1536 + dout("%s %p\n", __func__, obj_request); 1537 + ceph_osdc_cancel_request(obj_request->osd_req); 1538 + } 1539 + 1540 + /* 1541 + * Wait for an object request to complete. If interrupted, cancel the 1542 + * underlying osd request. 1543 + */ 1544 + static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1545 + { 1546 + int ret; 1547 + 1548 + dout("%s %p\n", __func__, obj_request); 1549 + 1550 + ret = wait_for_completion_interruptible(&obj_request->completion); 1551 + if (ret < 0) { 1552 + dout("%s %p interrupted\n", __func__, obj_request); 1553 + rbd_obj_request_end(obj_request); 1554 + return ret; 1555 + } 1556 + 1557 + dout("%s %p done\n", __func__, obj_request); 1558 + return 0; 1532 1559 } 1533 1560 1534 1561 static void rbd_img_request_complete(struct rbd_img_request *img_request) ··· 1581 1556 img_request->callback(img_request); 1582 1557 else 1583 1558 rbd_img_request_put(img_request); 1584 - } 1585 - 1586 - /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1587 - 1588 - static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1589 - { 1590 - dout("%s: obj %p\n", __func__, obj_request); 1591 - 1592 - return wait_for_completion_interruptible(&obj_request->completion); 1593 1559 } 1594 1560 1595 1561 /* ··· 1779 1763 rbd_osd_trivial_callback(obj_request); 1780 1764 break; 1781 1765 default: 1782 - rbd_warn(NULL, "%s: unsupported op %hu\n", 1766 + rbd_warn(NULL, "%s: unsupported op %hu", 1783 1767 obj_request->object_name, (unsigned short) opcode); 1784 1768 break; 1785 1769 } ··· 2014 1998 if (!counter) 2015 1999 rbd_dev_unparent(rbd_dev); 2016 2000 else 2017 - rbd_warn(rbd_dev, "parent reference underflow\n"); 2001 + rbd_warn(rbd_dev, "parent reference underflow"); 2018 2002 } 2019 2003 2020 2004 /* ··· 2044 2028 /* Image was flattened, but parent is not yet torn down */ 2045 2029 2046 2030 if (counter < 0) 2047 - rbd_warn(rbd_dev, "parent reference overflow\n"); 2031 + rbd_warn(rbd_dev, "parent reference overflow"); 2048 2032 2049 2033 return false; 2050 2034 } ··· 2061 2045 { 2062 2046 struct rbd_img_request *img_request; 2063 2047 2064 - img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); 2048 + img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2065 2049 if (!img_request) 2066 2050 return NULL; 2067 2051 ··· 2177 2161 if (result) { 2178 2162 struct rbd_device *rbd_dev = img_request->rbd_dev; 2179 2163 2180 - rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 2164 + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 2181 2165 img_request_write_test(img_request) ? "write" : "read", 2182 2166 obj_request->length, obj_request->img_offset, 2183 2167 obj_request->offset); 2184 - rbd_warn(rbd_dev, " result %d xferred %x\n", 2168 + rbd_warn(rbd_dev, " result %d xferred %x", 2185 2169 result, xferred); 2186 2170 if (!img_request->result) 2187 2171 img_request->result = result; ··· 2962 2946 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2963 2947 rbd_dev->header_name, (unsigned long long)notify_id, 2964 2948 (unsigned int)opcode); 2949 + 2950 + /* 2951 + * Until adequate refresh error handling is in place, there is 2952 + * not much we can do here, except warn. 2953 + * 2954 + * See http://tracker.ceph.com/issues/5040 2955 + */ 2965 2956 ret = rbd_dev_refresh(rbd_dev); 2966 2957 if (ret) 2967 - rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); 2958 + rbd_warn(rbd_dev, "refresh failed: %d", ret); 2968 2959 2969 - rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2960 + ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2961 + if (ret) 2962 + rbd_warn(rbd_dev, "notify_ack ret %d", ret); 2963 + } 2964 + 2965 + /* 2966 + * Send a (un)watch request and wait for the ack. Return a request 2967 + * with a ref held on success or error. 2968 + */ 2969 + static struct rbd_obj_request *rbd_obj_watch_request_helper( 2970 + struct rbd_device *rbd_dev, 2971 + bool watch) 2972 + { 2973 + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2974 + struct rbd_obj_request *obj_request; 2975 + int ret; 2976 + 2977 + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2978 + OBJ_REQUEST_NODATA); 2979 + if (!obj_request) 2980 + return ERR_PTR(-ENOMEM); 2981 + 2982 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2983 + obj_request); 2984 + if (!obj_request->osd_req) { 2985 + ret = -ENOMEM; 2986 + goto out; 2987 + } 2988 + 2989 + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2990 + rbd_dev->watch_event->cookie, 0, watch); 2991 + rbd_osd_req_format_write(obj_request); 2992 + 2993 + if (watch) 2994 + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 2995 + 2996 + ret = rbd_obj_request_submit(osdc, obj_request); 2997 + if (ret) 2998 + goto out; 2999 + 3000 + ret = rbd_obj_request_wait(obj_request); 3001 + if (ret) 3002 + goto out; 3003 + 3004 + ret = obj_request->result; 3005 + if (ret) { 3006 + if (watch) 3007 + rbd_obj_request_end(obj_request); 3008 + goto out; 3009 + } 3010 + 3011 + return obj_request; 3012 + 3013 + out: 3014 + rbd_obj_request_put(obj_request); 3015 + return ERR_PTR(ret); 2970 3016 } 2971 3017 2972 3018 /* ··· 3048 2970 if (ret < 0) 3049 2971 return ret; 3050 2972 3051 - rbd_assert(rbd_dev->watch_event); 3052 - 3053 - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3054 - OBJ_REQUEST_NODATA); 3055 - if (!obj_request) { 3056 - ret = -ENOMEM; 3057 - goto out_cancel; 2973 + obj_request = rbd_obj_watch_request_helper(rbd_dev, true); 2974 + if (IS_ERR(obj_request)) { 2975 + ceph_osdc_cancel_event(rbd_dev->watch_event); 2976 + rbd_dev->watch_event = NULL; 2977 + return PTR_ERR(obj_request); 3058 2978 } 3059 - 3060 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 3061 - obj_request); 3062 - if (!obj_request->osd_req) { 3063 - ret = -ENOMEM; 3064 - goto out_put; 3065 - } 3066 - 3067 - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 3068 - 3069 - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3070 - rbd_dev->watch_event->cookie, 0, 1); 3071 - rbd_osd_req_format_write(obj_request); 3072 - 3073 - ret = rbd_obj_request_submit(osdc, obj_request); 3074 - if (ret) 3075 - goto out_linger; 3076 - 3077 - ret = rbd_obj_request_wait(obj_request); 3078 - if (ret) 3079 - goto out_linger; 3080 - 3081 - ret = obj_request->result; 3082 - if (ret) 3083 - goto out_linger; 3084 2979 3085 2980 /* 3086 2981 * A watch request is set to linger, so the underlying osd 3087 2982 * request won't go away until we unregister it. We retain 3088 2983 * a pointer to the object request during that time (in 3089 - * rbd_dev->watch_request), so we'll keep a reference to 3090 - * it. We'll drop that reference (below) after we've 3091 - * unregistered it. 2984 + * rbd_dev->watch_request), so we'll keep a reference to it. 2985 + * We'll drop that reference after we've unregistered it in 2986 + * rbd_dev_header_unwatch_sync(). 3092 2987 */ 3093 2988 rbd_dev->watch_request = obj_request; 3094 2989 3095 2990 return 0; 3096 - 3097 - out_linger: 3098 - ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); 3099 - out_put: 3100 - rbd_obj_request_put(obj_request); 3101 - out_cancel: 3102 - ceph_osdc_cancel_event(rbd_dev->watch_event); 3103 - rbd_dev->watch_event = NULL; 3104 - 3105 - return ret; 3106 2991 } 3107 2992 3108 2993 /* 3109 2994 * Tear down a watch request, synchronously. 3110 2995 */ 3111 - static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 2996 + static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3112 2997 { 3113 - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3114 2998 struct rbd_obj_request *obj_request; 3115 - int ret; 3116 2999 3117 3000 rbd_assert(rbd_dev->watch_event); 3118 3001 rbd_assert(rbd_dev->watch_request); 3119 3002 3120 - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3121 - OBJ_REQUEST_NODATA); 3122 - if (!obj_request) { 3123 - ret = -ENOMEM; 3124 - goto out_cancel; 3125 - } 3126 - 3127 - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 3128 - obj_request); 3129 - if (!obj_request->osd_req) { 3130 - ret = -ENOMEM; 3131 - goto out_put; 3132 - } 3133 - 3134 - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3135 - rbd_dev->watch_event->cookie, 0, 0); 3136 - rbd_osd_req_format_write(obj_request); 3137 - 3138 - ret = rbd_obj_request_submit(osdc, obj_request); 3139 - if (ret) 3140 - goto out_put; 3141 - 3142 - ret = rbd_obj_request_wait(obj_request); 3143 - if (ret) 3144 - goto out_put; 3145 - 3146 - ret = obj_request->result; 3147 - if (ret) 3148 - goto out_put; 3149 - 3150 - /* We have successfully torn down the watch request */ 3151 - 3152 - ceph_osdc_unregister_linger_request(osdc, 3153 - rbd_dev->watch_request->osd_req); 3003 + rbd_obj_request_end(rbd_dev->watch_request); 3154 3004 rbd_obj_request_put(rbd_dev->watch_request); 3155 3005 rbd_dev->watch_request = NULL; 3156 3006 3157 - out_put: 3158 - rbd_obj_request_put(obj_request); 3159 - out_cancel: 3007 + obj_request = rbd_obj_watch_request_helper(rbd_dev, false); 3008 + if (!IS_ERR(obj_request)) 3009 + rbd_obj_request_put(obj_request); 3010 + else 3011 + rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", 3012 + PTR_ERR(obj_request)); 3013 + 3160 3014 ceph_osdc_cancel_event(rbd_dev->watch_event); 3161 3015 rbd_dev->watch_event = NULL; 3162 - 3163 - return ret; 3164 - } 3165 - 3166 - static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3167 - { 3168 - int ret; 3169 - 3170 - ret = __rbd_dev_header_unwatch_sync(rbd_dev); 3171 - if (ret) { 3172 - rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 3173 - ret); 3174 - } 3175 3016 } 3176 3017 3177 3018 /* ··· 3180 3183 return ret; 3181 3184 } 3182 3185 3186 + static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) 3187 + { 3188 + struct rbd_img_request *img_request; 3189 + u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3190 + u64 length = blk_rq_bytes(rq); 3191 + bool wr = rq_data_dir(rq) == WRITE; 3192 + int result; 3193 + 3194 + /* Ignore/skip any zero-length requests */ 3195 + 3196 + if (!length) { 3197 + dout("%s: zero-length request\n", __func__); 3198 + result = 0; 3199 + goto err_rq; 3200 + } 3201 + 3202 + /* Disallow writes to a read-only device */ 3203 + 3204 + if (wr) { 3205 + if (rbd_dev->mapping.read_only) { 3206 + result = -EROFS; 3207 + goto err_rq; 3208 + } 3209 + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3210 + } 3211 + 3212 + /* 3213 + * Quit early if the mapped snapshot no longer exists. It's 3214 + * still possible the snapshot will have disappeared by the 3215 + * time our request arrives at the osd, but there's no sense in 3216 + * sending it if we already know. 3217 + */ 3218 + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3219 + dout("request for non-existent snapshot"); 3220 + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3221 + result = -ENXIO; 3222 + goto err_rq; 3223 + } 3224 + 3225 + if (offset && length > U64_MAX - offset + 1) { 3226 + rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3227 + length); 3228 + result = -EINVAL; 3229 + goto err_rq; /* Shouldn't happen */ 3230 + } 3231 + 3232 + if (offset + length > rbd_dev->mapping.size) { 3233 + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 3234 + length, rbd_dev->mapping.size); 3235 + result = -EIO; 3236 + goto err_rq; 3237 + } 3238 + 3239 + img_request = rbd_img_request_create(rbd_dev, offset, length, wr); 3240 + if (!img_request) { 3241 + result = -ENOMEM; 3242 + goto err_rq; 3243 + } 3244 + img_request->rq = rq; 3245 + 3246 + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); 3247 + if (result) 3248 + goto err_img_request; 3249 + 3250 + result = rbd_img_request_submit(img_request); 3251 + if (result) 3252 + goto err_img_request; 3253 + 3254 + return; 3255 + 3256 + err_img_request: 3257 + rbd_img_request_put(img_request); 3258 + err_rq: 3259 + if (result) 3260 + rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3261 + wr ? "write" : "read", length, offset, result); 3262 + blk_end_request_all(rq, result); 3263 + } 3264 + 3265 + static void rbd_request_workfn(struct work_struct *work) 3266 + { 3267 + struct rbd_device *rbd_dev = 3268 + container_of(work, struct rbd_device, rq_work); 3269 + struct request *rq, *next; 3270 + LIST_HEAD(requests); 3271 + 3272 + spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ 3273 + list_splice_init(&rbd_dev->rq_queue, &requests); 3274 + spin_unlock_irq(&rbd_dev->lock); 3275 + 3276 + list_for_each_entry_safe(rq, next, &requests, queuelist) { 3277 + list_del_init(&rq->queuelist); 3278 + rbd_handle_request(rbd_dev, rq); 3279 + } 3280 + } 3281 + 3282 + /* 3283 + * Called with q->queue_lock held and interrupts disabled, possibly on 3284 + * the way to schedule(). Do not sleep here! 3285 + */ 3183 3286 static void rbd_request_fn(struct request_queue *q) 3184 - __releases(q->queue_lock) __acquires(q->queue_lock) 3185 3287 { 3186 3288 struct rbd_device *rbd_dev = q->queuedata; 3187 3289 struct request *rq; 3188 - int result; 3290 + int queued = 0; 3291 + 3292 + rbd_assert(rbd_dev); 3189 3293 3190 3294 while ((rq = blk_fetch_request(q))) { 3191 - bool write_request = rq_data_dir(rq) == WRITE; 3192 - struct rbd_img_request *img_request; 3193 - u64 offset; 3194 - u64 length; 3195 - 3196 3295 /* Ignore any non-FS requests that filter through. */ 3197 - 3198 3296 if (rq->cmd_type != REQ_TYPE_FS) { 3199 3297 dout("%s: non-fs request type %d\n", __func__, 3200 3298 (int) rq->cmd_type); ··· 3297 3205 continue; 3298 3206 } 3299 3207 3300 - /* Ignore/skip any zero-length requests */ 3301 - 3302 - offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 3303 - length = (u64) blk_rq_bytes(rq); 3304 - 3305 - if (!length) { 3306 - dout("%s: zero-length request\n", __func__); 3307 - __blk_end_request_all(rq, 0); 3308 - continue; 3309 - } 3310 - 3311 - spin_unlock_irq(q->queue_lock); 3312 - 3313 - /* Disallow writes to a read-only device */ 3314 - 3315 - if (write_request) { 3316 - result = -EROFS; 3317 - if (rbd_dev->mapping.read_only) 3318 - goto end_request; 3319 - rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3320 - } 3321 - 3322 - /* 3323 - * Quit early if the mapped snapshot no longer 3324 - * exists. It's still possible the snapshot will 3325 - * have disappeared by the time our request arrives 3326 - * at the osd, but there's no sense in sending it if 3327 - * we already know. 3328 - */ 3329 - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3330 - dout("request for non-existent snapshot"); 3331 - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3332 - result = -ENXIO; 3333 - goto end_request; 3334 - } 3335 - 3336 - result = -EINVAL; 3337 - if (offset && length > U64_MAX - offset + 1) { 3338 - rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", 3339 - offset, length); 3340 - goto end_request; /* Shouldn't happen */ 3341 - } 3342 - 3343 - result = -EIO; 3344 - if (offset + length > rbd_dev->mapping.size) { 3345 - rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 3346 - offset, length, rbd_dev->mapping.size); 3347 - goto end_request; 3348 - } 3349 - 3350 - result = -ENOMEM; 3351 - img_request = rbd_img_request_create(rbd_dev, offset, length, 3352 - write_request); 3353 - if (!img_request) 3354 - goto end_request; 3355 - 3356 - img_request->rq = rq; 3357 - 3358 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3359 - rq->bio); 3360 - if (!result) 3361 - result = rbd_img_request_submit(img_request); 3362 - if (result) 3363 - rbd_img_request_put(img_request); 3364 - end_request: 3365 - spin_lock_irq(q->queue_lock); 3366 - if (result < 0) { 3367 - rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 3368 - write_request ? "write" : "read", 3369 - length, offset, result); 3370 - 3371 - __blk_end_request_all(rq, result); 3372 - } 3208 + list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); 3209 + queued++; 3373 3210 } 3211 + 3212 + if (queued) 3213 + queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work); 3374 3214 } 3375 3215 3376 3216 /* ··· 3541 3517 u64 mapping_size; 3542 3518 int ret; 3543 3519 3544 - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3545 3520 down_write(&rbd_dev->header_rwsem); 3546 3521 mapping_size = rbd_dev->mapping.size; 3547 - if (rbd_dev->image_format == 1) 3548 - ret = rbd_dev_v1_header_info(rbd_dev); 3549 - else 3550 - ret = rbd_dev_v2_header_info(rbd_dev); 3551 3522 3552 - /* If it's a mapped snapshot, validate its EXISTS flag */ 3523 + ret = rbd_dev_header_info(rbd_dev); 3524 + if (ret) 3525 + return ret; 3553 3526 3554 - rbd_exists_validate(rbd_dev); 3555 - up_write(&rbd_dev->header_rwsem); 3556 - 3557 - if (mapping_size != rbd_dev->mapping.size) { 3558 - rbd_dev_update_size(rbd_dev); 3527 + /* 3528 + * If there is a parent, see if it has disappeared due to the 3529 + * mapped image getting flattened. 3530 + */ 3531 + if (rbd_dev->parent) { 3532 + ret = rbd_dev_v2_parent_info(rbd_dev); 3533 + if (ret) 3534 + return ret; 3559 3535 } 3560 3536 3561 - return ret; 3537 + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 3538 + if (rbd_dev->mapping.size != rbd_dev->header.image_size) 3539 + rbd_dev->mapping.size = rbd_dev->header.image_size; 3540 + } else { 3541 + /* validate mapped snapshot's EXISTS flag */ 3542 + rbd_exists_validate(rbd_dev); 3543 + } 3544 + 3545 + up_write(&rbd_dev->header_rwsem); 3546 + 3547 + if (mapping_size != rbd_dev->mapping.size) 3548 + rbd_dev_update_size(rbd_dev); 3549 + 3550 + return 0; 3562 3551 } 3563 3552 3564 3553 static int rbd_init_disk(struct rbd_device *rbd_dev) ··· 3733 3696 } 3734 3697 3735 3698 /* 3736 - * For an rbd v2 image, shows the pool id, image id, and snapshot id 3737 - * for the parent image. If there is no parent, simply shows 3738 - * "(no parent image)". 3699 + * For a v2 image, shows the chain of parent images, separated by empty 3700 + * lines. For v1 images or if there is no parent, shows "(no parent 3701 + * image)". 3739 3702 */ 3740 3703 static ssize_t rbd_parent_show(struct device *dev, 3741 - struct device_attribute *attr, 3742 - char *buf) 3704 + struct device_attribute *attr, 3705 + char *buf) 3743 3706 { 3744 3707 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3745 - struct rbd_spec *spec = rbd_dev->parent_spec; 3746 - int count; 3747 - char *bufp = buf; 3708 + ssize_t count = 0; 3748 3709 3749 - if (!spec) 3710 + if (!rbd_dev->parent) 3750 3711 return sprintf(buf, "(no parent image)\n"); 3751 3712 3752 - count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 3753 - (unsigned long long) spec->pool_id, spec->pool_name); 3754 - if (count < 0) 3755 - return count; 3756 - bufp += count; 3713 + for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3714 + struct rbd_spec *spec = rbd_dev->parent_spec; 3757 3715 3758 - count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 3759 - spec->image_name ? spec->image_name : "(unknown)"); 3760 - if (count < 0) 3761 - return count; 3762 - bufp += count; 3716 + count += sprintf(&buf[count], "%s" 3717 + "pool_id %llu\npool_name %s\n" 3718 + "image_id %s\nimage_name %s\n" 3719 + "snap_id %llu\nsnap_name %s\n" 3720 + "overlap %llu\n", 3721 + !count ? "" : "\n", /* first? */ 3722 + spec->pool_id, spec->pool_name, 3723 + spec->image_id, spec->image_name ?: "(unknown)", 3724 + spec->snap_id, spec->snap_name, 3725 + rbd_dev->parent_overlap); 3726 + } 3763 3727 3764 - count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 3765 - (unsigned long long) spec->snap_id, spec->snap_name); 3766 - if (count < 0) 3767 - return count; 3768 - bufp += count; 3769 - 3770 - count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 3771 - if (count < 0) 3772 - return count; 3773 - bufp += count; 3774 - 3775 - return (ssize_t) (bufp - buf); 3728 + return count; 3776 3729 } 3777 3730 3778 3731 static ssize_t rbd_image_refresh(struct device *dev, ··· 3775 3748 3776 3749 ret = rbd_dev_refresh(rbd_dev); 3777 3750 if (ret) 3778 - rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3751 + return ret; 3779 3752 3780 - return ret < 0 ? ret : size; 3753 + return size; 3781 3754 } 3782 3755 3783 3756 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); ··· 3849 3822 spec = kzalloc(sizeof (*spec), GFP_KERNEL); 3850 3823 if (!spec) 3851 3824 return NULL; 3825 + 3826 + spec->pool_id = CEPH_NOPOOL; 3827 + spec->snap_id = CEPH_NOSNAP; 3852 3828 kref_init(&spec->kref); 3853 3829 3854 3830 return spec; ··· 3878 3848 return NULL; 3879 3849 3880 3850 spin_lock_init(&rbd_dev->lock); 3851 + INIT_LIST_HEAD(&rbd_dev->rq_queue); 3852 + INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); 3881 3853 rbd_dev->flags = 0; 3882 3854 atomic_set(&rbd_dev->parent_ref, 0); 3883 3855 INIT_LIST_HEAD(&rbd_dev->node); ··· 4053 4021 goto out_err; 4054 4022 } 4055 4023 4056 - snapid = cpu_to_le64(CEPH_NOSNAP); 4024 + snapid = cpu_to_le64(rbd_dev->spec->snap_id); 4057 4025 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4058 4026 "rbd", "get_parent", 4059 4027 &snapid, sizeof (snapid), ··· 4091 4059 4092 4060 ret = -EIO; 4093 4061 if (pool_id > (u64)U32_MAX) { 4094 - rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 4062 + rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4095 4063 (unsigned long long)pool_id, U32_MAX); 4096 4064 goto out_err; 4097 4065 } ··· 4115 4083 parent_spec->snap_id = snap_id; 4116 4084 rbd_dev->parent_spec = parent_spec; 4117 4085 parent_spec = NULL; /* rbd_dev now owns this */ 4086 + } else { 4087 + kfree(image_id); 4118 4088 } 4119 4089 4120 4090 /* ··· 4144 4110 * overlap is zero we just pretend there was 4145 4111 * no parent image. 4146 4112 */ 4147 - rbd_warn(rbd_dev, "ignoring parent of " 4148 - "clone with overlap 0\n"); 4113 + rbd_warn(rbd_dev, "ignoring parent with overlap 0"); 4149 4114 } 4150 4115 } 4151 4116 out: ··· 4312 4279 } 4313 4280 4314 4281 /* 4315 - * When an rbd image has a parent image, it is identified by the 4316 - * pool, image, and snapshot ids (not names). This function fills 4317 - * in the names for those ids. (It's OK if we can't figure out the 4318 - * name for an image id, but the pool and snapshot ids should always 4319 - * exist and have names.) All names in an rbd spec are dynamically 4320 - * allocated. 4321 - * 4322 - * When an image being mapped (not a parent) is probed, we have the 4323 - * pool name and pool id, image name and image id, and the snapshot 4324 - * name. The only thing we're missing is the snapshot id. 4282 + * An image being mapped will have everything but the snap id. 4325 4283 */ 4326 - static int rbd_dev_spec_update(struct rbd_device *rbd_dev) 4284 + static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 4285 + { 4286 + struct rbd_spec *spec = rbd_dev->spec; 4287 + 4288 + rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 4289 + rbd_assert(spec->image_id && spec->image_name); 4290 + rbd_assert(spec->snap_name); 4291 + 4292 + if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 4293 + u64 snap_id; 4294 + 4295 + snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 4296 + if (snap_id == CEPH_NOSNAP) 4297 + return -ENOENT; 4298 + 4299 + spec->snap_id = snap_id; 4300 + } else { 4301 + spec->snap_id = CEPH_NOSNAP; 4302 + } 4303 + 4304 + return 0; 4305 + } 4306 + 4307 + /* 4308 + * A parent image will have all ids but none of the names. 4309 + * 4310 + * All names in an rbd spec are dynamically allocated. It's OK if we 4311 + * can't figure out the name for an image id. 4312 + */ 4313 + static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 4327 4314 { 4328 4315 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4329 4316 struct rbd_spec *spec = rbd_dev->spec; ··· 4352 4299 const char *snap_name; 4353 4300 int ret; 4354 4301 4355 - /* 4356 - * An image being mapped will have the pool name (etc.), but 4357 - * we need to look up the snapshot id. 4358 - */ 4359 - if (spec->pool_name) { 4360 - if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 4361 - u64 snap_id; 4362 - 4363 - snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 4364 - if (snap_id == CEPH_NOSNAP) 4365 - return -ENOENT; 4366 - spec->snap_id = snap_id; 4367 - } else { 4368 - spec->snap_id = CEPH_NOSNAP; 4369 - } 4370 - 4371 - return 0; 4372 - } 4302 + rbd_assert(spec->pool_id != CEPH_NOPOOL); 4303 + rbd_assert(spec->image_id); 4304 + rbd_assert(spec->snap_id != CEPH_NOSNAP); 4373 4305 4374 4306 /* Get the pool name; we have to make our own copy of this */ 4375 4307 ··· 4373 4335 if (!image_name) 4374 4336 rbd_warn(rbd_dev, "unable to get image name"); 4375 4337 4376 - /* Look up the snapshot name, and make a copy */ 4338 + /* Fetch the snapshot name */ 4377 4339 4378 4340 snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4379 4341 if (IS_ERR(snap_name)) { ··· 4386 4348 spec->snap_name = snap_name; 4387 4349 4388 4350 return 0; 4351 + 4389 4352 out_err: 4390 4353 kfree(image_name); 4391 4354 kfree(pool_name); 4392 - 4393 4355 return ret; 4394 4356 } 4395 4357 ··· 4521 4483 return ret; 4522 4484 } 4523 4485 4524 - /* 4525 - * If the image supports layering, get the parent info. We 4526 - * need to probe the first time regardless. Thereafter we 4527 - * only need to if there's a parent, to see if it has 4528 - * disappeared due to the mapped image getting flattened. 4529 - */ 4530 - if (rbd_dev->header.features & RBD_FEATURE_LAYERING && 4531 - (first_time || rbd_dev->parent_spec)) { 4532 - bool warn; 4533 - 4534 - ret = rbd_dev_v2_parent_info(rbd_dev); 4535 - if (ret) 4536 - return ret; 4537 - 4538 - /* 4539 - * Print a warning if this is the initial probe and 4540 - * the image has a parent. Don't print it if the 4541 - * image now being probed is itself a parent. We 4542 - * can tell at this point because we won't know its 4543 - * pool name yet (just its pool id). 4544 - */ 4545 - warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; 4546 - if (first_time && warn) 4547 - rbd_warn(rbd_dev, "WARNING: kernel layering " 4548 - "is EXPERIMENTAL!"); 4549 - } 4550 - 4551 - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 4552 - if (rbd_dev->mapping.size != rbd_dev->header.image_size) 4553 - rbd_dev->mapping.size = rbd_dev->header.image_size; 4554 - 4555 4486 ret = rbd_dev_v2_snap_context(rbd_dev); 4556 4487 dout("rbd_dev_v2_snap_context returned %d\n", ret); 4557 4488 4558 4489 return ret; 4490 + } 4491 + 4492 + static int rbd_dev_header_info(struct rbd_device *rbd_dev) 4493 + { 4494 + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4495 + 4496 + if (rbd_dev->image_format == 1) 4497 + return rbd_dev_v1_header_info(rbd_dev); 4498 + 4499 + return rbd_dev_v2_header_info(rbd_dev); 4559 4500 } 4560 4501 4561 4502 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) ··· 5083 5066 ret = rbd_dev_mapping_set(rbd_dev); 5084 5067 if (ret) 5085 5068 goto err_out_disk; 5069 + 5086 5070 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 5087 5071 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5088 5072 5073 + rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0); 5074 + if (!rbd_dev->rq_wq) 5075 + goto err_out_mapping; 5076 + 5089 5077 ret = rbd_bus_add_dev(rbd_dev); 5090 5078 if (ret) 5091 - goto err_out_mapping; 5079 + goto err_out_workqueue; 5092 5080 5093 5081 /* Everything's ready. Announce the disk to the world. */ 5094 5082 ··· 5105 5083 5106 5084 return ret; 5107 5085 5086 + err_out_workqueue: 5087 + destroy_workqueue(rbd_dev->rq_wq); 5088 + rbd_dev->rq_wq = NULL; 5108 5089 err_out_mapping: 5109 5090 rbd_dev_mapping_clear(rbd_dev); 5110 5091 err_out_disk: ··· 5180 5155 ret = rbd_dev_image_id(rbd_dev); 5181 5156 if (ret) 5182 5157 return ret; 5183 - rbd_assert(rbd_dev->spec->image_id); 5184 - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5185 5158 5186 5159 ret = rbd_dev_header_name(rbd_dev); 5187 5160 if (ret) ··· 5191 5168 goto out_header_name; 5192 5169 } 5193 5170 5194 - if (rbd_dev->image_format == 1) 5195 - ret = rbd_dev_v1_header_info(rbd_dev); 5196 - else 5197 - ret = rbd_dev_v2_header_info(rbd_dev); 5171 + ret = rbd_dev_header_info(rbd_dev); 5198 5172 if (ret) 5199 5173 goto err_out_watch; 5200 5174 5201 - ret = rbd_dev_spec_update(rbd_dev); 5175 + /* 5176 + * If this image is the one being mapped, we have pool name and 5177 + * id, image name and id, and snap name - need to fill snap id. 5178 + * Otherwise this is a parent image, identified by pool, image 5179 + * and snap ids - need to fill in names for those ids. 5180 + */ 5181 + if (mapping) 5182 + ret = rbd_spec_fill_snap_id(rbd_dev); 5183 + else 5184 + ret = rbd_spec_fill_names(rbd_dev); 5202 5185 if (ret) 5203 5186 goto err_out_probe; 5187 + 5188 + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5189 + ret = rbd_dev_v2_parent_info(rbd_dev); 5190 + if (ret) 5191 + goto err_out_probe; 5192 + 5193 + /* 5194 + * Need to warn users if this image is the one being 5195 + * mapped and has a parent. 5196 + */ 5197 + if (mapping && rbd_dev->parent_spec) 5198 + rbd_warn(rbd_dev, 5199 + "WARNING: kernel layering is EXPERIMENTAL!"); 5200 + } 5204 5201 5205 5202 ret = rbd_dev_probe_parent(rbd_dev); 5206 5203 if (ret) ··· 5228 5185 5229 5186 dout("discovered format %u image, header name is %s\n", 5230 5187 rbd_dev->image_format, rbd_dev->header_name); 5231 - 5232 5188 return 0; 5189 + 5233 5190 err_out_probe: 5234 5191 rbd_dev_unprobe(rbd_dev); 5235 5192 err_out_watch: ··· 5242 5199 rbd_dev->image_format = 0; 5243 5200 kfree(rbd_dev->spec->image_id); 5244 5201 rbd_dev->spec->image_id = NULL; 5245 - 5246 - dout("probe failed, returning %d\n", ret); 5247 - 5248 5202 return ret; 5249 5203 } 5250 5204 ··· 5283 5243 /* The ceph file layout needs to fit pool id in 32 bits */ 5284 5244 5285 5245 if (spec->pool_id > (u64)U32_MAX) { 5286 - rbd_warn(NULL, "pool id too large (%llu > %u)\n", 5246 + rbd_warn(NULL, "pool id too large (%llu > %u)", 5287 5247 (unsigned long long)spec->pool_id, U32_MAX); 5288 5248 rc = -EIO; 5289 5249 goto err_out_client; ··· 5354 5314 { 5355 5315 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5356 5316 5317 + destroy_workqueue(rbd_dev->rq_wq); 5357 5318 rbd_free_disk(rbd_dev); 5358 5319 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5359 5320 rbd_dev_mapping_clear(rbd_dev);

+12 -2

fs/ceph/acl.c

··· 172 172 int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 173 173 { 174 174 struct posix_acl *default_acl, *acl; 175 + umode_t new_mode = inode->i_mode; 175 176 int error; 176 177 177 - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); 178 + error = posix_acl_create(dir, &new_mode, &default_acl, &acl); 178 179 if (error) 179 180 return error; 180 181 181 - if (!default_acl && !acl) 182 + if (!default_acl && !acl) { 182 183 cache_no_acl(inode); 184 + if (new_mode != inode->i_mode) { 185 + struct iattr newattrs = { 186 + .ia_mode = new_mode, 187 + .ia_valid = ATTR_MODE, 188 + }; 189 + error = ceph_setattr(dentry, &newattrs); 190 + } 191 + return error; 192 + } 183 193 184 194 if (default_acl) { 185 195 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);

+1 -1

fs/ceph/caps.c

··· 3277 3277 rel->ino = cpu_to_le64(ceph_ino(inode)); 3278 3278 rel->cap_id = cpu_to_le64(cap->cap_id); 3279 3279 rel->seq = cpu_to_le32(cap->seq); 3280 - rel->issue_seq = cpu_to_le32(cap->issue_seq), 3280 + rel->issue_seq = cpu_to_le32(cap->issue_seq); 3281 3281 rel->mseq = cpu_to_le32(cap->mseq); 3282 3282 rel->caps = cpu_to_le32(cap->implemented); 3283 3283 rel->wanted = cpu_to_le32(cap->mds_wanted);

+16 -8

fs/ceph/file.c

··· 423 423 dout("sync_read on file %p %llu~%u %s\n", file, off, 424 424 (unsigned)len, 425 425 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 426 + 427 + if (!len) 428 + return 0; 426 429 /* 427 430 * flush any page cache pages in this range. this 428 431 * will make concurrent normal and sync io slow, ··· 473 470 size_t left = ret; 474 471 475 472 while (left) { 476 - int copy = min_t(size_t, PAGE_SIZE, left); 477 - l = copy_page_to_iter(pages[k++], 0, copy, i); 473 + size_t page_off = off & ~PAGE_MASK; 474 + size_t copy = min_t(size_t, 475 + PAGE_SIZE - page_off, left); 476 + l = copy_page_to_iter(pages[k++], page_off, 477 + copy, i); 478 478 off += l; 479 479 left -= l; 480 480 if (l < copy) ··· 537 531 * objects, rollback on failure, etc.) 538 532 */ 539 533 static ssize_t 540 - ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) 534 + ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) 541 535 { 542 536 struct file *file = iocb->ki_filp; 543 537 struct inode *inode = file_inode(file); ··· 553 547 int check_caps = 0; 554 548 int ret; 555 549 struct timespec mtime = CURRENT_TIME; 556 - loff_t pos = iocb->ki_pos; 557 550 size_t count = iov_iter_count(from); 558 551 559 552 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) ··· 651 646 * correct atomic write, we should e.g. take write locks on all 652 647 * objects, rollback on failure, etc.) 653 648 */ 654 - static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) 649 + static ssize_t 650 + ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) 655 651 { 656 652 struct file *file = iocb->ki_filp; 657 653 struct inode *inode = file_inode(file); ··· 669 663 int check_caps = 0; 670 664 int ret; 671 665 struct timespec mtime = CURRENT_TIME; 672 - loff_t pos = iocb->ki_pos; 673 666 size_t count = iov_iter_count(from); 674 667 675 668 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) ··· 923 918 /* we might need to revert back to that point */ 924 919 data = *from; 925 920 if (file->f_flags & O_DIRECT) 926 - written = ceph_sync_direct_write(iocb, &data); 921 + written = ceph_sync_direct_write(iocb, &data, pos); 927 922 else 928 - written = ceph_sync_write(iocb, &data); 923 + written = ceph_sync_write(iocb, &data, pos); 929 924 if (written == -EOLDSNAPC) { 930 925 dout("aio_write %p %llx.%llx %llu~%u" 931 926 "got EOLDSNAPC, retrying\n", ··· 1181 1176 int ret = 0; 1182 1177 loff_t endoff = 0; 1183 1178 loff_t size; 1179 + 1180 + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 1181 + return -EOPNOTSUPP; 1184 1182 1185 1183 if (!S_ISREG(inode->i_mode)) 1186 1184 return -EOPNOTSUPP;

+12 -4

fs/ceph/mds_client.c

··· 1904 1904 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1905 1905 1906 1906 if (req->r_got_unsafe) { 1907 + void *p; 1907 1908 /* 1908 1909 * Replay. Do not regenerate message (and rebuild 1909 1910 * paths, etc.); just use the original message. ··· 1925 1924 1926 1925 /* remove cap/dentry releases from message */ 1927 1926 rhead->num_releases = 0; 1928 - msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); 1929 - msg->front.iov_len = req->r_request_release_offset; 1927 + 1928 + /* time stamp */ 1929 + p = msg->front.iov_base + req->r_request_release_offset; 1930 + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1931 + 1932 + msg->front.iov_len = p - msg->front.iov_base; 1933 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1930 1934 return 0; 1931 1935 } 1932 1936 ··· 2067 2061 static void kick_requests(struct ceph_mds_client *mdsc, int mds) 2068 2062 { 2069 2063 struct ceph_mds_request *req; 2070 - struct rb_node *p; 2064 + struct rb_node *p = rb_first(&mdsc->request_tree); 2071 2065 2072 2066 dout("kick_requests mds%d\n", mds); 2073 - for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { 2067 + while (p) { 2074 2068 req = rb_entry(p, struct ceph_mds_request, r_node); 2069 + p = rb_next(p); 2075 2070 if (req->r_got_unsafe) 2076 2071 continue; 2077 2072 if (req->r_session && ··· 2255 2248 */ 2256 2249 if (result == -ESTALE) { 2257 2250 dout("got ESTALE on request %llu", req->r_tid); 2251 + req->r_resend_mds = -1; 2258 2252 if (req->r_direct_mode != USE_AUTH_MDS) { 2259 2253 dout("not using auth, setting for that now"); 2260 2254 req->r_direct_mode = USE_AUTH_MDS;

+2 -2

fs/ceph/xattr.c

··· 592 592 xattr_version = ci->i_xattrs.version; 593 593 spin_unlock(&ci->i_ceph_lock); 594 594 595 - xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 595 + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), 596 596 GFP_NOFS); 597 597 err = -ENOMEM; 598 598 if (!xattrs) 599 599 goto bad_lock; 600 - memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 600 + 601 601 for (i = 0; i < numattr; i++) { 602 602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 603 603 GFP_NOFS);

+2 -12

include/linux/ceph/messenger.h

··· 285 285 286 286 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 287 287 bool can_fail); 288 - extern void ceph_msg_kfree(struct ceph_msg *m); 289 288 290 - 291 - static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 292 - { 293 - kref_get(&msg->kref); 294 - return msg; 295 - } 296 - extern void ceph_msg_last_put(struct kref *kref); 297 - static inline void ceph_msg_put(struct ceph_msg *msg) 298 - { 299 - kref_put(&msg->kref, ceph_msg_last_put); 300 - } 289 + extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg); 290 + extern void ceph_msg_put(struct ceph_msg *msg); 301 291 302 292 extern void ceph_msg_dump(struct ceph_msg *msg); 303 293

+4 -12

include/linux/ceph/osd_client.h

··· 117 117 struct list_head r_req_lru_item; 118 118 struct list_head r_osd_item; 119 119 struct list_head r_linger_item; 120 - struct list_head r_linger_osd; 120 + struct list_head r_linger_osd_item; 121 121 struct ceph_osd *r_osd; 122 122 struct ceph_pg r_pgid; 123 123 int r_pg_osds[CEPH_PG_MAX_SIZE]; ··· 325 325 326 326 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 327 327 struct ceph_osd_request *req); 328 - extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 329 - struct ceph_osd_request *req); 330 328 331 - static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 332 - { 333 - kref_get(&req->r_kref); 334 - } 335 - extern void ceph_osdc_release_request(struct kref *kref); 336 - static inline void ceph_osdc_put_request(struct ceph_osd_request *req) 337 - { 338 - kref_put(&req->r_kref, ceph_osdc_release_request); 339 - } 329 + extern void ceph_osdc_get_request(struct ceph_osd_request *req); 330 + extern void ceph_osdc_put_request(struct ceph_osd_request *req); 340 331 341 332 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 342 333 struct ceph_osd_request *req, 343 334 bool nofail); 335 + extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 344 336 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 345 337 struct ceph_osd_request *req); 346 338 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);

+33 -14

net/ceph/messenger.c

··· 174 174 #define SKIP_BUF_SIZE 1024 175 175 176 176 static void queue_con(struct ceph_connection *con); 177 + static void cancel_con(struct ceph_connection *con); 177 178 static void con_work(struct work_struct *); 178 179 static void con_fault(struct ceph_connection *con); 179 180 ··· 681 680 682 681 reset_connection(con); 683 682 con->peer_global_seq = 0; 684 - cancel_delayed_work(&con->work); 683 + cancel_con(con); 685 684 con_close_socket(con); 686 685 mutex_unlock(&con->mutex); 687 686 } ··· 901 900 BUG_ON(page_count > (int)USHRT_MAX); 902 901 cursor->page_count = (unsigned short)page_count; 903 902 BUG_ON(length > SIZE_MAX - cursor->page_offset); 904 - cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; 903 + cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; 905 904 } 906 905 907 906 static struct page * ··· 2668 2667 { 2669 2668 if (!con->ops->get(con)) { 2670 2669 dout("%s %p ref count 0\n", __func__, con); 2671 - 2672 2670 return -ENOENT; 2673 2671 } 2674 2672 2675 2673 if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { 2676 2674 dout("%s %p - already queued\n", __func__, con); 2677 2675 con->ops->put(con); 2678 - 2679 2676 return -EBUSY; 2680 2677 } 2681 2678 2682 2679 dout("%s %p %lu\n", __func__, con, delay); 2683 - 2684 2680 return 0; 2685 2681 } 2686 2682 2687 2683 static void queue_con(struct ceph_connection *con) 2688 2684 { 2689 2685 (void) queue_con_delay(con, 0); 2686 + } 2687 + 2688 + static void cancel_con(struct ceph_connection *con) 2689 + { 2690 + if (cancel_delayed_work(&con->work)) { 2691 + dout("%s %p\n", __func__, con); 2692 + con->ops->put(con); 2693 + } 2690 2694 } 2691 2695 2692 2696 static bool con_sock_closed(struct ceph_connection *con) ··· 3275 3269 /* 3276 3270 * Free a generically kmalloc'd message. 3277 3271 */ 3278 - void ceph_msg_kfree(struct ceph_msg *m) 3272 + static void ceph_msg_free(struct ceph_msg *m) 3279 3273 { 3280 - dout("msg_kfree %p\n", m); 3274 + dout("%s %p\n", __func__, m); 3281 3275 ceph_kvfree(m->front.iov_base); 3282 3276 kmem_cache_free(ceph_msg_cache, m); 3283 3277 } 3284 3278 3285 - /* 3286 - * Drop a msg ref. Destroy as needed. 3287 - */ 3288 - void ceph_msg_last_put(struct kref *kref) 3279 + static void ceph_msg_release(struct kref *kref) 3289 3280 { 3290 3281 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); 3291 3282 LIST_HEAD(data); 3292 3283 struct list_head *links; 3293 3284 struct list_head *next; 3294 3285 3295 - dout("ceph_msg_put last one on %p\n", m); 3286 + dout("%s %p\n", __func__, m); 3296 3287 WARN_ON(!list_empty(&m->list_head)); 3297 3288 3298 3289 /* drop middle, data, if any */ ··· 3311 3308 if (m->pool) 3312 3309 ceph_msgpool_put(m->pool, m); 3313 3310 else 3314 - ceph_msg_kfree(m); 3311 + ceph_msg_free(m); 3315 3312 } 3316 - EXPORT_SYMBOL(ceph_msg_last_put); 3313 + 3314 + struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 3315 + { 3316 + dout("%s %p (was %d)\n", __func__, msg, 3317 + atomic_read(&msg->kref.refcount)); 3318 + kref_get(&msg->kref); 3319 + return msg; 3320 + } 3321 + EXPORT_SYMBOL(ceph_msg_get); 3322 + 3323 + void ceph_msg_put(struct ceph_msg *msg) 3324 + { 3325 + dout("%s %p (was %d)\n", __func__, msg, 3326 + atomic_read(&msg->kref.refcount)); 3327 + kref_put(&msg->kref, ceph_msg_release); 3328 + } 3329 + EXPORT_SYMBOL(ceph_msg_put); 3317 3330 3318 3331 void ceph_msg_dump(struct ceph_msg *msg) 3319 3332 {

+89 -42

net/ceph/osd_client.c

··· 297 297 /* 298 298 * requests 299 299 */ 300 - void ceph_osdc_release_request(struct kref *kref) 300 + static void ceph_osdc_release_request(struct kref *kref) 301 301 { 302 - struct ceph_osd_request *req; 302 + struct ceph_osd_request *req = container_of(kref, 303 + struct ceph_osd_request, r_kref); 303 304 unsigned int which; 304 305 305 - req = container_of(kref, struct ceph_osd_request, r_kref); 306 + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 307 + req->r_request, req->r_reply); 308 + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 309 + WARN_ON(!list_empty(&req->r_req_lru_item)); 310 + WARN_ON(!list_empty(&req->r_osd_item)); 311 + WARN_ON(!list_empty(&req->r_linger_item)); 312 + WARN_ON(!list_empty(&req->r_linger_osd_item)); 313 + WARN_ON(req->r_osd); 314 + 306 315 if (req->r_request) 307 316 ceph_msg_put(req->r_request); 308 317 if (req->r_reply) { ··· 329 320 kmem_cache_free(ceph_osd_request_cache, req); 330 321 331 322 } 332 - EXPORT_SYMBOL(ceph_osdc_release_request); 323 + 324 + void ceph_osdc_get_request(struct ceph_osd_request *req) 325 + { 326 + dout("%s %p (was %d)\n", __func__, req, 327 + atomic_read(&req->r_kref.refcount)); 328 + kref_get(&req->r_kref); 329 + } 330 + EXPORT_SYMBOL(ceph_osdc_get_request); 331 + 332 + void ceph_osdc_put_request(struct ceph_osd_request *req) 333 + { 334 + dout("%s %p (was %d)\n", __func__, req, 335 + atomic_read(&req->r_kref.refcount)); 336 + kref_put(&req->r_kref, ceph_osdc_release_request); 337 + } 338 + EXPORT_SYMBOL(ceph_osdc_put_request); 333 339 334 340 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 335 341 struct ceph_snap_context *snapc, ··· 388 364 RB_CLEAR_NODE(&req->r_node); 389 365 INIT_LIST_HEAD(&req->r_unsafe_item); 390 366 INIT_LIST_HEAD(&req->r_linger_item); 391 - INIT_LIST_HEAD(&req->r_linger_osd); 367 + INIT_LIST_HEAD(&req->r_linger_osd_item); 392 368 INIT_LIST_HEAD(&req->r_req_lru_item); 393 369 INIT_LIST_HEAD(&req->r_osd_item); 394 370 ··· 940 916 * list at the end to keep things in tid order. 941 917 */ 942 918 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 943 - r_linger_osd) { 919 + r_linger_osd_item) { 944 920 /* 945 921 * reregister request prior to unregistering linger so 946 922 * that r_osd is preserved. ··· 1032 1008 { 1033 1009 dout("__remove_osd %p\n", osd); 1034 1010 BUG_ON(!list_empty(&osd->o_requests)); 1011 + BUG_ON(!list_empty(&osd->o_linger_requests)); 1012 + 1035 1013 rb_erase(&osd->o_node, &osdc->osds); 1036 1014 list_del_init(&osd->o_osd_lru); 1037 1015 ceph_con_close(&osd->o_con); ··· 1055 1029 static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1056 1030 struct ceph_osd *osd) 1057 1031 { 1058 - dout("__move_osd_to_lru %p\n", osd); 1032 + dout("%s %p\n", __func__, osd); 1059 1033 BUG_ON(!list_empty(&osd->o_osd_lru)); 1034 + 1060 1035 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1061 1036 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; 1037 + } 1038 + 1039 + static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1040 + struct ceph_osd *osd) 1041 + { 1042 + dout("%s %p\n", __func__, osd); 1043 + 1044 + if (list_empty(&osd->o_requests) && 1045 + list_empty(&osd->o_linger_requests)) 1046 + __move_osd_to_lru(osdc, osd); 1062 1047 } 1063 1048 1064 1049 static void __remove_osd_from_lru(struct ceph_osd *osd) ··· 1212 1175 1213 1176 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1214 1177 rb_erase(&req->r_node, &osdc->requests); 1178 + RB_CLEAR_NODE(&req->r_node); 1215 1179 osdc->num_requests--; 1216 1180 1217 1181 if (req->r_osd) { ··· 1220 1182 ceph_msg_revoke(req->r_request); 1221 1183 1222 1184 list_del_init(&req->r_osd_item); 1223 - if (list_empty(&req->r_osd->o_requests) && 1224 - list_empty(&req->r_osd->o_linger_requests)) { 1225 - dout("moving osd to %p lru\n", req->r_osd); 1226 - __move_osd_to_lru(osdc, req->r_osd); 1227 - } 1228 - if (list_empty(&req->r_linger_item)) 1185 + maybe_move_osd_to_lru(osdc, req->r_osd); 1186 + if (list_empty(&req->r_linger_osd_item)) 1229 1187 req->r_osd = NULL; 1230 1188 } 1231 1189 ··· 1248 1214 static void __register_linger_request(struct ceph_osd_client *osdc, 1249 1215 struct ceph_osd_request *req) 1250 1216 { 1251 - dout("__register_linger_request %p\n", req); 1217 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1218 + WARN_ON(!req->r_linger); 1219 + 1252 1220 ceph_osdc_get_request(req); 1253 1221 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1254 1222 if (req->r_osd) 1255 - list_add_tail(&req->r_linger_osd, 1223 + list_add_tail(&req->r_linger_osd_item, 1256 1224 &req->r_osd->o_linger_requests); 1257 1225 } 1258 1226 1259 1227 static void __unregister_linger_request(struct ceph_osd_client *osdc, 1260 1228 struct ceph_osd_request *req) 1261 1229 { 1262 - dout("__unregister_linger_request %p\n", req); 1263 - list_del_init(&req->r_linger_item); 1264 - if (req->r_osd) { 1265 - list_del_init(&req->r_linger_osd); 1230 + WARN_ON(!req->r_linger); 1266 1231 1267 - if (list_empty(&req->r_osd->o_requests) && 1268 - list_empty(&req->r_osd->o_linger_requests)) { 1269 - dout("moving osd to %p lru\n", req->r_osd); 1270 - __move_osd_to_lru(osdc, req->r_osd); 1271 - } 1232 + if (list_empty(&req->r_linger_item)) { 1233 + dout("%s %p tid %llu not registered\n", __func__, req, 1234 + req->r_tid); 1235 + return; 1236 + } 1237 + 1238 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1239 + list_del_init(&req->r_linger_item); 1240 + 1241 + if (req->r_osd) { 1242 + list_del_init(&req->r_linger_osd_item); 1243 + maybe_move_osd_to_lru(osdc, req->r_osd); 1272 1244 if (list_empty(&req->r_osd_item)) 1273 1245 req->r_osd = NULL; 1274 1246 } 1275 1247 ceph_osdc_put_request(req); 1276 1248 } 1277 - 1278 - void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 1279 - struct ceph_osd_request *req) 1280 - { 1281 - mutex_lock(&osdc->request_mutex); 1282 - if (req->r_linger) { 1283 - req->r_linger = 0; 1284 - __unregister_linger_request(osdc, req); 1285 - } 1286 - mutex_unlock(&osdc->request_mutex); 1287 - } 1288 - EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); 1289 1249 1290 1250 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1291 1251 struct ceph_osd_request *req) ··· 2458 2430 EXPORT_SYMBOL(ceph_osdc_start_request); 2459 2431 2460 2432 /* 2433 + * Unregister a registered request. The request is not completed (i.e. 2434 + * no callbacks or wakeups) - higher layers are supposed to know what 2435 + * they are canceling. 2436 + */ 2437 + void ceph_osdc_cancel_request(struct ceph_osd_request *req) 2438 + { 2439 + struct ceph_osd_client *osdc = req->r_osdc; 2440 + 2441 + mutex_lock(&osdc->request_mutex); 2442 + if (req->r_linger) 2443 + __unregister_linger_request(osdc, req); 2444 + __unregister_request(osdc, req); 2445 + mutex_unlock(&osdc->request_mutex); 2446 + 2447 + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 2448 + } 2449 + EXPORT_SYMBOL(ceph_osdc_cancel_request); 2450 + 2451 + /* 2461 2452 * wait for a request to complete 2462 2453 */ 2463 2454 int ceph_osdc_wait_request(struct ceph_osd_client *osdc, ··· 2484 2437 { 2485 2438 int rc; 2486 2439 2440 + dout("%s %p tid %llu\n", __func__, req, req->r_tid); 2441 + 2487 2442 rc = wait_for_completion_interruptible(&req->r_completion); 2488 2443 if (rc < 0) { 2489 - mutex_lock(&osdc->request_mutex); 2490 - __cancel_request(req); 2491 - __unregister_request(osdc, req); 2492 - mutex_unlock(&osdc->request_mutex); 2444 + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 2445 + ceph_osdc_cancel_request(req); 2493 2446 complete_request(req); 2494 - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); 2495 2447 return rc; 2496 2448 } 2497 2449 2498 - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); 2450 + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 2451 + req->r_result); 2499 2452 return req->r_result; 2500 2453 } 2501 2454 EXPORT_SYMBOL(ceph_osdc_wait_request);