
rbd: rework rbd_request_fn()

While it was never a good idea to sleep in request_fn(), commit
34c6bc2c919a ("locking/mutexes: Add extra reschedule point") made it
a *bad* idea. mutex_lock() since 3.15 may reschedule *before* putting
the task on the mutex wait queue, which for tasks in !TASK_RUNNING state
means blocking forever. request_fn() may be called with !TASK_RUNNING on
the way to schedule() in io_schedule().
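
To make the failure mode concrete, here is a hedged sketch (not part
of the patch; sleep_on_io_sketch is an illustrative name, and the real
plug flush happens inside schedule() itself). Block-layer sleep paths
mark the task !TASK_RUNNING first and only then flush the plug list,
which is what ends up invoking request_fn():

    #include <linux/sched.h>
    #include <linux/blkdev.h>

    /* Illustrative only: the two-step sleep that request_fn() can be
     * invoked in the middle of. */
    static void sleep_on_io_sketch(void)
    {
            set_current_state(TASK_UNINTERRUPTIBLE); /* now !TASK_RUNNING */
            blk_flush_plug(current);    /* may end up in request_fn() */
            schedule();                 /* actually switch away */
    }

If request_fn() takes a contended mutex at that point, the optimistic
spin added in 3.15 may call schedule() before the task is on the mutex
wait queue; a !TASK_RUNNING task on no wait queue is never woken.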

Offload request handling to a workqueue, one per rbd device, to avoid
calling blocking primitives from rbd_request_fn().
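
In outline, the fix looks like the sketch below (my_dev, my_request_fn
and my_workfn are illustrative names, not the rbd ones): request_fn(),
which runs under queue_lock with interrupts off, only moves requests
onto a per-device list and kicks a work item; the work function runs
in process context, where blocking primitives are safe.

    #include <linux/blkdev.h>
    #include <linux/kernel.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/workqueue.h>

    struct my_dev {
            spinlock_t lock;                /* also used as queue_lock */
            struct list_head rq_queue;      /* incoming requests */
            struct workqueue_struct *rq_wq;
            struct work_struct rq_work;
    };

    /* Process context: sleeping is allowed here. */
    static void my_workfn(struct work_struct *work)
    {
            struct my_dev *dev = container_of(work, struct my_dev, rq_work);
            struct request *rq, *next;
            LIST_HEAD(requests);

            /* Grab everything queued so far, then drop the lock. */
            spin_lock_irq(&dev->lock);
            list_splice_init(&dev->rq_queue, &requests);
            spin_unlock_irq(&dev->lock);

            list_for_each_entry_safe(rq, next, &requests, queuelist) {
                    list_del_init(&rq->queuelist);
                    /* handle rq -- blocking calls are safe now */
            }
    }

    /* Called with queue_lock (== dev->lock) held, IRQs off: no sleeping. */
    static void my_request_fn(struct request_queue *q)
    {
            struct my_dev *dev = q->queuedata;
            struct request *rq;

            while ((rq = blk_fetch_request(q)))
                    list_add_tail(&rq->queuelist, &dev->rq_queue);

            queue_work(dev->rq_wq, &dev->rq_work);
    }

rbd passes &rbd_dev->lock to blk_init_queue(), so queue_lock and the
device lock are the same spinlock: rbd_request_fn() can append to
rq_queue without taking it again, while rbd_request_workfn() must take
it explicitly (hence the "rq->q->queue_lock" comment in the diff).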

Fixes: http://tracker.ceph.com/issues/8818

Cc: stable@vger.kernel.org # 3.16, needs backporting for 3.15
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Tested-by: Eric Eastman <eric0e@aol.com>
Tested-by: Greg Wilson <greg.wilson@keepertech.com>
Reviewed-by: Alex Elder <elder@linaro.org>

+124 -82
drivers/block/rbd.c
···
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include "rbd_types.h"
 
···
         char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
+        struct list_head rq_queue;      /* incoming rq queue */
         spinlock_t lock;                /* queue, flags, open_count */
+        struct workqueue_struct *rq_wq;
+        struct work_struct rq_work;
 
         struct rbd_image_header header;
         unsigned long flags;            /* possibly lock protected */
···
         return ret;
 }
 
+static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+{
+        struct rbd_img_request *img_request;
+        u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+        u64 length = blk_rq_bytes(rq);
+        bool wr = rq_data_dir(rq) == WRITE;
+        int result;
+
+        /* Ignore/skip any zero-length requests */
+
+        if (!length) {
+                dout("%s: zero-length request\n", __func__);
+                result = 0;
+                goto err_rq;
+        }
+
+        /* Disallow writes to a read-only device */
+
+        if (wr) {
+                if (rbd_dev->mapping.read_only) {
+                        result = -EROFS;
+                        goto err_rq;
+                }
+                rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+        }
+
+        /*
+         * Quit early if the mapped snapshot no longer exists.  It's
+         * still possible the snapshot will have disappeared by the
+         * time our request arrives at the osd, but there's no sense in
+         * sending it if we already know.
+         */
+        if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+                dout("request for non-existent snapshot");
+                rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+                result = -ENXIO;
+                goto err_rq;
+        }
+
+        if (offset && length > U64_MAX - offset + 1) {
+                rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
+                         length);
+                result = -EINVAL;
+                goto err_rq;    /* Shouldn't happen */
+        }
+
+        if (offset + length > rbd_dev->mapping.size) {
+                rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
+                         length, rbd_dev->mapping.size);
+                result = -EIO;
+                goto err_rq;
+        }
+
+        img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
+        if (!img_request) {
+                result = -ENOMEM;
+                goto err_rq;
+        }
+        img_request->rq = rq;
+
+        result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
+        if (result)
+                goto err_img_request;
+
+        result = rbd_img_request_submit(img_request);
+        if (result)
+                goto err_img_request;
+
+        return;
+
+err_img_request:
+        rbd_img_request_put(img_request);
+err_rq:
+        if (result)
+                rbd_warn(rbd_dev, "%s %llx at %llx result %d",
+                         wr ? "write" : "read", length, offset, result);
+        blk_end_request_all(rq, result);
+}
+
+static void rbd_request_workfn(struct work_struct *work)
+{
+        struct rbd_device *rbd_dev =
+            container_of(work, struct rbd_device, rq_work);
+        struct request *rq, *next;
+        LIST_HEAD(requests);
+
+        spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
+        list_splice_init(&rbd_dev->rq_queue, &requests);
+        spin_unlock_irq(&rbd_dev->lock);
+
+        list_for_each_entry_safe(rq, next, &requests, queuelist) {
+                list_del_init(&rq->queuelist);
+                rbd_handle_request(rbd_dev, rq);
+        }
+}
+
+/*
+ * Called with q->queue_lock held and interrupts disabled, possibly on
+ * the way to schedule().  Do not sleep here!
+ */
 static void rbd_request_fn(struct request_queue *q)
-        __releases(q->queue_lock) __acquires(q->queue_lock)
 {
         struct rbd_device *rbd_dev = q->queuedata;
         struct request *rq;
-        int result;
+        int queued = 0;
+
+        rbd_assert(rbd_dev);
 
         while ((rq = blk_fetch_request(q))) {
-                bool write_request = rq_data_dir(rq) == WRITE;
-                struct rbd_img_request *img_request;
-                u64 offset;
-                u64 length;
-
                 /* Ignore any non-FS requests that filter through. */
-
                 if (rq->cmd_type != REQ_TYPE_FS) {
                         dout("%s: non-fs request type %d\n", __func__,
                                 (int) rq->cmd_type);
                         __blk_end_request_all(rq, 0);
                         continue;
                 }
 
-                /* Ignore/skip any zero-length requests */
-
-                offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
-                length = (u64) blk_rq_bytes(rq);
-
-                if (!length) {
-                        dout("%s: zero-length request\n", __func__);
-                        __blk_end_request_all(rq, 0);
-                        continue;
-                }
-
-                spin_unlock_irq(q->queue_lock);
-
-                /* Disallow writes to a read-only device */
-
-                if (write_request) {
-                        result = -EROFS;
-                        if (rbd_dev->mapping.read_only)
-                                goto end_request;
-                        rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
-                }
-
-                /*
-                 * Quit early if the mapped snapshot no longer
-                 * exists.  It's still possible the snapshot will
-                 * have disappeared by the time our request arrives
-                 * at the osd, but there's no sense in sending it if
-                 * we already know.
-                 */
-                if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
-                        dout("request for non-existent snapshot");
-                        rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-                        result = -ENXIO;
-                        goto end_request;
-                }
-
-                result = -EINVAL;
-                if (offset && length > U64_MAX - offset + 1) {
-                        rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
-                                 offset, length);
-                        goto end_request;       /* Shouldn't happen */
-                }
-
-                result = -EIO;
-                if (offset + length > rbd_dev->mapping.size) {
-                        rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
-                                 offset, length, rbd_dev->mapping.size);
-                        goto end_request;
-                }
-
-                result = -ENOMEM;
-                img_request = rbd_img_request_create(rbd_dev, offset, length,
-                                                     write_request);
-                if (!img_request)
-                        goto end_request;
-
-                img_request->rq = rq;
-
-                result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
-                                              rq->bio);
-                if (!result)
-                        result = rbd_img_request_submit(img_request);
-                if (result)
-                        rbd_img_request_put(img_request);
-end_request:
-                spin_lock_irq(q->queue_lock);
-                if (result < 0) {
-                        rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
-                                 write_request ? "write" : "read",
-                                 length, offset, result);
-
-                        __blk_end_request_all(rq, result);
-                }
+                list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
+                queued++;
         }
+
+        if (queued)
+                queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
 }
 
 /*
···
                 return NULL;
 
         spin_lock_init(&rbd_dev->lock);
+        INIT_LIST_HEAD(&rbd_dev->rq_queue);
+        INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
         rbd_dev->flags = 0;
         atomic_set(&rbd_dev->parent_ref, 0);
         INIT_LIST_HEAD(&rbd_dev->node);
···
         ret = rbd_dev_mapping_set(rbd_dev);
         if (ret)
                 goto err_out_disk;
+
         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
         set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
 
+        rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0);
+        if (!rbd_dev->rq_wq)
+                goto err_out_mapping;
+
         ret = rbd_bus_add_dev(rbd_dev);
         if (ret)
-                goto err_out_mapping;
+                goto err_out_workqueue;
 
         /* Everything's ready.  Announce the disk to the world. */
···
         return ret;
 
+err_out_workqueue:
+        destroy_workqueue(rbd_dev->rq_wq);
+        rbd_dev->rq_wq = NULL;
 err_out_mapping:
         rbd_dev_mapping_clear(rbd_dev);
 err_out_disk:
···
 {
         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
+        destroy_workqueue(rbd_dev->rq_wq);
         rbd_free_disk(rbd_dev);
         clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
         rbd_dev_mapping_clear(rbd_dev);