Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: loop: support DIO & AIO

There are at least 3 advantages to using direct I/O and AIO on
read/write of loop's backing file:

1) double cache can be avoided, then memory usage gets
decreased a lot

2) unlike user-space direct I/O, there is no cost of
pinning pages

3) avoid context switch for obtaining good throughput
- in buffered file read, random I/O top throughput is often obtained
only if they are submitted concurrently from lots of tasks; but for
sequential I/O, most of times they can be hit from page cache, so
concurrent submissions often introduce unnecessary context switch
and can't improve throughput much. There was such discussion[1]
to use non-blocking I/O to improve the problem for application.
- with direct I/O and AIO, concurrent submissions can be
avoided while random read throughput is not affected meanwhile

xfstests(-g auto, ext4) basically passes when running with
direct I/O(aio); one exception is generic/232, but it fails in
loop buffered I/O(4.2-rc6-next-20150814) too.

Follows the fio test result for performance purpose:
4 jobs fio test inside ext4 file system over loop block

1) How to run
- KVM: 4 VCPUs, 2G RAM
- linux kernel: 4.2-rc6-next-20150814(base) with the patchset
- the loop block is over one image on SSD.
- linux psync, 4 jobs, size 1500M, ext4 over loop block
- test result: IOPS from fio output

2) Throughput(IOPS) becomes a bit better with direct I/O(aio)
-------------------------------------------------------------
test cases |randread |read |randwrite |write |
-------------------------------------------------------------
base |8015 |113811 |67442 |106978
-------------------------------------------------------------
base+loop aio |8136 |125040 |67811 |111376
-------------------------------------------------------------

- presumably, this is caused by more page cache being available for
the application, or by one extra page copy being avoided in case of direct I/O

3) context switch
- context switch decreased by ~50% with loop direct I/O(aio)
compared with loop buffered I/O(4.2-rc6-next-20150814)

4) memory usage from /proc/meminfo
-------------------------------------------------------------
| Buffers | Cached
-------------------------------------------------------------
base | > 760MB | ~950MB
-------------------------------------------------------------
base+loop direct I/O(aio) | < 5MB | ~1.6GB
-------------------------------------------------------------

- so there are much more page caches available for application with
direct I/O

[1] https://lwn.net/Articles/612483/

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>

authored by

Ming Lei and committed by
Jens Axboe
bc07c10a ab1cb278

+97 -3
+95 -3
drivers/block/loop.c
··· 445 445 return ret; 446 446 } 447 447 448 + static inline void handle_partial_read(struct loop_cmd *cmd, long bytes) 449 + { 450 + if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE)) 451 + return; 452 + 453 + if (unlikely(bytes < blk_rq_bytes(cmd->rq))) { 454 + struct bio *bio = cmd->rq->bio; 455 + 456 + bio_advance(bio, bytes); 457 + zero_fill_bio(bio); 458 + } 459 + } 460 + 461 + static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) 462 + { 463 + struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); 464 + struct request *rq = cmd->rq; 465 + 466 + handle_partial_read(cmd, ret); 467 + 468 + if (ret > 0) 469 + ret = 0; 470 + else if (ret < 0) 471 + ret = -EIO; 472 + 473 + rq->errors = ret; 474 + blk_mq_complete_request(rq); 475 + } 476 + 477 + static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, 478 + loff_t pos, bool rw) 479 + { 480 + struct iov_iter iter; 481 + struct bio_vec *bvec; 482 + struct bio *bio = cmd->rq->bio; 483 + struct file *file = lo->lo_backing_file; 484 + int ret; 485 + 486 + /* nomerge for loop request queue */ 487 + WARN_ON(cmd->rq->bio != cmd->rq->biotail); 488 + 489 + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); 490 + iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, 491 + bio_segments(bio), blk_rq_bytes(cmd->rq)); 492 + 493 + cmd->iocb.ki_pos = pos; 494 + cmd->iocb.ki_filp = file; 495 + cmd->iocb.ki_complete = lo_rw_aio_complete; 496 + cmd->iocb.ki_flags = IOCB_DIRECT; 497 + 498 + if (rw == WRITE) 499 + ret = file->f_op->write_iter(&cmd->iocb, &iter); 500 + else 501 + ret = file->f_op->read_iter(&cmd->iocb, &iter); 502 + 503 + if (ret != -EIOCBQUEUED) 504 + cmd->iocb.ki_complete(&cmd->iocb, ret, 0); 505 + return 0; 506 + } 507 + 508 + 509 + static inline int lo_rw_simple(struct loop_device *lo, 510 + struct request *rq, loff_t pos, bool rw) 511 + { 512 + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); 513 + 514 + if (cmd->use_aio) 515 + return lo_rw_aio(lo, cmd, pos, rw); 516 + 517 + /* 518 + * 
lo_write_simple and lo_read_simple should have been covered 519 + * by io submit style function like lo_rw_aio(), one blocker 520 + * is that lo_read_simple() need to call flush_dcache_page after 521 + * the page is written from kernel, and it isn't easy to handle 522 + * this in io submit style function which submits all segments 523 + * of the req at one time. And direct read IO doesn't need to 524 + * run flush_dcache_page(). 525 + */ 526 + if (rw == WRITE) 527 + return lo_write_simple(lo, rq, pos); 528 + else 529 + return lo_read_simple(lo, rq, pos); 530 + } 531 + 448 532 static int do_req_filebacked(struct loop_device *lo, struct request *rq) 449 533 { 450 534 loff_t pos; ··· 544 460 else if (lo->transfer) 545 461 ret = lo_write_transfer(lo, rq, pos); 546 462 else 547 - ret = lo_write_simple(lo, rq, pos); 463 + ret = lo_rw_simple(lo, rq, pos, WRITE); 548 464 549 465 } else { 550 466 if (lo->transfer) 551 467 ret = lo_read_transfer(lo, rq, pos); 552 468 else 553 - ret = lo_read_simple(lo, rq, pos); 469 + ret = lo_rw_simple(lo, rq, pos, READ); 554 470 } 555 471 556 472 return ret; ··· 1654 1570 if (lo->lo_state != Lo_bound) 1655 1571 return -EIO; 1656 1572 1573 + if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH | 1574 + REQ_DISCARD))) 1575 + cmd->use_aio = true; 1576 + else 1577 + cmd->use_aio = false; 1578 + 1657 1579 queue_kthread_work(&lo->worker, &cmd->work); 1658 1580 1659 1581 return BLK_MQ_RQ_QUEUE_OK; ··· 1679 1589 failed: 1680 1590 if (ret) 1681 1591 cmd->rq->errors = -EIO; 1682 - blk_mq_complete_request(cmd->rq); 1592 + /* complete non-aio request */ 1593 + if (!cmd->use_aio || ret) 1594 + blk_mq_complete_request(cmd->rq); 1683 1595 } 1684 1596 1685 1597 static void loop_queue_work(struct kthread_work *work)
+2
drivers/block/loop.h
··· 69 69 struct kthread_work work; 70 70 struct request *rq; 71 71 struct list_head list; 72 + bool use_aio; /* use AIO interface to handle I/O */ 73 + struct kiocb iocb; 72 74 }; 73 75 74 76 /* Support for loadable transfer modules */