[PATCH] optimize o_direct on block devices

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Implement a block-device-specific .direct_IO method instead of going through
the generic direct_io_worker() for block devices.

direct_io_worker() is fairly complex because it needs to handle O_DIRECT on
file systems, where it needs to perform block allocation, hole detection,
extending the file on write, and tons of other corner cases. The end result is
that it takes tons of CPU time to submit an I/O.

For a block device, block allocation is much simpler, and a tight triple
loop can be written to iterate over each iovec and each page within the iovec
in order to construct/prepare a bio structure and then submit it to
the block layer. This significantly speeds up O_DIRECT on block devices.

[akpm@osdl.org: small speedup]
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Chen, Kenneth W and committed by

Linus Torvalds 19 years ago e61c9018 7e913c53

+177 -27

4 changed files

expand all

bio.c

block_dev.c

include

linux

aio.h

bio.h

+1 -1

fs/bio.c

··· 916 916 } 917 917 } 918 918 919 - static void bio_release_pages(struct bio *bio) 919 + void bio_release_pages(struct bio *bio) 920 920 { 921 921 struct bio_vec *bvec = bio->bi_io_vec; 922 922 int i;

+174 -26

fs/block_dev.c

··· 129 129 return 0; 130 130 } 131 131 132 - static int 133 - blkdev_get_blocks(struct inode *inode, sector_t iblock, 134 - struct buffer_head *bh, int create) 132 + static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error) 135 133 { 136 - sector_t end_block = max_block(I_BDEV(inode)); 137 - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 134 + struct kiocb *iocb = bio->bi_private; 135 + atomic_t *bio_count = &iocb->ki_bio_count; 138 136 139 - if ((iblock + max_blocks) > end_block) { 140 - max_blocks = end_block - iblock; 141 - if ((long)max_blocks <= 0) { 142 - if (create) 143 - return -EIO; /* write fully beyond EOF */ 144 - /* 145 - * It is a read which is fully beyond EOF. We return 146 - * a !buffer_mapped buffer 147 - */ 148 - max_blocks = 0; 149 - } 137 + if (bio_data_dir(bio) == READ) 138 + bio_check_pages_dirty(bio); 139 + else { 140 + bio_release_pages(bio); 141 + bio_put(bio); 150 142 } 151 143 152 - bh->b_bdev = I_BDEV(inode); 153 - bh->b_blocknr = iblock; 154 - bh->b_size = max_blocks << inode->i_blkbits; 155 - if (max_blocks) 156 - set_buffer_mapped(bh); 144 + /* iocb->ki_nbytes stores error code from LLDD */ 145 + if (error) 146 + iocb->ki_nbytes = -EIO; 147 + 148 + if (atomic_dec_and_test(bio_count)) { 149 + if (iocb->ki_nbytes < 0) 150 + aio_complete(iocb, iocb->ki_nbytes, 0); 151 + else 152 + aio_complete(iocb, iocb->ki_left, 0); 153 + } 154 + 157 155 return 0; 156 + } 157 + 158 + #define VEC_SIZE 16 159 + struct pvec { 160 + unsigned short nr; 161 + unsigned short idx; 162 + struct page *page[VEC_SIZE]; 163 + }; 164 + 165 + #define PAGES_SPANNED(addr, len) \ 166 + (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE); 167 + 168 + /* 169 + * get page pointer for user addr, we internally cache struct page array for 170 + * (addr, count) range in pvec to avoid frequent call to get_user_pages. 
If 171 + * internal page list is exhausted, a batch count of up to VEC_SIZE is used 172 + * to get next set of page struct. 173 + */ 174 + static struct page *blk_get_page(unsigned long addr, size_t count, int rw, 175 + struct pvec *pvec) 176 + { 177 + int ret, nr_pages; 178 + if (pvec->idx == pvec->nr) { 179 + nr_pages = PAGES_SPANNED(addr, count); 180 + nr_pages = min(nr_pages, VEC_SIZE); 181 + down_read(&current->mm->mmap_sem); 182 + ret = get_user_pages(current, current->mm, addr, nr_pages, 183 + rw == READ, 0, pvec->page, NULL); 184 + up_read(&current->mm->mmap_sem); 185 + if (ret < 0) 186 + return ERR_PTR(ret); 187 + pvec->nr = ret; 188 + pvec->idx = 0; 189 + } 190 + return pvec->page[pvec->idx++]; 158 191 } 159 192 160 193 static ssize_t 161 194 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 162 - loff_t offset, unsigned long nr_segs) 195 + loff_t pos, unsigned long nr_segs) 163 196 { 164 - struct file *file = iocb->ki_filp; 165 - struct inode *inode = file->f_mapping->host; 197 + struct inode *inode = iocb->ki_filp->f_mapping->host; 198 + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode))); 199 + unsigned blocksize_mask = (1 << blkbits) - 1; 200 + unsigned long seg = 0; /* iov segment iterator */ 201 + unsigned long nvec; /* number of bio vec needed */ 202 + unsigned long cur_off; /* offset into current page */ 203 + unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */ 166 204 167 - return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 168 - iov, offset, nr_segs, blkdev_get_blocks, NULL); 205 + unsigned long addr; /* user iovec address */ 206 + size_t count; /* user iovec len */ 207 + size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */ 208 + loff_t size; /* size of block device */ 209 + struct bio *bio; 210 + atomic_t *bio_count = &iocb->ki_bio_count; 211 + struct page *page; 212 + struct pvec pvec; 213 + 214 + pvec.nr = 0; 215 + pvec.idx = 0; 216 + 217 + if (pos & 
blocksize_mask) 218 + return -EINVAL; 219 + 220 + size = i_size_read(inode); 221 + if (pos + nbytes > size) { 222 + nbytes = size - pos; 223 + iocb->ki_left = nbytes; 224 + } 225 + 226 + /* 227 + * check first non-zero iov alignment, the remaining 228 + * iov alignment is checked inside bio loop below. 229 + */ 230 + do { 231 + addr = (unsigned long) iov[seg].iov_base; 232 + count = min(iov[seg].iov_len, nbytes); 233 + if (addr & blocksize_mask || count & blocksize_mask) 234 + return -EINVAL; 235 + } while (!count && ++seg < nr_segs); 236 + atomic_set(bio_count, 1); 237 + 238 + while (nbytes) { 239 + /* roughly estimate number of bio vec needed */ 240 + nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE; 241 + nvec = max(nvec, nr_segs - seg); 242 + nvec = min(nvec, (unsigned long) BIO_MAX_PAGES); 243 + 244 + /* bio_alloc should not fail with GFP_KERNEL flag */ 245 + bio = bio_alloc(GFP_KERNEL, nvec); 246 + bio->bi_bdev = I_BDEV(inode); 247 + bio->bi_end_io = blk_end_aio; 248 + bio->bi_private = iocb; 249 + bio->bi_sector = pos >> blkbits; 250 + same_bio: 251 + cur_off = addr & ~PAGE_MASK; 252 + cur_len = PAGE_SIZE - cur_off; 253 + if (count < cur_len) 254 + cur_len = count; 255 + 256 + page = blk_get_page(addr, count, rw, &pvec); 257 + if (unlikely(IS_ERR(page))) 258 + goto backout; 259 + 260 + if (bio_add_page(bio, page, cur_len, cur_off)) { 261 + pos += cur_len; 262 + addr += cur_len; 263 + count -= cur_len; 264 + nbytes -= cur_len; 265 + 266 + if (count) 267 + goto same_bio; 268 + while (++seg < nr_segs) { 269 + addr = (unsigned long) iov[seg].iov_base; 270 + count = iov[seg].iov_len; 271 + if (!count) 272 + continue; 273 + if (unlikely(addr & blocksize_mask || 274 + count & blocksize_mask)) { 275 + page = ERR_PTR(-EINVAL); 276 + goto backout; 277 + } 278 + count = min(count, nbytes); 279 + goto same_bio; 280 + } 281 + } 282 + 283 + /* bio is ready, submit it */ 284 + if (rw == READ) 285 + bio_set_pages_dirty(bio); 286 + atomic_inc(bio_count); 287 + submit_bio(rw, 
bio); 288 + } 289 + 290 + completion: 291 + iocb->ki_left -= nbytes; 292 + nbytes = iocb->ki_left; 293 + iocb->ki_pos += nbytes; 294 + 295 + blk_run_address_space(inode->i_mapping); 296 + if (atomic_dec_and_test(bio_count)) 297 + aio_complete(iocb, nbytes, 0); 298 + 299 + return -EIOCBQUEUED; 300 + 301 + backout: 302 + /* 303 + * back out nbytes count constructed so far for this bio, 304 + * we will throw away current bio. 305 + */ 306 + nbytes += bio->bi_size; 307 + bio_release_pages(bio); 308 + bio_put(bio); 309 + 310 + /* 311 + * if no bio was submmitted, return the error code. 312 + * otherwise, proceed with pending I/O completion. 313 + */ 314 + if (atomic_read(bio_count) == 1) 315 + return PTR_ERR(page); 316 + goto completion; 169 317 } 170 318 171 319 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)

include/linux/aio.h

··· 105 105 wait_queue_t ki_wait; 106 106 loff_t ki_pos; 107 107 108 + atomic_t ki_bio_count; /* num bio used for this iocb */ 108 109 void *private; 109 110 /* State that we remember to be able to restart/retry */ 110 111 unsigned short ki_opcode;

include/linux/bio.h

··· 309 309 gfp_t); 310 310 extern void bio_set_pages_dirty(struct bio *bio); 311 311 extern void bio_check_pages_dirty(struct bio *bio); 312 + extern void bio_release_pages(struct bio *bio); 312 313 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); 313 314 extern int bio_uncopy_user(struct bio *); 314 315 void zero_fill_bio(struct bio *bio);