Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan to wait on in the near future. This patch mirrors recent
changes in other filesystems and the generic code: it uses WRITE_SYNC when
WB_SYNC_ALL is passed and for other latency-critical writes.
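
For illustration only, here is a tiny standalone C model of that flag choice
(the enum and helper names below are invented for this sketch; in the patch
itself the decision is made in the extent_io.c writepage code based on
wbc->sync_mode, as shown in the hunks further down):

/* Toy model of picking WRITE vs WRITE_SYNC; not kernel code. */
#include <stdio.h>

enum sketch_sync_mode { SKETCH_WB_SYNC_NONE, SKETCH_WB_SYNC_ALL };
enum sketch_write_flags { SKETCH_WRITE, SKETCH_WRITE_SYNC };

/* use the sync flag only when the caller intends to wait on the IO */
static enum sketch_write_flags pick_write_flags(enum sketch_sync_mode mode)
{
    return mode == SKETCH_WB_SYNC_ALL ? SKETCH_WRITE_SYNC : SKETCH_WRITE;
}

int main(void)
{
    printf("WB_SYNC_ALL maps to %s\n",
           pick_write_flags(SKETCH_WB_SYNC_ALL) == SKETCH_WRITE_SYNC ?
           "WRITE_SYNC" : "WRITE");
    return 0;
}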

Btrfs uses async worker threads for checksumming before the write is done,
and then again to actually submit the bios. The bio submission code simply
works through a per-device list of bios that need to be sent down the pipe.
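
As a rough standalone model of how such a head/tail list is maintained (the
struct and function names below are invented for this sketch; the kernel keeps
the equivalent pointers per device and appends in schedule_bio() under
device->io_lock):

/* Toy singly linked pending list with O(1) tail append; not kernel code. */
#include <assert.h>
#include <stddef.h>

struct sketch_bio {
    struct sketch_bio *bi_next;
};

struct sketch_pending_list {
    struct sketch_bio *head;
    struct sketch_bio *tail;
};

static void pending_append(struct sketch_pending_list *list,
                           struct sketch_bio *bio)
{
    bio->bi_next = NULL;
    if (list->tail)
        list->tail->bi_next = bio;
    list->tail = bio;
    if (!list->head)
        list->head = bio;
}

int main(void)
{
    struct sketch_pending_list list = { NULL, NULL };
    struct sketch_bio a, b;

    pending_append(&list, &a);
    pending_append(&list, &b);
    assert(list.head == &a && list.tail == &b && a.bi_next == &b);
    return 0;
}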

This per-device list is split into low-priority and high-priority lists so
that the WRITE_SYNC IO is submitted first.
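
A small standalone sketch of the resulting dispatch order (types and names
are invented here; in the patch, the bio-running loop in volumes.c checks
device->pending_sync_bios.head before falling back to device->pending_bios):

/* Toy two-list dispatcher that always serves the sync list first. */
#include <stdio.h>

struct sketch_bio {
    struct sketch_bio *bi_next;
    const char *name;
};

struct sketch_pending_list {
    struct sketch_bio *head;
    struct sketch_bio *tail;
};

struct sketch_device {
    struct sketch_pending_list pending_bios;      /* regular priority */
    struct sketch_pending_list pending_sync_bios; /* WRITE_SYNC, served first */
};

static void drain(struct sketch_device *dev)
{
    while (dev->pending_sync_bios.head || dev->pending_bios.head) {
        /* the high-priority list wins whenever it is non-empty */
        struct sketch_pending_list *list = dev->pending_sync_bios.head ?
            &dev->pending_sync_bios : &dev->pending_bios;
        struct sketch_bio *cur = list->head;

        list->head = cur->bi_next;
        if (!list->head)
            list->tail = NULL;
        printf("submit %s\n", cur->name);
    }
}

int main(void)
{
    struct sketch_bio async_bio = { NULL, "regular write" };
    struct sketch_bio sync_bio = { NULL, "WRITE_SYNC write" };
    struct sketch_device dev = {
        .pending_bios = { &async_bio, &async_bio },
        .pending_sync_bios = { &sync_bio, &sync_bio },
    };

    drain(&dev); /* prints the WRITE_SYNC write first */
    return 0;
}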

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+142 -47
+2 -2
fs/btrfs/disk-io.c
···
2095             device->barriers = 0;
2096             get_bh(bh);
2097             lock_buffer(bh);
2098 -           ret = submit_bh(WRITE, bh);
2099         }
2100     } else {
2101 -       ret = submit_bh(WRITE, bh);
2102     }
2103
2104     if (!ret && wait) {
···
2095             device->barriers = 0;
2096             get_bh(bh);
2097             lock_buffer(bh);
2098 +           ret = submit_bh(WRITE_SYNC, bh);
2099         }
2100     } else {
2101 +       ret = submit_bh(WRITE_SYNC, bh);
2102     }
2103
2104     if (!ret && wait) {
+31 -15
fs/btrfs/extent_io.c
···
  50     /* tells writepage not to lock the state bits for this range
  51      * it still does the unlocking
  52      */
  53 -   int extent_locked;
  54 };
  55
  56 int __init extent_io_init(void)
···
2139     u64 delalloc_end;
2140     int page_started;
2141     int compressed;
2142     unsigned long nr_written = 0;
2143
2144     WARN_ON(!PageLocked(page));
2145     pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
···
2323                        (unsigned long long)end);
2324             }
2325
2326 -           ret = submit_extent_page(WRITE, tree, page, sector,
2327 -                                    iosize, pg_offset, bdev,
2328 -                                    &epd->bio, max_nr,
2329                                      end_bio_extent_writepage,
2330                                      0, 0, 0);
2331             if (ret)
···
2469     return ret;
2470 }
2471
2472 static noinline void flush_write_bio(void *data)
2473 {
2474     struct extent_page_data *epd = data;
2475 -   if (epd->bio) {
2476 -       submit_one_bio(WRITE, epd->bio, 0, 0);
2477 -       epd->bio = NULL;
2478 -   }
2479 }
2480
2481 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
···
2497         .tree = tree,
2498         .get_extent = get_extent,
2499         .extent_locked = 0,
2500     };
2501     struct writeback_control wbc_writepages = {
2502         .bdi = wbc->bdi,
···
2508         .range_end = (loff_t)-1,
2509     };
2510
2511 -
2512     ret = __extent_writepage(page, wbc, &epd);
2513
2514     extent_write_cache_pages(tree, mapping, &wbc_writepages,
2515                              __extent_writepage, &epd, flush_write_bio);
2516 -   if (epd.bio)
2517 -       submit_one_bio(WRITE, epd.bio, 0, 0);
2518     return ret;
2519 }
2520
···
2531         .tree = tree,
2532         .get_extent = get_extent,
2533         .extent_locked = 1,
2534     };
2535     struct writeback_control wbc_writepages = {
2536         .bdi = inode->i_mapping->backing_dev_info,
···
2557         start += PAGE_CACHE_SIZE;
2558     }
2559
2560 -   if (epd.bio)
2561 -       submit_one_bio(WRITE, epd.bio, 0, 0);
2562     return ret;
2563 }
2564
···
2572         .tree = tree,
2573         .get_extent = get_extent,
2574         .extent_locked = 0,
2575     };
2576
2577     ret = extent_write_cache_pages(tree, mapping, wbc,
2578                                    __extent_writepage, &epd,
2579                                    flush_write_bio);
2580 -   if (epd.bio)
2581 -       submit_one_bio(WRITE, epd.bio, 0, 0);
2582     return ret;
2583 }
2584
···
  50     /* tells writepage not to lock the state bits for this range
  51      * it still does the unlocking
  52      */
  53 +   unsigned int extent_locked:1;
  54 +
  55 +   /* tells the submit_bio code to use a WRITE_SYNC */
  56 +   unsigned int sync_io:1;
  57 };
  58
  59 int __init extent_io_init(void)
···
2136     u64 delalloc_end;
2137     int page_started;
2138     int compressed;
2139 +   int write_flags;
2140     unsigned long nr_written = 0;
2141 +
2142 +   if (wbc->sync_mode == WB_SYNC_ALL)
2143 +       write_flags = WRITE_SYNC_PLUG;
2144 +   else
2145 +       write_flags = WRITE;
2146
2147     WARN_ON(!PageLocked(page));
2148     pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
···
2314                        (unsigned long long)end);
2315             }
2316
2317 +           ret = submit_extent_page(write_flags, tree, page,
2318 +                                    sector, iosize, pg_offset,
2319 +                                    bdev, &epd->bio, max_nr,
2320                                      end_bio_extent_writepage,
2321                                      0, 0, 0);
2322             if (ret)
···
2460     return ret;
2461 }
2462
2463 +static void flush_epd_write_bio(struct extent_page_data *epd)
2464 +{
2465 +   if (epd->bio) {
2466 +       if (epd->sync_io)
2467 +           submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2468 +       else
2469 +           submit_one_bio(WRITE, epd->bio, 0, 0);
2470 +       epd->bio = NULL;
2471 +   }
2472 +}
2473 +
2474 static noinline void flush_write_bio(void *data)
2475 {
2476     struct extent_page_data *epd = data;
2477 +   flush_epd_write_bio(epd);
2478 }
2479
2480 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
···
2480         .tree = tree,
2481         .get_extent = get_extent,
2482         .extent_locked = 0,
2483 +       .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2484     };
2485     struct writeback_control wbc_writepages = {
2486         .bdi = wbc->bdi,
···
2490         .range_end = (loff_t)-1,
2491     };
2492
2493     ret = __extent_writepage(page, wbc, &epd);
2494
2495     extent_write_cache_pages(tree, mapping, &wbc_writepages,
2496                              __extent_writepage, &epd, flush_write_bio);
2497 +   flush_epd_write_bio(&epd);
2498     return ret;
2499 }
2500
···
2515         .tree = tree,
2516         .get_extent = get_extent,
2517         .extent_locked = 1,
2518 +       .sync_io = mode == WB_SYNC_ALL,
2519     };
2520     struct writeback_control wbc_writepages = {
2521         .bdi = inode->i_mapping->backing_dev_info,
···
2540         start += PAGE_CACHE_SIZE;
2541     }
2542
2543 +   flush_epd_write_bio(&epd);
2544     return ret;
2545 }
2546
···
2556         .tree = tree,
2557         .get_extent = get_extent,
2558         .extent_locked = 0,
2559 +       .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2560     };
2561
2562     ret = extent_write_cache_pages(tree, mapping, wbc,
2563                                    __extent_writepage, &epd,
2564                                    flush_write_bio);
2565 +   flush_epd_write_bio(&epd);
2566     return ret;
2567 }
2568
+1 -1
fs/btrfs/ordered-data.c
···
489     /* start IO across the range first to instantiate any delalloc
490      * extents
491      */
492 -   btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
493
494     /* The compression code will leave pages locked but return from
495      * writepage without setting the page writeback. Starting again
···
489     /* start IO across the range first to instantiate any delalloc
490      * extents
491      */
492 +   btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
493
494     /* The compression code will leave pages locked but return from
495      * writepage without setting the page writeback. Starting again
+97 -27
fs/btrfs/volumes.c
···
 125     return NULL;
 126 }
 127
 128 /*
 129  * we try to collect pending bios for a device so we don't get a large
 130  * number of procs sending bios down to the same device. This greatly
···
 155     struct bio *pending;
 156     struct backing_dev_info *bdi;
 157     struct btrfs_fs_info *fs_info;
 158     struct bio *tail;
 159     struct bio *cur;
 160     int again = 0;
 161 -   unsigned long num_run = 0;
 162     unsigned long limit;
 163     unsigned long last_waited = 0;
 164
···
 169     limit = btrfs_async_submit_limit(fs_info);
 170     limit = limit * 2 / 3;
 171
 172 loop:
 173     spin_lock(&device->io_lock);
 174
 175 loop_lock:
 176     /* take all the bios off the list at once and process them
 177      * later on (without the lock held). But, remember the
 178      * tail and other pointers so the bios can be properly reinserted
 179      * into the list if we hit congestion
 180      */
 181 -   pending = device->pending_bios;
 182 -   tail = device->pending_bio_tail;
 183     WARN_ON(pending && !tail);
 184 -   device->pending_bios = NULL;
 185 -   device->pending_bio_tail = NULL;
 186
 187     /*
 188      * if pending was null this time around, no bios need processing
···
 202      * device->running_pending is used to synchronize with the
 203      * schedule_bio code.
 204      */
 205 -   if (pending) {
 206 -       again = 1;
 207 -       device->running_pending = 1;
 208 -   } else {
 209         again = 0;
 210         device->running_pending = 0;
 211     }
 212     spin_unlock(&device->io_lock);
 213
 214     while (pending) {
 215         cur = pending;
 216         pending = pending->bi_next;
 217         cur->bi_next = NULL;
···
 247             wake_up(&fs_info->async_submit_wait);
 248
 249         BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 250 -       bio_get(cur);
 251         submit_bio(cur->bi_rw, cur);
 252 -       bio_put(cur);
 253         num_run++;
 254
 255         /*
 256          * we made progress, there is more work to do and the bdi
···
 267          */
 268         if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 269             fs_info->fs_devices->open_devices > 1) {
 270 -           struct bio *old_head;
 271             struct io_context *ioc;
 272
 273             ioc = current->io_context;
···
 291              * against it before looping
 292              */
 293             last_waited = ioc->last_waited;
 294             continue;
 295         }
 296         spin_lock(&device->io_lock);
 297 -
 298 -       old_head = device->pending_bios;
 299 -       device->pending_bios = pending;
 300 -       if (device->pending_bio_tail)
 301 -           tail->bi_next = old_head;
 302 -       else
 303 -           device->pending_bio_tail = tail;
 304 -
 305         device->running_pending = 1;
 306
 307         spin_unlock(&device->io_lock);
···
 309             goto done;
 310         }
 311     }
 312     if (again)
 313         goto loop;
 314
 315     spin_lock(&device->io_lock);
 316 -   if (device->pending_bios)
 317         goto loop_lock;
 318     spin_unlock(&device->io_lock);
 319
···
2562             max_errors = 1;
2563         }
2564     }
2565 -   if (multi_ret && rw == WRITE &&
2566         stripes_allocated < stripes_required) {
2567         stripes_allocated = map->num_stripes;
2568         free_extent_map(em);
···
2827                  int rw, struct bio *bio)
2828 {
2829     int should_queue = 1;
2830
2831     /* don't bother with additional async steps for reads, right now */
2832     if (!(rw & (1 << BIO_RW))) {
···
2849     bio->bi_rw |= rw;
2850
2851     spin_lock(&device->io_lock);
2852
2853 -   if (device->pending_bio_tail)
2854 -       device->pending_bio_tail->bi_next = bio;
2855
2856 -   device->pending_bio_tail = bio;
2857 -   if (!device->pending_bios)
2858 -       device->pending_bios = bio;
2859     if (device->running_pending)
2860         should_queue = 0;
···
 125     return NULL;
 126 }
 127
 128 +static void requeue_list(struct btrfs_pending_bios *pending_bios,
 129 +                        struct bio *head, struct bio *tail)
 130 +{
 131 +
 132 +   struct bio *old_head;
 133 +
 134 +   old_head = pending_bios->head;
 135 +   pending_bios->head = head;
 136 +   if (pending_bios->tail)
 137 +       tail->bi_next = old_head;
 138 +   else
 139 +       pending_bios->tail = tail;
 140 +}
 141 +
 142 /*
 143  * we try to collect pending bios for a device so we don't get a large
 144  * number of procs sending bios down to the same device. This greatly
···
 141     struct bio *pending;
 142     struct backing_dev_info *bdi;
 143     struct btrfs_fs_info *fs_info;
 144 +   struct btrfs_pending_bios *pending_bios;
 145     struct bio *tail;
 146     struct bio *cur;
 147     int again = 0;
 148 +   unsigned long num_run;
 149 +   unsigned long num_sync_run;
 150     unsigned long limit;
 151     unsigned long last_waited = 0;
 152
···
 153     limit = btrfs_async_submit_limit(fs_info);
 154     limit = limit * 2 / 3;
 155
 156 +   /* we want to make sure that every time we switch from the sync
 157 +    * list to the normal list, we unplug
 158 +    */
 159 +   num_sync_run = 0;
 160 +
 161 loop:
 162     spin_lock(&device->io_lock);
 163 +   num_run = 0;
 164
 165 loop_lock:
 166 +
 167     /* take all the bios off the list at once and process them
 168      * later on (without the lock held). But, remember the
 169      * tail and other pointers so the bios can be properly reinserted
 170      * into the list if we hit congestion
 171      */
 172 +   if (device->pending_sync_bios.head)
 173 +       pending_bios = &device->pending_sync_bios;
 174 +   else
 175 +       pending_bios = &device->pending_bios;
 176 +
 177 +   pending = pending_bios->head;
 178 +   tail = pending_bios->tail;
 179     WARN_ON(pending && !tail);
 180
 181     /*
 182      * if pending was null this time around, no bios need processing
···
 176      * device->running_pending is used to synchronize with the
 177      * schedule_bio code.
 178      */
 179 +   if (device->pending_sync_bios.head == NULL &&
 180 +       device->pending_bios.head == NULL) {
 181         again = 0;
 182         device->running_pending = 0;
 183 +   } else {
 184 +       again = 1;
 185 +       device->running_pending = 1;
 186     }
 187 +
 188 +   pending_bios->head = NULL;
 189 +   pending_bios->tail = NULL;
 190 +
 191     spin_unlock(&device->io_lock);
 192
 193 +   /*
 194 +    * if we're doing the regular priority list, make sure we unplug
 195 +    * for any high prio bios we've sent down
 196 +    */
 197 +   if (pending_bios == &device->pending_bios && num_sync_run > 0) {
 198 +       num_sync_run = 0;
 199 +       blk_run_backing_dev(bdi, NULL);
 200 +   }
 201 +
 202     while (pending) {
 203 +
 204 +       rmb();
 205 +       if (pending_bios != &device->pending_sync_bios &&
 206 +           device->pending_sync_bios.head &&
 207 +           num_run > 16) {
 208 +           cond_resched();
 209 +           spin_lock(&device->io_lock);
 210 +           requeue_list(pending_bios, pending, tail);
 211 +           goto loop_lock;
 212 +       }
 213 +
 214         cur = pending;
 215         pending = pending->bi_next;
 216         cur->bi_next = NULL;
···
 196             wake_up(&fs_info->async_submit_wait);
 197
 198         BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 199         submit_bio(cur->bi_rw, cur);
 200         num_run++;
 201 +       if (bio_sync(cur))
 202 +           num_sync_run++;
 203 +
 204 +       if (need_resched()) {
 205 +           if (num_sync_run) {
 206 +               blk_run_backing_dev(bdi, NULL);
 207 +               num_sync_run = 0;
 208 +           }
 209 +           cond_resched();
 210 +       }
 211
 212         /*
 213          * we made progress, there is more work to do and the bdi
···
 208          */
 209         if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 210             fs_info->fs_devices->open_devices > 1) {
 211             struct io_context *ioc;
 212
 213             ioc = current->io_context;
···
 233              * against it before looping
 234              */
 235             last_waited = ioc->last_waited;
 236 +           if (need_resched()) {
 237 +               if (num_sync_run) {
 238 +                   blk_run_backing_dev(bdi, NULL);
 239 +                   num_sync_run = 0;
 240 +               }
 241 +               cond_resched();
 242 +           }
 243             continue;
 244         }
 245         spin_lock(&device->io_lock);
 246 +       requeue_list(pending_bios, pending, tail);
 247         device->running_pending = 1;
 248
 249         spin_unlock(&device->io_lock);
···
 251             goto done;
 252         }
 253     }
 254 +
 255 +   if (num_sync_run) {
 256 +       num_sync_run = 0;
 257 +       blk_run_backing_dev(bdi, NULL);
 258 +   }
 259 +
 260 +   cond_resched();
 261     if (again)
 262         goto loop;
 263
 264     spin_lock(&device->io_lock);
 265 +   if (device->pending_bios.head || device->pending_sync_bios.head)
 266         goto loop_lock;
 267     spin_unlock(&device->io_lock);
 268
···
2497             max_errors = 1;
2498         }
2499     }
2500 +   if (multi_ret && (rw & (1 << BIO_RW)) &&
2501         stripes_allocated < stripes_required) {
2502         stripes_allocated = map->num_stripes;
2503         free_extent_map(em);
···
2762                  int rw, struct bio *bio)
2763 {
2764     int should_queue = 1;
2765 +   struct btrfs_pending_bios *pending_bios;
2766
2767     /* don't bother with additional async steps for reads, right now */
2768     if (!(rw & (1 << BIO_RW))) {
···
2783     bio->bi_rw |= rw;
2784
2785     spin_lock(&device->io_lock);
2786 +   if (bio_sync(bio))
2787 +       pending_bios = &device->pending_sync_bios;
2788 +   else
2789 +       pending_bios = &device->pending_bios;
2790
2791 +   if (pending_bios->tail)
2792 +       pending_bios->tail->bi_next = bio;
2793
2794 +   pending_bios->tail = bio;
2795 +   if (!pending_bios->head)
2796 +       pending_bios->head = bio;
2797     if (device->running_pending)
2798         should_queue = 0;
+11 -2
fs/btrfs/volumes.h
···
23 #include "async-thread.h"
24
25 struct buffer_head;
26 struct btrfs_device {
27     struct list_head dev_list;
28     struct list_head dev_alloc_list;
29     struct btrfs_fs_devices *fs_devices;
30     struct btrfs_root *dev_root;
31 -   struct bio *pending_bios;
32 -   struct bio *pending_bio_tail;
33     int running_pending;
34     u64 generation;
35
···
23 #include "async-thread.h"
24
25 struct buffer_head;
26 +struct btrfs_pending_bios {
27 +   struct bio *head;
28 +   struct bio *tail;
29 +};
30 +
31 struct btrfs_device {
32     struct list_head dev_list;
33     struct list_head dev_alloc_list;
34     struct btrfs_fs_devices *fs_devices;
35     struct btrfs_root *dev_root;
36 +
37 +   /* regular prio bios */
38 +   struct btrfs_pending_bios pending_bios;
39 +   /* WRITE_SYNC bios */
40 +   struct btrfs_pending_bios pending_sync_bios;
41 +
42     int running_pending;
43     u64 generation;
44