Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan to wait on in the near future. This patch
mirrors recent changes in other filesystems and the generic code to
use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for
other latency-critical writes.
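
As an illustration of the flag selection (a minimal userspace sketch, not
kernel code: the enum values and the choose_write_flags() helper below are
made-up stand-ins; in the patch itself the decision lives in
__extent_writepage() and picks WRITE_SYNC_PLUG for WB_SYNC_ALL writeback):

#include <stdio.h>

/* mock of the writeback sync modes and the bio write flags */
enum wb_sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };
enum write_flags  { PLAIN_WRITE, SYNC_WRITE };

/* hypothetical helper: callers that will soon wait on the IO get the sync flag */
static enum write_flags choose_write_flags(enum wb_sync_mode sync_mode)
{
    return sync_mode == WB_SYNC_ALL ? SYNC_WRITE : PLAIN_WRITE;
}

int main(void)
{
    printf("WB_SYNC_ALL  -> %s\n",
           choose_write_flags(WB_SYNC_ALL) == SYNC_WRITE ? "WRITE_SYNC" : "WRITE");
    printf("WB_SYNC_NONE -> %s\n",
           choose_write_flags(WB_SYNC_NONE) == SYNC_WRITE ? "WRITE_SYNC" : "WRITE");
    return 0;
}

The point is only that the sync mode, not the data path, decides whether the
block layer is told to expedite the IO.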

Btrfs uses async worker threads for checksumming before the write is
submitted, and then again to actually submit the bios. The bio submission
code just runs a per-device list of bios that need to be sent down the pipe.

This list is split into low-priority and high-priority lists so that the
WRITE_SYNC IO happens first.
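
A standalone sketch of that queueing scheme (a userspace mock under assumed
names: struct mock_bio, queue_bio() and run_pending() stand in for the
per-device btrfs_pending_bios lists, the schedule_bio()/bio_sync() queueing
side, and the submission loop in fs/btrfs/volumes.c):

#include <stdio.h>
#include <stdlib.h>

struct mock_bio {
    int id;
    int sync;                 /* set for WRITE_SYNC style IO */
    struct mock_bio *next;
};

struct pending_list {
    struct mock_bio *head;
    struct mock_bio *tail;
};

struct mock_device {
    struct pending_list pending;        /* regular prio */
    struct pending_list pending_sync;   /* high prio (WRITE_SYNC) */
};

static void queue_bio(struct mock_device *dev, struct mock_bio *bio)
{
    /* pick a list based on the sync flag, the way schedule_bio() uses bio_sync() */
    struct pending_list *list = bio->sync ? &dev->pending_sync : &dev->pending;

    bio->next = NULL;
    if (list->tail)
        list->tail->next = bio;
    list->tail = bio;
    if (!list->head)
        list->head = bio;
}

static void run_pending(struct mock_device *dev)
{
    /* always drain the sync list before the regular one */
    while (dev->pending_sync.head || dev->pending.head) {
        struct pending_list *list =
            dev->pending_sync.head ? &dev->pending_sync : &dev->pending;
        struct mock_bio *cur = list->head;

        list->head = cur->next;
        if (!list->head)
            list->tail = NULL;
        printf("submit bio %d (%s)\n", cur->id, cur->sync ? "sync" : "regular");
        free(cur);
    }
}

int main(void)
{
    struct mock_device dev = { {NULL, NULL}, {NULL, NULL} };
    int i;

    for (i = 0; i < 6; i++) {
        struct mock_bio *bio = calloc(1, sizeof(*bio));
        bio->id = i;
        bio->sync = (i % 3 == 0);   /* a few high prio writes */
        queue_bio(&dev, bio);
    }
    run_pending(&dev);              /* sync bios 0 and 3 come out first */
    return 0;
}

The real loop also requeues and unplugs, but the ordering rule is the same:
if anything is on the sync list, it goes down before the regular list is
touched.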

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+142 -47
+2 -2
fs/btrfs/disk-io.c
···
                 device->barriers = 0;
                 get_bh(bh);
                 lock_buffer(bh);
-                ret = submit_bh(WRITE, bh);
+                ret = submit_bh(WRITE_SYNC, bh);
             }
         } else {
-            ret = submit_bh(WRITE, bh);
+            ret = submit_bh(WRITE_SYNC, bh);
         }
 
         if (!ret && wait) {
+31 -15
fs/btrfs/extent_io.c
···
     /* tells writepage not to lock the state bits for this range
      * it still does the unlocking
      */
-    int extent_locked;
+    unsigned int extent_locked:1;
+
+    /* tells the submit_bio code to use a WRITE_SYNC */
+    unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
···
     u64 delalloc_end;
     int page_started;
     int compressed;
+    int write_flags;
     unsigned long nr_written = 0;
+
+    if (wbc->sync_mode == WB_SYNC_ALL)
+        write_flags = WRITE_SYNC_PLUG;
+    else
+        write_flags = WRITE;
 
     WARN_ON(!PageLocked(page));
     pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
···
                        (unsigned long long)end);
             }
 
-            ret = submit_extent_page(WRITE, tree, page, sector,
-                                     iosize, pg_offset, bdev,
-                                     &epd->bio, max_nr,
+            ret = submit_extent_page(write_flags, tree, page,
+                                     sector, iosize, pg_offset,
+                                     bdev, &epd->bio, max_nr,
                                      end_bio_extent_writepage,
                                      0, 0, 0);
             if (ret)
···
     return ret;
 }
 
+static void flush_epd_write_bio(struct extent_page_data *epd)
+{
+    if (epd->bio) {
+        if (epd->sync_io)
+            submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+        else
+            submit_one_bio(WRITE, epd->bio, 0, 0);
+        epd->bio = NULL;
+    }
+}
+
 static noinline void flush_write_bio(void *data)
 {
     struct extent_page_data *epd = data;
-    if (epd->bio) {
-        submit_one_bio(WRITE, epd->bio, 0, 0);
-        epd->bio = NULL;
-    }
+    flush_epd_write_bio(epd);
 }
 
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
···
         .tree = tree,
         .get_extent = get_extent,
         .extent_locked = 0,
+        .sync_io = wbc->sync_mode == WB_SYNC_ALL,
     };
     struct writeback_control wbc_writepages = {
         .bdi = wbc->bdi,
···
         .range_end = (loff_t)-1,
     };
 
-
     ret = __extent_writepage(page, wbc, &epd);
 
     extent_write_cache_pages(tree, mapping, &wbc_writepages,
                              __extent_writepage, &epd, flush_write_bio);
-    if (epd.bio)
-        submit_one_bio(WRITE, epd.bio, 0, 0);
+    flush_epd_write_bio(&epd);
     return ret;
 }
 
···
         .tree = tree,
         .get_extent = get_extent,
         .extent_locked = 1,
+        .sync_io = mode == WB_SYNC_ALL,
     };
     struct writeback_control wbc_writepages = {
         .bdi = inode->i_mapping->backing_dev_info,
···
         start += PAGE_CACHE_SIZE;
     }
 
-    if (epd.bio)
-        submit_one_bio(WRITE, epd.bio, 0, 0);
+    flush_epd_write_bio(&epd);
     return ret;
 }
 
···
         .tree = tree,
         .get_extent = get_extent,
         .extent_locked = 0,
+        .sync_io = wbc->sync_mode == WB_SYNC_ALL,
     };
 
     ret = extent_write_cache_pages(tree, mapping, wbc,
                                    __extent_writepage, &epd,
                                    flush_write_bio);
-    if (epd.bio)
-        submit_one_bio(WRITE, epd.bio, 0, 0);
+    flush_epd_write_bio(&epd);
     return ret;
 }
 
+1 -1
fs/btrfs/ordered-data.c
···
     /* start IO across the range first to instantiate any delalloc
      * extents
      */
-    btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+    btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
     /* The compression code will leave pages locked but return from
      * writepage without setting the page writeback. Starting again
+97 -27
fs/btrfs/volumes.c
···
     return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                         struct bio *head, struct bio *tail)
+{
+
+    struct bio *old_head;
+
+    old_head = pending_bios->head;
+    pending_bios->head = head;
+    if (pending_bios->tail)
+        tail->bi_next = old_head;
+    else
+        pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device. This greatly
···
     struct bio *pending;
     struct backing_dev_info *bdi;
     struct btrfs_fs_info *fs_info;
+    struct btrfs_pending_bios *pending_bios;
     struct bio *tail;
     struct bio *cur;
     int again = 0;
-    unsigned long num_run = 0;
+    unsigned long num_run;
+    unsigned long num_sync_run;
     unsigned long limit;
     unsigned long last_waited = 0;
 
···
     limit = btrfs_async_submit_limit(fs_info);
     limit = limit * 2 / 3;
 
+    /* we want to make sure that every time we switch from the sync
+     * list to the normal list, we unplug
+     */
+    num_sync_run = 0;
+
 loop:
     spin_lock(&device->io_lock);
+    num_run = 0;
 
 loop_lock:
+
     /* take all the bios off the list at once and process them
      * later on (without the lock held). But, remember the
      * tail and other pointers so the bios can be properly reinserted
      * into the list if we hit congestion
      */
-    pending = device->pending_bios;
-    tail = device->pending_bio_tail;
+    if (device->pending_sync_bios.head)
+        pending_bios = &device->pending_sync_bios;
+    else
+        pending_bios = &device->pending_bios;
+
+    pending = pending_bios->head;
+    tail = pending_bios->tail;
     WARN_ON(pending && !tail);
-    device->pending_bios = NULL;
-    device->pending_bio_tail = NULL;
 
     /*
      * if pending was null this time around, no bios need processing
···
      * device->running_pending is used to synchronize with the
      * schedule_bio code.
      */
-    if (pending) {
-        again = 1;
-        device->running_pending = 1;
-    } else {
+    if (device->pending_sync_bios.head == NULL &&
+        device->pending_bios.head == NULL) {
         again = 0;
         device->running_pending = 0;
+    } else {
+        again = 1;
+        device->running_pending = 1;
     }
+
+    pending_bios->head = NULL;
+    pending_bios->tail = NULL;
+
     spin_unlock(&device->io_lock);
 
+    /*
+     * if we're doing the regular priority list, make sure we unplug
+     * for any high prio bios we've sent down
+     */
+    if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+        num_sync_run = 0;
+        blk_run_backing_dev(bdi, NULL);
+    }
+
     while (pending) {
+
+        rmb();
+        if (pending_bios != &device->pending_sync_bios &&
+            device->pending_sync_bios.head &&
+            num_run > 16) {
+            cond_resched();
+            spin_lock(&device->io_lock);
+            requeue_list(pending_bios, pending, tail);
+            goto loop_lock;
+        }
+
         cur = pending;
         pending = pending->bi_next;
         cur->bi_next = NULL;
···
             wake_up(&fs_info->async_submit_wait);
 
         BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-        bio_get(cur);
         submit_bio(cur->bi_rw, cur);
-        bio_put(cur);
         num_run++;
+        if (bio_sync(cur))
+            num_sync_run++;
+
+        if (need_resched()) {
+            if (num_sync_run) {
+                blk_run_backing_dev(bdi, NULL);
+                num_sync_run = 0;
+            }
+            cond_resched();
+        }
 
         /*
          * we made progress, there is more work to do and the bdi
···
          */
         if (pending && bdi_write_congested(bdi) && num_run > 16 &&
             fs_info->fs_devices->open_devices > 1) {
-            struct bio *old_head;
             struct io_context *ioc;
 
             ioc = current->io_context;
···
                  * against it before looping
                  */
                 last_waited = ioc->last_waited;
+                if (need_resched()) {
+                    if (num_sync_run) {
+                        blk_run_backing_dev(bdi, NULL);
+                        num_sync_run = 0;
+                    }
+                    cond_resched();
+                }
                 continue;
             }
             spin_lock(&device->io_lock);
-
-            old_head = device->pending_bios;
-            device->pending_bios = pending;
-            if (device->pending_bio_tail)
-                tail->bi_next = old_head;
-            else
-                device->pending_bio_tail = tail;
-
+            requeue_list(pending_bios, pending, tail);
             device->running_pending = 1;
 
             spin_unlock(&device->io_lock);
···
             goto done;
         }
     }
+
+    if (num_sync_run) {
+        num_sync_run = 0;
+        blk_run_backing_dev(bdi, NULL);
+    }
+
+    cond_resched();
     if (again)
         goto loop;
 
     spin_lock(&device->io_lock);
-    if (device->pending_bios)
+    if (device->pending_bios.head || device->pending_sync_bios.head)
         goto loop_lock;
     spin_unlock(&device->io_lock);
 
···
             max_errors = 1;
         }
     }
-    if (multi_ret && rw == WRITE &&
+    if (multi_ret && (rw & (1 << BIO_RW)) &&
         stripes_allocated < stripes_required) {
         stripes_allocated = map->num_stripes;
         free_extent_map(em);
···
                          int rw, struct bio *bio)
 {
     int should_queue = 1;
+    struct btrfs_pending_bios *pending_bios;
 
     /* don't bother with additional async steps for reads, right now */
     if (!(rw & (1 << BIO_RW))) {
···
     bio->bi_rw |= rw;
 
     spin_lock(&device->io_lock);
+    if (bio_sync(bio))
+        pending_bios = &device->pending_sync_bios;
+    else
+        pending_bios = &device->pending_bios;
 
-    if (device->pending_bio_tail)
-        device->pending_bio_tail->bi_next = bio;
+    if (pending_bios->tail)
+        pending_bios->tail->bi_next = bio;
 
-    device->pending_bio_tail = bio;
-    if (!device->pending_bios)
-        device->pending_bios = bio;
+    pending_bios->tail = bio;
+    if (!pending_bios->head)
+        pending_bios->head = bio;
     if (device->running_pending)
         should_queue = 0;
 
+11 -2
fs/btrfs/volumes.h
···
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+    struct bio *head;
+    struct bio *tail;
+};
+
 struct btrfs_device {
     struct list_head dev_list;
     struct list_head dev_alloc_list;
     struct btrfs_fs_devices *fs_devices;
     struct btrfs_root *dev_root;
-    struct bio *pending_bios;
-    struct bio *pending_bio_tail;
+
+    /* regular prio bios */
+    struct btrfs_pending_bios pending_bios;
+    /* WRITE_SYNC bios */
+    struct btrfs_pending_bios pending_sync_bios;
+
     int running_pending;
     u64 generation;
 