Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull more btrfs updates from David Sterba:
"Fixes that arrived after the merge window freeze, mostly stable
material.

- fix race in tree-mod-log element tracking

- fix bio flushing inside extent writepages

- fix assertion when in-memory tracking of discarded extents finds an
empty tree (e.g. after adding a new device)

- update logic of temporary read-only block groups to take into
account overcommit

- fix some fixup worker corner cases:
- page could not go through proper COW cycle and the dirty status
is lost due to page migration
- deadlock if delayed allocation is performed under page lock

- fix send emitting invalid clones within the same file

- fix statfs reporting 0 free space when global block reserve size is
larger than remaining free space but there is still space for new
chunks"

* tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: do not zero f_bavail if we have available space
Btrfs: send, fix emission of invalid clone operations within the same file
btrfs: do not do delalloc reservation under page lock
btrfs: drop the -EBUSY case in __extent_writepage_io
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
btrfs: take overcommit into account in inc_block_group_ro
btrfs: fix force usage in inc_block_group_ro
btrfs: Correctly handle empty trees in find_first_clear_extent_bit
btrfs: flush write bio if we loop in extent_write_cache_pages
Btrfs: fix race between adding and putting tree mod seq elements and nodes

+193 -83
+27 -12
fs/btrfs/block-group.c
··· 1191 1191 { 1192 1192 struct btrfs_space_info *sinfo = cache->space_info; 1193 1193 u64 num_bytes; 1194 - u64 sinfo_used; 1195 1194 int ret = -ENOSPC; 1196 1195 1197 1196 spin_lock(&sinfo->lock); ··· 1204 1205 1205 1206 num_bytes = cache->length - cache->reserved - cache->pinned - 1206 1207 cache->bytes_super - cache->used; 1207 - sinfo_used = btrfs_space_info_used(sinfo, true); 1208 1208 1209 1209 /* 1210 - * sinfo_used + num_bytes should always <= sinfo->total_bytes. 1211 - * 1212 - * Here we make sure if we mark this bg RO, we still have enough 1213 - * free space as buffer. 1210 + * Data never overcommits, even in mixed mode, so do just the straight 1211 + * check of left over space in how much we have allocated. 1214 1212 */ 1215 - if (sinfo_used + num_bytes <= sinfo->total_bytes) { 1213 + if (force) { 1214 + ret = 0; 1215 + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { 1216 + u64 sinfo_used = btrfs_space_info_used(sinfo, true); 1217 + 1218 + /* 1219 + * Here we make sure if we mark this bg RO, we still have enough 1220 + * free space as buffer. 1221 + */ 1222 + if (sinfo_used + num_bytes <= sinfo->total_bytes) 1223 + ret = 0; 1224 + } else { 1225 + /* 1226 + * We overcommit metadata, so we need to do the 1227 + * btrfs_can_overcommit check here, and we need to pass in 1228 + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of 1229 + * leeway to allow us to mark this block group as read only. 
1230 + */ 1231 + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, 1232 + BTRFS_RESERVE_NO_FLUSH)) 1233 + ret = 0; 1234 + } 1235 + 1236 + if (!ret) { 1216 1237 sinfo->bytes_readonly += num_bytes; 1217 1238 cache->ro++; 1218 1239 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 1219 - ret = 0; 1220 1240 } 1221 1241 out: 1222 1242 spin_unlock(&cache->lock); ··· 1243 1225 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 1244 1226 btrfs_info(cache->fs_info, 1245 1227 "unable to make block group %llu ro", cache->start); 1246 - btrfs_info(cache->fs_info, 1247 - "sinfo_used=%llu bg_num_bytes=%llu", 1248 - sinfo_used, num_bytes); 1249 1228 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 1250 1229 } 1251 1230 return ret; ··· 2240 2225 } 2241 2226 } 2242 2227 2243 - ret = inc_block_group_ro(cache, !do_chunk_alloc); 2228 + ret = inc_block_group_ro(cache, 0); 2244 2229 if (!do_chunk_alloc) 2245 2230 goto unlock_out; 2246 2231 if (!ret)
+2 -6
fs/btrfs/ctree.c
··· 326 326 struct seq_list *elem) 327 327 { 328 328 write_lock(&fs_info->tree_mod_log_lock); 329 - spin_lock(&fs_info->tree_mod_seq_lock); 330 329 if (!elem->seq) { 331 330 elem->seq = btrfs_inc_tree_mod_seq(fs_info); 332 331 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); 333 332 } 334 - spin_unlock(&fs_info->tree_mod_seq_lock); 335 333 write_unlock(&fs_info->tree_mod_log_lock); 336 334 337 335 return elem->seq; ··· 349 351 if (!seq_putting) 350 352 return; 351 353 352 - spin_lock(&fs_info->tree_mod_seq_lock); 354 + write_lock(&fs_info->tree_mod_log_lock); 353 355 list_del(&elem->list); 354 356 elem->seq = 0; 355 357 ··· 360 362 * blocker with lower sequence number exists, we 361 363 * cannot remove anything from the log 362 364 */ 363 - spin_unlock(&fs_info->tree_mod_seq_lock); 365 + write_unlock(&fs_info->tree_mod_log_lock); 364 366 return; 365 367 } 366 368 min_seq = cur_elem->seq; 367 369 } 368 370 } 369 - spin_unlock(&fs_info->tree_mod_seq_lock); 370 371 371 372 /* 372 373 * anything that's lower than the lowest existing (read: blocked) 373 374 * sequence number can be removed from the tree. 374 375 */ 375 - write_lock(&fs_info->tree_mod_log_lock); 376 376 tm_root = &fs_info->tree_mod_log; 377 377 for (node = rb_first(tm_root); node; node = next) { 378 378 next = rb_next(node);
+2 -4
fs/btrfs/ctree.h
··· 714 714 atomic_t nr_delayed_iputs; 715 715 wait_queue_head_t delayed_iputs_wait; 716 716 717 - /* this protects tree_mod_seq_list */ 718 - spinlock_t tree_mod_seq_lock; 719 717 atomic64_t tree_mod_seq; 720 - struct list_head tree_mod_seq_list; 721 718 722 - /* this protects tree_mod_log */ 719 + /* this protects tree_mod_log and tree_mod_seq_list */ 723 720 rwlock_t tree_mod_log_lock; 724 721 struct rb_root tree_mod_log; 722 + struct list_head tree_mod_seq_list; 725 723 726 724 atomic_t async_delalloc_pages; 727 725
+4 -4
fs/btrfs/delayed-ref.c
··· 492 492 if (head->is_data) 493 493 return; 494 494 495 - spin_lock(&fs_info->tree_mod_seq_lock); 495 + read_lock(&fs_info->tree_mod_log_lock); 496 496 if (!list_empty(&fs_info->tree_mod_seq_list)) { 497 497 struct seq_list *elem; 498 498 ··· 500 500 struct seq_list, list); 501 501 seq = elem->seq; 502 502 } 503 - spin_unlock(&fs_info->tree_mod_seq_lock); 503 + read_unlock(&fs_info->tree_mod_log_lock); 504 504 505 505 again: 506 506 for (node = rb_first_cached(&head->ref_tree); node; ··· 518 518 struct seq_list *elem; 519 519 int ret = 0; 520 520 521 - spin_lock(&fs_info->tree_mod_seq_lock); 521 + read_lock(&fs_info->tree_mod_log_lock); 522 522 if (!list_empty(&fs_info->tree_mod_seq_list)) { 523 523 elem = list_first_entry(&fs_info->tree_mod_seq_list, 524 524 struct seq_list, list); ··· 531 531 } 532 532 } 533 533 534 - spin_unlock(&fs_info->tree_mod_seq_lock); 534 + read_unlock(&fs_info->tree_mod_log_lock); 535 535 return ret; 536 536 } 537 537
-1
fs/btrfs/disk-io.c
··· 2697 2697 spin_lock_init(&fs_info->fs_roots_radix_lock); 2698 2698 spin_lock_init(&fs_info->delayed_iput_lock); 2699 2699 spin_lock_init(&fs_info->defrag_inodes_lock); 2700 - spin_lock_init(&fs_info->tree_mod_seq_lock); 2701 2700 spin_lock_init(&fs_info->super_lock); 2702 2701 spin_lock_init(&fs_info->buffer_lock); 2703 2702 spin_lock_init(&fs_info->unused_bgs_lock);
+29 -20
fs/btrfs/extent_io.c
··· 1593 1593 /* Find first extent with bits cleared */ 1594 1594 while (1) { 1595 1595 node = __etree_search(tree, start, &next, &prev, NULL, NULL); 1596 - if (!node) { 1596 + if (!node && !next && !prev) { 1597 + /* 1598 + * Tree is completely empty, send full range and let 1599 + * caller deal with it 1600 + */ 1601 + *start_ret = 0; 1602 + *end_ret = -1; 1603 + goto out; 1604 + } else if (!node && !next) { 1605 + /* 1606 + * We are past the last allocated chunk, set start at 1607 + * the end of the last extent. 1608 + */ 1609 + state = rb_entry(prev, struct extent_state, rb_node); 1610 + *start_ret = state->end + 1; 1611 + *end_ret = -1; 1612 + goto out; 1613 + } else if (!node) { 1597 1614 node = next; 1598 - if (!node) { 1599 - /* 1600 - * We are past the last allocated chunk, 1601 - * set start at the end of the last extent. The 1602 - * device alloc tree should never be empty so 1603 - * prev is always set. 1604 - */ 1605 - ASSERT(prev); 1606 - state = rb_entry(prev, struct extent_state, rb_node); 1607 - *start_ret = state->end + 1; 1608 - *end_ret = -1; 1609 - goto out; 1610 - } 1611 1615 } 1612 1616 /* 1613 1617 * At this point 'node' either contains 'start' or start is ··· 3442 3438 ret = btrfs_writepage_cow_fixup(page, start, page_end); 3443 3439 if (ret) { 3444 3440 /* Fixup worker will requeue */ 3445 - if (ret == -EBUSY) 3446 - wbc->pages_skipped++; 3447 - else 3448 - redirty_page_for_writepage(wbc, page); 3449 - 3441 + redirty_page_for_writepage(wbc, page); 3450 3442 update_nr_written(wbc, nr_written); 3451 3443 unlock_page(page); 3452 3444 return 1; ··· 4166 4166 */ 4167 4167 scanned = 1; 4168 4168 index = 0; 4169 - goto retry; 4169 + 4170 + /* 4171 + * If we're looping we could run into a page that is locked by a 4172 + * writer and that writer could be waiting on writeback for a 4173 + * page in our current bio, and thus deadlock, so flush the 4174 + * write bio here. 
4175 + */ 4176 + ret = flush_write_bio(epd); 4177 + if (!ret) 4178 + goto retry; 4170 4179 } 4171 4180 4172 4181 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+96 -25
fs/btrfs/inode.c
··· 2189 2189 /* see btrfs_writepage_start_hook for details on why this is required */ 2190 2190 struct btrfs_writepage_fixup { 2191 2191 struct page *page; 2192 + struct inode *inode; 2192 2193 struct btrfs_work work; 2193 2194 }; 2194 2195 ··· 2203 2202 struct inode *inode; 2204 2203 u64 page_start; 2205 2204 u64 page_end; 2206 - int ret; 2205 + int ret = 0; 2206 + bool free_delalloc_space = true; 2207 2207 2208 2208 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2209 2209 page = fixup->page; 2210 + inode = fixup->inode; 2211 + page_start = page_offset(page); 2212 + page_end = page_offset(page) + PAGE_SIZE - 1; 2213 + 2214 + /* 2215 + * This is similar to page_mkwrite, we need to reserve the space before 2216 + * we take the page lock. 2217 + */ 2218 + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2219 + PAGE_SIZE); 2210 2220 again: 2211 2221 lock_page(page); 2222 + 2223 + /* 2224 + * Before we queued this fixup, we took a reference on the page. 2225 + * page->mapping may go NULL, but it shouldn't be moved to a different 2226 + * address space. 2227 + */ 2212 2228 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2213 - ClearPageChecked(page); 2229 + /* 2230 + * Unfortunately this is a little tricky, either 2231 + * 2232 + * 1) We got here and our page had already been dealt with and 2233 + * we reserved our space, thus ret == 0, so we need to just 2234 + * drop our space reservation and bail. This can happen the 2235 + * first time we come into the fixup worker, or could happen 2236 + * while waiting for the ordered extent. 2237 + * 2) Our page was already dealt with, but we happened to get an 2238 + * ENOSPC above from the btrfs_delalloc_reserve_space. In 2239 + * this case we obviously don't have anything to release, but 2240 + * because the page was already dealt with we don't want to 2241 + * mark the page with an error, so make sure we're resetting 2242 + * ret to 0. 
This is why we have this check _before_ the ret 2243 + * check, because we do not want to have a surprise ENOSPC 2244 + * when the page was already properly dealt with. 2245 + */ 2246 + if (!ret) { 2247 + btrfs_delalloc_release_extents(BTRFS_I(inode), 2248 + PAGE_SIZE); 2249 + btrfs_delalloc_release_space(inode, data_reserved, 2250 + page_start, PAGE_SIZE, 2251 + true); 2252 + } 2253 + ret = 0; 2214 2254 goto out_page; 2215 2255 } 2216 2256 2217 - inode = page->mapping->host; 2218 - page_start = page_offset(page); 2219 - page_end = page_offset(page) + PAGE_SIZE - 1; 2257 + /* 2258 + * We can't mess with the page state unless it is locked, so now that 2259 + * it is locked bail if we failed to make our space reservation. 2260 + */ 2261 + if (ret) 2262 + goto out_page; 2220 2263 2221 2264 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 2222 2265 &cached_state); 2223 2266 2224 2267 /* already ordered? We're done */ 2225 2268 if (PagePrivate2(page)) 2226 - goto out; 2269 + goto out_reserved; 2227 2270 2228 2271 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 2229 2272 PAGE_SIZE); ··· 2280 2235 goto again; 2281 2236 } 2282 2237 2283 - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2284 - PAGE_SIZE); 2285 - if (ret) { 2286 - mapping_set_error(page->mapping, ret); 2287 - end_extent_writepage(page, ret, page_start, page_end); 2288 - ClearPageChecked(page); 2289 - goto out; 2290 - } 2291 - 2292 2238 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2293 2239 &cached_state); 2294 - if (ret) { 2295 - mapping_set_error(page->mapping, ret); 2296 - end_extent_writepage(page, ret, page_start, page_end); 2297 - ClearPageChecked(page); 2240 + if (ret) 2298 2241 goto out_reserved; 2299 - } 2300 2242 2301 - ClearPageChecked(page); 2302 - set_page_dirty(page); 2243 + /* 2244 + * Everything went as planned, we're now the owner of a dirty page with 2245 + * delayed allocation bits set and space reserved for our COW 
2246 + * destination. 2247 + * 2248 + * The page was dirty when we started, nothing should have cleaned it. 2249 + */ 2250 + BUG_ON(!PageDirty(page)); 2251 + free_delalloc_space = false; 2303 2252 out_reserved: 2304 2253 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 2305 - if (ret) 2254 + if (free_delalloc_space) 2306 2255 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2307 2256 PAGE_SIZE, true); 2308 - out: 2309 2257 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2310 2258 &cached_state); 2311 2259 out_page: 2260 + if (ret) { 2261 + /* 2262 + * We hit ENOSPC or other errors. Update the mapping and page 2263 + * to reflect the errors and clean the page. 2264 + */ 2265 + mapping_set_error(page->mapping, ret); 2266 + end_extent_writepage(page, ret, page_start, page_end); 2267 + clear_page_dirty_for_io(page); 2268 + SetPageError(page); 2269 + } 2270 + ClearPageChecked(page); 2312 2271 unlock_page(page); 2313 2272 put_page(page); 2314 2273 kfree(fixup); 2315 2274 extent_changeset_free(data_reserved); 2275 + /* 2276 + * As a precaution, do a delayed iput in case it would be the last iput 2277 + * that could need flushing space. Recursing back to fixup worker would 2278 + * deadlock. 2279 + */ 2280 + btrfs_add_delayed_iput(inode); 2316 2281 } 2317 2282 2318 2283 /* ··· 2346 2291 if (TestClearPagePrivate2(page)) 2347 2292 return 0; 2348 2293 2294 + /* 2295 + * PageChecked is set below when we create a fixup worker for this page, 2296 + * don't try to create another one if we're already PageChecked() 2297 + * 2298 + * The extent_io writepage code will redirty the page if we send back 2299 + * EAGAIN. 2300 + */ 2349 2301 if (PageChecked(page)) 2350 2302 return -EAGAIN; 2351 2303 ··· 2360 2298 if (!fixup) 2361 2299 return -EAGAIN; 2362 2300 2301 + /* 2302 + * We are already holding a reference to this inode from 2303 + * write_cache_pages. 
We need to hold it because the space reservation 2304 + * takes place outside of the page lock, and we can't trust 2305 + * page->mapping outside of the page lock. 2306 + */ 2307 + ihold(inode); 2363 2308 SetPageChecked(page); 2364 2309 get_page(page); 2365 2310 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 2366 2311 fixup->page = page; 2312 + fixup->inode = inode; 2367 2313 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2368 - return -EBUSY; 2314 + 2315 + return -EAGAIN; 2369 2316 } 2370 2317 2371 2318 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+2 -1
fs/btrfs/send.c
··· 1269 1269 * destination of the stream. 1270 1270 */ 1271 1271 if (ino == bctx->cur_objectid && 1272 - offset >= bctx->sctx->cur_inode_next_write_offset) 1272 + offset + bctx->extent_len > 1273 + bctx->sctx->cur_inode_next_write_offset) 1273 1274 return 0; 1274 1275 } 1275 1276
+10 -8
fs/btrfs/space-info.c
··· 159 159 return (global->size << 1); 160 160 } 161 161 162 - static int can_overcommit(struct btrfs_fs_info *fs_info, 163 - struct btrfs_space_info *space_info, u64 bytes, 164 - enum btrfs_reserve_flush_enum flush) 162 + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, 163 + struct btrfs_space_info *space_info, u64 bytes, 164 + enum btrfs_reserve_flush_enum flush) 165 165 { 166 166 u64 profile; 167 167 u64 avail; ··· 226 226 227 227 /* Check and see if our ticket can be satisified now. */ 228 228 if ((used + ticket->bytes <= space_info->total_bytes) || 229 - can_overcommit(fs_info, space_info, ticket->bytes, flush)) { 229 + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, 230 + flush)) { 230 231 btrfs_space_info_update_bytes_may_use(fs_info, 231 232 space_info, 232 233 ticket->bytes); ··· 640 639 return to_reclaim; 641 640 642 641 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 643 - if (can_overcommit(fs_info, space_info, to_reclaim, 644 - BTRFS_RESERVE_FLUSH_ALL)) 642 + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, 643 + BTRFS_RESERVE_FLUSH_ALL)) 645 644 return 0; 646 645 647 646 used = btrfs_space_info_used(space_info, true); 648 647 649 - if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 648 + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, 649 + BTRFS_RESERVE_FLUSH_ALL)) 650 650 expected = div_factor_fine(space_info->total_bytes, 95); 651 651 else 652 652 expected = div_factor_fine(space_info->total_bytes, 90); ··· 1006 1004 */ 1007 1005 if (!pending_tickets && 1008 1006 ((used + orig_bytes <= space_info->total_bytes) || 1009 - can_overcommit(fs_info, space_info, orig_bytes, flush))) { 1007 + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { 1010 1008 btrfs_space_info_update_bytes_may_use(fs_info, space_info, 1011 1009 orig_bytes); 1012 1010 ret = 0;
+3
fs/btrfs/space-info.h
··· 127 127 enum btrfs_reserve_flush_enum flush); 128 128 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, 129 129 struct btrfs_space_info *space_info); 130 + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, 131 + struct btrfs_space_info *space_info, u64 bytes, 132 + enum btrfs_reserve_flush_enum flush); 130 133 131 134 static inline void btrfs_space_info_free_bytes_may_use( 132 135 struct btrfs_fs_info *fs_info,
+9 -1
fs/btrfs/super.c
··· 2135 2135 */ 2136 2136 thresh = SZ_4M; 2137 2137 2138 - if (!mixed && total_free_meta - thresh < block_rsv->size) 2138 + /* 2139 + * We only want to claim there's no available space if we can no longer 2140 + * allocate chunks for our metadata profile and our global reserve will 2141 + * not fit in the free metadata space. If we aren't ->full then we 2142 + * still can allocate chunks and thus are fine using the currently 2143 + * calculated f_bavail. 2144 + */ 2145 + if (!mixed && block_rsv->space_info->full && 2146 + total_free_meta - thresh < block_rsv->size) 2139 2147 buf->f_bavail = 0; 2140 2148 2141 2149 buf->f_type = BTRFS_SUPER_MAGIC;
-1
fs/btrfs/tests/btrfs-tests.c
··· 142 142 spin_lock_init(&fs_info->qgroup_lock); 143 143 spin_lock_init(&fs_info->super_lock); 144 144 spin_lock_init(&fs_info->fs_roots_radix_lock); 145 - spin_lock_init(&fs_info->tree_mod_seq_lock); 146 145 mutex_init(&fs_info->qgroup_ioctl_lock); 147 146 mutex_init(&fs_info->qgroup_rescan_lock); 148 147 rwlock_init(&fs_info->tree_mod_log_lock);
+9
fs/btrfs/tests/extent-io-tests.c
··· 441 441 int ret = -EINVAL; 442 442 443 443 test_msg("running find_first_clear_extent_bit test"); 444 + 444 445 extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); 445 446 447 + /* Test correct handling of empty tree */ 448 + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); 449 + if (start != 0 || end != -1) { 450 + test_err( 451 + "error getting a range from completely empty tree: start %llu end %llu", 452 + start, end); 453 + goto out; 454 + } 446 455 /* 447 456 * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between 448 457 * 4M-32M