Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.12-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- in incremental send, fix invalid clone operation for file that got
its size decreased

- fix __counted_by() annotation of send path cache entries, we do not
store the terminating NUL

- fix a longstanding bug in relocation (and quite hard to hit by
chance), drop back reference cache that can get out of sync after
transaction commit

- wait for fixup worker kthread before finishing umount

- add missing raid-stripe-tree extent for NOCOW files, zoned mode
cannot have NOCOW files but RST is meant to be a standalone feature

- handle transaction start error during relocation, avoid potential
NULL pointer dereference of relocation control structure (reported by
syzbot)

- disable module-wide rate limiting of debug level messages

- minor fix to tracepoint definition (reported by checkpatch.pl)

* tag 'for-6.12-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: disable rate limiting when debug enabled
btrfs: wait for fixup workers before stopping cleaner kthread during umount
  btrfs: fix a NULL pointer dereference when failed to start a new transaction
btrfs: send: fix invalid clone operation for file that got its size decreased
btrfs: tracepoints: end assignment with semicolon at btrfs_qgroup_extent event class
btrfs: drop the backref cache during relocation if we commit
btrfs: also add stripe entries for NOCOW writes
btrfs: send: fix buffer overflow detection when copying path to cache entry

+58 -83
+8 -4
fs/btrfs/backref.c
··· 3179 3179 btrfs_backref_cleanup_node(cache, node); 3180 3180 } 3181 3181 3182 - cache->last_trans = 0; 3183 - 3184 - for (i = 0; i < BTRFS_MAX_LEVEL; i++) 3185 - ASSERT(list_empty(&cache->pending[i])); 3182 + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 3183 + while (!list_empty(&cache->pending[i])) { 3184 + node = list_first_entry(&cache->pending[i], 3185 + struct btrfs_backref_node, 3186 + list); 3187 + btrfs_backref_cleanup_node(cache, node); 3188 + } 3189 + } 3186 3190 ASSERT(list_empty(&cache->pending_edge)); 3187 3191 ASSERT(list_empty(&cache->useless_node)); 3188 3192 ASSERT(list_empty(&cache->changed));
+11
fs/btrfs/disk-io.c
··· 4256 4256 btrfs_cleanup_defrag_inodes(fs_info); 4257 4257 4258 4258 /* 4259 + * Wait for any fixup workers to complete. 4260 + * If we don't wait for them here and they are still running by the time 4261 + * we call kthread_stop() against the cleaner kthread further below, we 4262 + * get an use-after-free on the cleaner because the fixup worker adds an 4263 + * inode to the list of delayed iputs and then attempts to wakeup the 4264 + * cleaner kthread, which was already stopped and destroyed. We parked 4265 + * already the cleaner, but below we run all pending delayed iputs. 4266 + */ 4267 + btrfs_flush_workqueue(fs_info->fixup_workers); 4268 + 4269 + /* 4259 4270 * After we parked the cleaner kthread, ordered extents may have 4260 4271 * completed and created new delayed iputs. If one of the async reclaim 4261 4272 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+5
fs/btrfs/inode.c
··· 3111 3111 ret = btrfs_update_inode_fallback(trans, inode); 3112 3112 if (ret) /* -ENOMEM or corruption */ 3113 3113 btrfs_abort_transaction(trans, ret); 3114 + 3115 + ret = btrfs_insert_raid_extent(trans, ordered_extent); 3116 + if (ret) 3117 + btrfs_abort_transaction(trans, ret); 3118 + 3114 3119 goto out; 3115 3120 } 3116 3121
+2 -1
fs/btrfs/messages.c
··· 239 239 vaf.fmt = fmt; 240 240 vaf.va = &args; 241 241 242 - if (__ratelimit(ratelimit)) { 242 + /* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */ 243 + if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) { 243 244 if (fs_info) { 244 245 char statestr[STATE_STRING_BUF_LEN]; 245 246
+4 -73
fs/btrfs/relocation.c
··· 232 232 return NULL; 233 233 } 234 234 235 - static void update_backref_node(struct btrfs_backref_cache *cache, 236 - struct btrfs_backref_node *node, u64 bytenr) 237 - { 238 - struct rb_node *rb_node; 239 - rb_erase(&node->rb_node, &cache->rb_root); 240 - node->bytenr = bytenr; 241 - rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); 242 - if (rb_node) 243 - btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); 244 - } 245 - 246 - /* 247 - * update backref cache after a transaction commit 248 - */ 249 - static int update_backref_cache(struct btrfs_trans_handle *trans, 250 - struct btrfs_backref_cache *cache) 251 - { 252 - struct btrfs_backref_node *node; 253 - int level = 0; 254 - 255 - if (cache->last_trans == 0) { 256 - cache->last_trans = trans->transid; 257 - return 0; 258 - } 259 - 260 - if (cache->last_trans == trans->transid) 261 - return 0; 262 - 263 - /* 264 - * detached nodes are used to avoid unnecessary backref 265 - * lookup. transaction commit changes the extent tree. 266 - * so the detached nodes are no longer useful. 267 - */ 268 - while (!list_empty(&cache->detached)) { 269 - node = list_entry(cache->detached.next, 270 - struct btrfs_backref_node, list); 271 - btrfs_backref_cleanup_node(cache, node); 272 - } 273 - 274 - while (!list_empty(&cache->changed)) { 275 - node = list_entry(cache->changed.next, 276 - struct btrfs_backref_node, list); 277 - list_del_init(&node->list); 278 - BUG_ON(node->pending); 279 - update_backref_node(cache, node, node->new_bytenr); 280 - } 281 - 282 - /* 283 - * some nodes can be left in the pending list if there were 284 - * errors during processing the pending nodes. 
285 - */ 286 - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 287 - list_for_each_entry(node, &cache->pending[level], list) { 288 - BUG_ON(!node->pending); 289 - if (node->bytenr == node->new_bytenr) 290 - continue; 291 - update_backref_node(cache, node, node->new_bytenr); 292 - } 293 - } 294 - 295 - cache->last_trans = 0; 296 - return 1; 297 - } 298 - 299 235 static bool reloc_root_is_dead(const struct btrfs_root *root) 300 236 { 301 237 /* ··· 486 550 struct btrfs_backref_edge *edge; 487 551 struct btrfs_backref_edge *new_edge; 488 552 struct rb_node *rb_node; 489 - 490 - if (cache->last_trans > 0) 491 - update_backref_cache(trans, cache); 492 553 493 554 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); 494 555 if (rb_node) { ··· 856 923 btrfs_grab_root(reloc_root); 857 924 858 925 /* root->reloc_root will stay until current relocation finished */ 859 - if (fs_info->reloc_ctl->merge_reloc_tree && 926 + if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree && 860 927 btrfs_root_refs(root_item) == 0) { 861 928 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 862 929 /* ··· 3631 3698 break; 3632 3699 } 3633 3700 restart: 3634 - if (update_backref_cache(trans, &rc->backref_cache)) { 3635 - btrfs_end_transaction(trans); 3636 - trans = NULL; 3637 - continue; 3638 - } 3701 + if (rc->backref_cache.last_trans != trans->transid) 3702 + btrfs_backref_release_cache(&rc->backref_cache); 3703 + rc->backref_cache.last_trans = trans->transid; 3639 3704 3640 3705 ret = find_next_extent(rc, path, &key); 3641 3706 if (ret < 0)
+27 -4
fs/btrfs/send.c
··· 346 346 u64 parent_gen; 347 347 int ret; 348 348 int need_later_update; 349 + /* Name length without NUL terminator. */ 349 350 int name_len; 350 - char name[] __counted_by(name_len); 351 + /* Not NUL terminated. */ 352 + char name[] __counted_by(name_len) __nonstring; 351 353 }; 352 354 353 355 /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ ··· 2390 2388 /* 2391 2389 * Store the result of the lookup in the name cache. 2392 2390 */ 2393 - nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); 2391 + nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); 2394 2392 if (!nce) { 2395 2393 ret = -ENOMEM; 2396 2394 goto out; ··· 2402 2400 nce->parent_gen = *parent_gen; 2403 2401 nce->name_len = fs_path_len(dest); 2404 2402 nce->ret = ret; 2405 - strcpy(nce->name, dest->start); 2403 + memcpy(nce->name, dest->start, nce->name_len); 2406 2404 2407 2405 if (ino < sctx->send_progress) 2408 2406 nce->need_later_update = 0; ··· 6189 6187 if (ret < 0) 6190 6188 return ret; 6191 6189 6192 - if (clone_root->offset + num_bytes == info.size) 6190 + if (clone_root->offset + num_bytes == info.size) { 6191 + /* 6192 + * The final size of our file matches the end offset, but it may 6193 + * be that its current size is larger, so we have to truncate it 6194 + * to any value between the start offset of the range and the 6195 + * final i_size, otherwise the clone operation is invalid 6196 + * because it's unaligned and it ends before the current EOF. 6197 + * We do this truncate to the final i_size when we finish 6198 + * processing the inode, but it's too late by then. And here we 6199 + * truncate to the start offset of the range because it's always 6200 + * sector size aligned while if it were the final i_size it 6201 + * would result in dirtying part of a page, filling part of a 6202 + * page with zeroes and then having the clone operation at the 6203 + * receiver trigger IO and wait for it due to the dirty page. 
6204 + */ 6205 + if (sctx->parent_root != NULL) { 6206 + ret = send_truncate(sctx, sctx->cur_ino, 6207 + sctx->cur_inode_gen, offset); 6208 + if (ret < 0) 6209 + return ret; 6210 + } 6193 6211 goto clone_data; 6212 + } 6194 6213 6195 6214 write_data: 6196 6215 ret = send_extent_data(sctx, path, offset, num_bytes);
+1 -1
include/trace/events/btrfs.h
··· 1716 1716 ), 1717 1717 1718 1718 TP_fast_assign_btrfs(fs_info, 1719 - __entry->bytenr = rec->bytenr, 1719 + __entry->bytenr = rec->bytenr; 1720 1720 __entry->num_bytes = rec->num_bytes; 1721 1721 ), 1722 1722