Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.17-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- fix a few races related to inode link count

- fix inode leak on failure to add link to inode

- move transaction aborts closer to where they happen

* tag 'for-6.17-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: avoid load/store tearing races when checking if an inode was logged
btrfs: fix race between setting last_dir_index_offset and inode logging
btrfs: fix race between logging inode and checking if it was logged before
btrfs: simplify error handling logic for btrfs_link()
btrfs: fix inode leak on failure to add link to inode
btrfs: abort transaction on failure to add link to inode

+79 -51
+1 -1
fs/btrfs/btrfs_inode.h
··· 248 248 u64 new_delalloc_bytes; 249 249 /* 250 250 * The offset of the last dir index key that was logged. 251 - * This is used only for directories. 251 + * This is used only for directories. Protected by 'log_mutex'. 252 252 */ 253 253 u64 last_dir_index_offset; 254 254 };
+25 -25
fs/btrfs/inode.c
··· 6805 6805 struct fscrypt_name fname; 6806 6806 u64 index; 6807 6807 int ret; 6808 - int drop_inode = 0; 6809 6808 6810 6809 /* do not allow sys_link's with other subvols of the same device */ 6811 6810 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) ··· 6836 6837 6837 6838 /* There are several dir indexes for this inode, clear the cache. */ 6838 6839 BTRFS_I(inode)->dir_index = 0ULL; 6839 - inc_nlink(inode); 6840 6840 inode_inc_iversion(inode); 6841 6841 inode_set_ctime_current(inode); 6842 - ihold(inode); 6843 6842 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6844 6843 6845 6844 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6846 6845 &fname.disk_name, 1, index); 6846 + if (ret) 6847 + goto fail; 6847 6848 6849 + /* Link added now we update the inode item with the new link count. */ 6850 + inc_nlink(inode); 6851 + ret = btrfs_update_inode(trans, BTRFS_I(inode)); 6848 6852 if (ret) { 6849 - drop_inode = 1; 6850 - } else { 6851 - struct dentry *parent = dentry->d_parent; 6852 - 6853 - ret = btrfs_update_inode(trans, BTRFS_I(inode)); 6854 - if (ret) 6855 - goto fail; 6856 - if (inode->i_nlink == 1) { 6857 - /* 6858 - * If new hard link count is 1, it's a file created 6859 - * with open(2) O_TMPFILE flag. 6860 - */ 6861 - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); 6862 - if (ret) 6863 - goto fail; 6864 - } 6865 - d_instantiate(dentry, inode); 6866 - btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); 6853 + btrfs_abort_transaction(trans, ret); 6854 + goto fail; 6867 6855 } 6856 + 6857 + if (inode->i_nlink == 1) { 6858 + /* 6859 + * If the new hard link count is 1, it's a file created with the 6860 + * open(2) O_TMPFILE flag. 6861 + */ 6862 + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); 6863 + if (ret) { 6864 + btrfs_abort_transaction(trans, ret); 6865 + goto fail; 6866 + } 6867 + } 6868 + 6869 + /* Grab reference for the new dentry passed to d_instantiate(). */ 6870 + ihold(inode); 6871 + d_instantiate(dentry, inode); 6872 + btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent); 6868 6873 6869 6874 fail: 6870 6875 fscrypt_free_filename(&fname); 6871 6876 if (trans) 6872 6877 btrfs_end_transaction(trans); 6873 - if (drop_inode) { 6874 - inode_dec_link_count(inode); 6875 - iput(inode); 6876 - } 6877 6878 btrfs_btree_balance_dirty(fs_info); 6878 6879 return ret; 6879 6880 } ··· 7829 7830 ei->last_sub_trans = 0; 7830 7831 ei->logged_trans = 0; 7831 7832 ei->delalloc_bytes = 0; 7833 + /* new_delalloc_bytes and last_dir_index_offset are in a union. */ 7832 7834 ei->new_delalloc_bytes = 0; 7833 7835 ei->defrag_bytes = 0; 7834 7836 ei->disk_i_size = 0;
+53 -25
fs/btrfs/tree-log.c
··· 3340 3340 return 0; 3341 3341 } 3342 3342 3343 + static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, 3344 + struct btrfs_inode *inode) 3345 + { 3346 + bool ret = false; 3347 + 3348 + /* 3349 + * Do this only if ->logged_trans is still 0 to prevent races with 3350 + * concurrent logging as we may see the inode not logged when 3351 + * inode_logged() is called but it gets logged after inode_logged() did 3352 + * not find it in the log tree and we end up setting ->logged_trans to a 3353 + * value less than trans->transid after the concurrent logging task has 3354 + * set it to trans->transid. As a consequence, subsequent rename, unlink 3355 + * and link operations may end up not logging new names and removing old 3356 + * names from the log. 3357 + */ 3358 + spin_lock(&inode->lock); 3359 + if (inode->logged_trans == 0) 3360 + inode->logged_trans = trans->transid - 1; 3361 + else if (inode->logged_trans == trans->transid) 3362 + ret = true; 3363 + spin_unlock(&inode->lock); 3364 + 3365 + return ret; 3366 + } 3367 + 3343 3368 /* 3344 3369 * Check if an inode was logged in the current transaction. This correctly deals 3345 3370 * with the case where the inode was logged but has a logged_trans of 0, which ··· 3382 3357 struct btrfs_key key; 3383 3358 int ret; 3384 3359 3385 - if (inode->logged_trans == trans->transid) 3360 + /* 3361 + * Quick lockless call, since once ->logged_trans is set to the current 3362 + * transaction, we never set it to a lower value anywhere else. 3363 + */ 3364 + if (data_race(inode->logged_trans) == trans->transid) 3386 3365 return 1; 3387 3366 3388 3367 /* 3389 - * If logged_trans is not 0, then we know the inode logged was not logged 3390 - * in this transaction, so we can return false right away. 3368 + * If logged_trans is not 0 and not trans->transid, then we know the 3369 + * inode was not logged in this transaction, so we can return false 3370 + * right away. We take the lock to avoid a race caused by load/store 3371 + * tearing with a concurrent btrfs_log_inode() call or a concurrent task 3372 + * in this function further below - an update to trans->transid can be 3373 + * teared into two 32 bits updates for example, in which case we could 3374 + * see a positive value that is not trans->transid and assume the inode 3375 + * was not logged when it was. 3391 3376 */ 3392 - if (inode->logged_trans > 0) 3377 + spin_lock(&inode->lock); 3378 + if (inode->logged_trans == trans->transid) { 3379 + spin_unlock(&inode->lock); 3380 + return 1; 3381 + } else if (inode->logged_trans > 0) { 3382 + spin_unlock(&inode->lock); 3393 3383 return 0; 3384 + } 3385 + spin_unlock(&inode->lock); 3394 3386 3395 3387 /* 3396 3388 * If no log tree was created for this root in this transaction, then ··· 3416 3374 * transaction's ID, to avoid the search below in a future call in case 3417 3375 * a log tree gets created after this. 3418 3376 */ 3419 - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3420 - inode->logged_trans = trans->transid - 1; 3421 - return 0; 3422 - } 3377 + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) 3378 + return mark_inode_as_not_logged(trans, inode); 3423 3379 3424 3380 /* 3425 3381 * We have a log tree and the inode's logged_trans is 0. We can't tell ··· 3471 3431 * Set logged_trans to a value greater than 0 and less then the 3472 3432 * current transaction to avoid doing the search in future calls. 3473 3433 */ 3474 - inode->logged_trans = trans->transid - 1; 3475 - return 0; 3434 + return mark_inode_as_not_logged(trans, inode); 3476 3435 } 3477 3436 3478 3437 /* ··· 3479 3440 * the current transacion's ID, to avoid future tree searches as long as 3480 3441 * the inode is not evicted again. 3481 3442 */ 3443 + spin_lock(&inode->lock); 3482 3444 inode->logged_trans = trans->transid; 3483 - 3484 - /* 3485 - * If it's a directory, then we must set last_dir_index_offset to the 3486 - * maximum possible value, so that the next attempt to log the inode does 3487 - * not skip checking if dir index keys found in modified subvolume tree 3488 - * leaves have been logged before, otherwise it would result in attempts 3489 - * to insert duplicate dir index keys in the log tree. This must be done 3490 - * because last_dir_index_offset is an in-memory only field, not persisted 3491 - * in the inode item or any other on-disk structure, so its value is lost 3492 - * once the inode is evicted. 3493 - */ 3494 - if (S_ISDIR(inode->vfs_inode.i_mode)) 3495 - inode->last_dir_index_offset = (u64)-1; 3445 + spin_unlock(&inode->lock); 3496 3446 3497 3447 return 1; 3498 3448 } ··· 4073 4045 4074 4046 /* 4075 4047 * If the inode was logged before and it was evicted, then its 4076 - * last_dir_index_offset is (u64)-1, so we don't the value of the last index 4048 + * last_dir_index_offset is 0, so we don't know the value of the last index 4077 4049 * key offset. If that's the case, search for it and update the inode. This 4078 4050 * is to avoid lookups in the log tree every time we try to insert a dir index 4079 4051 * key from a leaf changed in the current transaction, and to allow us to always ··· 4089 4061 4090 4062 lockdep_assert_held(&inode->log_mutex); 4091 4063 4092 - if (inode->last_dir_index_offset != (u64)-1) 4064 + if (inode->last_dir_index_offset != 0) 4093 4065 return 0; 4094 4066 4095 4067 if (!ctx->logged_before) {