Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: xattr inode deduplication

Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes each one holding a
single value. Once written the data blocks of these inodes are immutable.

The real world use cases are expected to have a lot of value duplication
such as inherited acls etc. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a best
effort sharing scheme. When a xattr inode is read from disk (i.e.
getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal the ref count is decremented and if it reaches zero the inode is
deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there was no sharing happening.
This is consistent with how xattr blocks are also charged.

[ Fixed up journal credits calculation to handle inline data and the
rare case where a shared xattr block can get freed when two threads
race on breaking the xattr block sharing. --tytso ]

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Tahsin Erdogan and committed by
Theodore Ts'o
dec214d0 30a7eb97

+867 -299
+4 -1
fs/ext4/acl.c
··· 238 238 if (error) 239 239 return error; 240 240 retry: 241 - credits = ext4_xattr_set_credits(inode, acl_size); 241 + error = ext4_xattr_set_credits(inode, acl_size, &credits); 242 + if (error) 243 + return error; 244 + 242 245 handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); 243 246 if (IS_ERR(handle)) 244 247 return PTR_ERR(handle);
+14 -9
fs/ext4/ext4.h
··· 1517 1517 long s_es_nr_inode; 1518 1518 struct ext4_es_stats s_es_stats; 1519 1519 struct mb_cache *s_ea_block_cache; 1520 + struct mb_cache *s_ea_inode_cache; 1520 1521 spinlock_t s_es_lock ____cacheline_aligned_in_smp; 1521 1522 1522 1523 /* Ratelimit ext4 messages. */ ··· 2101 2100 return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); 2102 2101 } 2103 2102 2104 - #define ext4_is_quota_file(inode) IS_NOQUOTA(inode) 2103 + static inline bool ext4_is_quota_file(struct inode *inode) 2104 + { 2105 + return IS_NOQUOTA(inode) && 2106 + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); 2107 + } 2105 2108 2106 2109 /* 2107 2110 * This structure is stuffed into the struct file's private_data field ··· 2498 2493 extern void ext4_set_inode_flags(struct inode *); 2499 2494 extern int ext4_alloc_da_blocks(struct inode *inode); 2500 2495 extern void ext4_set_aops(struct inode *inode); 2501 - extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); 2502 2496 extern int ext4_writepage_trans_blocks(struct inode *); 2503 2497 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2504 2498 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, ··· 2724 2720 extern int ext4_register_li_request(struct super_block *sb, 2725 2721 ext4_group_t first_not_zeroed); 2726 2722 2727 - static inline int ext4_has_group_desc_csum(struct super_block *sb) 2728 - { 2729 - return ext4_has_feature_gdt_csum(sb) || 2730 - EXT4_SB(sb)->s_chksum_driver != NULL; 2731 - } 2732 - 2733 2723 static inline int ext4_has_metadata_csum(struct super_block *sb) 2734 2724 { 2735 2725 WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && 2736 2726 !EXT4_SB(sb)->s_chksum_driver); 2737 2727 2738 - return (EXT4_SB(sb)->s_chksum_driver != NULL); 2728 + return ext4_has_feature_metadata_csum(sb) && 2729 + (EXT4_SB(sb)->s_chksum_driver != NULL); 2739 2730 } 2731 + 2732 + static inline int ext4_has_group_desc_csum(struct super_block *sb) 2733 + { 2734 + return 
ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); 2735 + } 2736 + 2740 2737 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 2741 2738 { 2742 2739 return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
+11 -2
fs/ext4/inode.c
··· 139 139 unsigned int length); 140 140 static int __ext4_journalled_writepage(struct page *page, unsigned int len); 141 141 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 142 + static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 143 + int pextents); 142 144 143 145 /* 144 146 * Test whether an inode is a fast symlink. ··· 4845 4843 } 4846 4844 brelse(iloc.bh); 4847 4845 ext4_set_inode_flags(inode); 4848 - if (ei->i_flags & EXT4_EA_INODE_FL) 4846 + 4847 + if (ei->i_flags & EXT4_EA_INODE_FL) { 4849 4848 ext4_xattr_inode_set_class(inode); 4849 + 4850 + inode_lock(inode); 4851 + inode->i_flags |= S_NOQUOTA; 4852 + inode_unlock(inode); 4853 + } 4854 + 4850 4855 unlock_new_inode(inode); 4851 4856 return inode; 4852 4857 ··· 5512 5503 * 5513 5504 * Also account for superblock, inode, quota and xattr blocks 5514 5505 */ 5515 - int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 5506 + static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 5516 5507 int pextents) 5517 5508 { 5518 5509 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+34 -3
fs/ext4/super.c
··· 927 927 invalidate_bdev(sbi->journal_bdev); 928 928 ext4_blkdev_remove(sbi); 929 929 } 930 + if (sbi->s_ea_inode_cache) { 931 + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); 932 + sbi->s_ea_inode_cache = NULL; 933 + } 930 934 if (sbi->s_ea_block_cache) { 931 935 ext4_xattr_destroy_cache(sbi->s_ea_block_cache); 932 936 sbi->s_ea_block_cache = NULL; ··· 1182 1178 if (res) 1183 1179 return res; 1184 1180 retry: 1185 - credits = ext4_xattr_set_credits(inode, len); 1181 + res = ext4_xattr_set_credits(inode, len, &credits); 1182 + if (res) 1183 + return res; 1184 + 1186 1185 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); 1187 1186 if (IS_ERR(handle)) 1188 1187 return PTR_ERR(handle); ··· 3452 3445 } 3453 3446 3454 3447 /* Load the checksum driver */ 3455 - if (ext4_has_feature_metadata_csum(sb)) { 3448 + if (ext4_has_feature_metadata_csum(sb) || 3449 + ext4_has_feature_ea_inode(sb)) { 3456 3450 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 3457 3451 if (IS_ERR(sbi->s_chksum_driver)) { 3458 3452 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); ··· 3475 3467 /* Precompute checksum seed for all metadata */ 3476 3468 if (ext4_has_feature_csum_seed(sb)) 3477 3469 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); 3478 - else if (ext4_has_metadata_csum(sb)) 3470 + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) 3479 3471 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 3480 3472 sizeof(es->s_uuid)); 3481 3473 ··· 3603 3595 if (ext4_has_feature_64bit(sb)) { 3604 3596 ext4_msg(sb, KERN_ERR, 3605 3597 "The Hurd can't support 64-bit file systems"); 3598 + goto failed_mount; 3599 + } 3600 + 3601 + /* 3602 + * ea_inode feature uses l_i_version field which is not 3603 + * available in HURD_COMPAT mode. 
3604 + */ 3605 + if (ext4_has_feature_ea_inode(sb)) { 3606 + ext4_msg(sb, KERN_ERR, 3607 + "ea_inode feature is not supported for Hurd"); 3606 3608 goto failed_mount; 3607 3609 } 3608 3610 } ··· 4085 4067 goto failed_mount_wq; 4086 4068 } 4087 4069 4070 + if (ext4_has_feature_ea_inode(sb)) { 4071 + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); 4072 + if (!sbi->s_ea_inode_cache) { 4073 + ext4_msg(sb, KERN_ERR, 4074 + "Failed to create ea_inode_cache"); 4075 + goto failed_mount_wq; 4076 + } 4077 + } 4078 + 4088 4079 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && 4089 4080 (blocksize != PAGE_SIZE)) { 4090 4081 ext4_msg(sb, KERN_ERR, ··· 4323 4296 if (EXT4_SB(sb)->rsv_conversion_wq) 4324 4297 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4325 4298 failed_mount_wq: 4299 + if (sbi->s_ea_inode_cache) { 4300 + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); 4301 + sbi->s_ea_inode_cache = NULL; 4302 + } 4326 4303 if (sbi->s_ea_block_cache) { 4327 4304 ext4_xattr_destroy_cache(sbi->s_ea_block_cache); 4328 4305 sbi->s_ea_block_cache = NULL;
+797 -265
fs/ext4/xattr.c
··· 108 108 #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ 109 109 inode->i_sb->s_fs_info)->s_ea_block_cache) 110 110 111 + #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ 112 + inode->i_sb->s_fs_info)->s_ea_inode_cache) 113 + 111 114 static int 112 115 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, 113 116 struct inode *inode); ··· 283 280 return cmp ? -ENODATA : 0; 284 281 } 285 282 283 + static u32 284 + ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) 285 + { 286 + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); 287 + } 288 + 289 + static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) 290 + { 291 + return ((u64)ea_inode->i_ctime.tv_sec << 32) | 292 + ((u32)ea_inode->i_version); 293 + } 294 + 295 + static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) 296 + { 297 + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); 298 + ea_inode->i_version = (u32)ref_count; 299 + } 300 + 301 + static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) 302 + { 303 + return (u32)ea_inode->i_atime.tv_sec; 304 + } 305 + 306 + static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) 307 + { 308 + ea_inode->i_atime.tv_sec = hash; 309 + } 310 + 286 311 /* 287 312 * Read the EA value from an inode. 288 313 */ 289 314 static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) 290 315 { 291 316 unsigned long block = 0; 292 - struct buffer_head *bh = NULL; 317 + struct buffer_head *bh; 293 318 int blocksize = ea_inode->i_sb->s_blocksize; 294 319 size_t csize, copied = 0; 320 + void *copy_pos = buf; 295 321 296 322 while (copied < size) { 297 323 csize = (size - copied) > blocksize ? 
blocksize : size - copied; ··· 330 298 if (!bh) 331 299 return -EFSCORRUPTED; 332 300 333 - memcpy(buf, bh->b_data, csize); 301 + memcpy(copy_pos, bh->b_data, csize); 334 302 brelse(bh); 335 303 336 - buf += csize; 304 + copy_pos += csize; 337 305 block += 1; 338 306 copied += csize; 339 307 } ··· 349 317 inode = ext4_iget(parent->i_sb, ea_ino); 350 318 if (IS_ERR(inode)) { 351 319 err = PTR_ERR(inode); 352 - ext4_error(parent->i_sb, "error while reading EA inode %lu " 353 - "err=%d", ea_ino, err); 320 + ext4_error(parent->i_sb, 321 + "error while reading EA inode %lu err=%d", ea_ino, 322 + err); 354 323 return err; 355 324 } 356 325 357 326 if (is_bad_inode(inode)) { 358 - ext4_error(parent->i_sb, "error while reading EA inode %lu " 359 - "is_bad_inode", ea_ino); 327 + ext4_error(parent->i_sb, 328 + "error while reading EA inode %lu is_bad_inode", 329 + ea_ino); 360 330 err = -EIO; 361 331 goto error; 362 332 } 363 333 364 - if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino || 365 - inode->i_generation != parent->i_generation) { 366 - ext4_error(parent->i_sb, "Backpointer from EA inode %lu " 367 - "to parent is invalid.", ea_ino); 368 - err = -EINVAL; 369 - goto error; 370 - } 371 - 372 334 if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { 373 - ext4_error(parent->i_sb, "EA inode %lu does not have " 374 - "EXT4_EA_INODE_FL flag set.\n", ea_ino); 335 + ext4_error(parent->i_sb, 336 + "EA inode %lu does not have EXT4_EA_INODE_FL flag", 337 + ea_ino); 375 338 err = -EINVAL; 376 339 goto error; 377 340 } ··· 378 351 return err; 379 352 } 380 353 354 + static int 355 + ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size) 356 + { 357 + u32 hash; 358 + 359 + /* Verify stored hash matches calculated hash. 
*/ 360 + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); 361 + if (hash != ext4_xattr_inode_get_hash(ea_inode)) 362 + return -EFSCORRUPTED; 363 + return 0; 364 + } 365 + 366 + #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) 367 + 381 368 /* 382 369 * Read the value from the EA inode. 383 370 */ ··· 399 358 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, 400 359 size_t size) 401 360 { 361 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); 402 362 struct inode *ea_inode; 403 - int ret; 363 + int err; 404 364 405 - ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); 406 - if (ret) 407 - return ret; 365 + err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); 366 + if (err) { 367 + ea_inode = NULL; 368 + goto out; 369 + } 408 370 409 - ret = ext4_xattr_inode_read(ea_inode, buffer, size); 371 + if (i_size_read(ea_inode) != size) { 372 + ext4_warning_inode(ea_inode, 373 + "ea_inode file size=%llu entry size=%zu", 374 + i_size_read(ea_inode), size); 375 + err = -EFSCORRUPTED; 376 + goto out; 377 + } 378 + 379 + err = ext4_xattr_inode_read(ea_inode, buffer, size); 380 + if (err) 381 + goto out; 382 + 383 + err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size); 384 + /* 385 + * Compatibility check for old Lustre ea_inode implementation. Old 386 + * version does not have hash validation, but it has a backpointer 387 + * from ea_inode to the parent inode. 388 + */ 389 + if (err == -EFSCORRUPTED) { 390 + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || 391 + ea_inode->i_generation != inode->i_generation) { 392 + ext4_warning_inode(ea_inode, 393 + "EA inode hash validation failed"); 394 + goto out; 395 + } 396 + /* Do not add ea_inode to the cache. 
*/ 397 + ea_inode_cache = NULL; 398 + } else if (err) 399 + goto out; 400 + 401 + if (ea_inode_cache) 402 + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, 403 + ext4_xattr_inode_get_hash(ea_inode), 404 + ea_inode->i_ino, true /* reusable */); 405 + out: 410 406 iput(ea_inode); 411 - 412 - return ret; 407 + return err; 413 408 } 414 409 415 410 static int ··· 733 656 } 734 657 } 735 658 659 + static inline size_t round_up_cluster(struct inode *inode, size_t length) 660 + { 661 + struct super_block *sb = inode->i_sb; 662 + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + 663 + inode->i_blkbits); 664 + size_t mask = ~(cluster_size - 1); 665 + 666 + return (length + cluster_size - 1) & mask; 667 + } 668 + 669 + static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) 670 + { 671 + int err; 672 + 673 + err = dquot_alloc_inode(inode); 674 + if (err) 675 + return err; 676 + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); 677 + if (err) 678 + dquot_free_inode(inode); 679 + return err; 680 + } 681 + 682 + static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) 683 + { 684 + dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); 685 + dquot_free_inode(inode); 686 + } 687 + 688 + static int __ext4_xattr_set_credits(struct inode *inode, 689 + struct buffer_head *block_bh, 690 + size_t value_len) 691 + { 692 + struct super_block *sb = inode->i_sb; 693 + int credits; 694 + int blocks; 695 + 696 + /* 697 + * 1) Owner inode update 698 + * 2) Ref count update on old xattr block 699 + * 3) new xattr block 700 + * 4) block bitmap update for new xattr block 701 + * 5) group descriptor for new xattr block 702 + * 6) block bitmap update for old xattr block 703 + * 7) group descriptor for old block 704 + * 705 + * 6 & 7 can happen if we have two racing threads T_a and T_b 706 + * which are each trying to set an xattr on inodes I_a and I_b 707 + * which were both initially sharing an xattr block. 
708 + */ 709 + credits = 7; 710 + 711 + /* Quota updates. */ 712 + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); 713 + 714 + /* 715 + * In case of inline data, we may push out the data to a block, 716 + * so we need to reserve credits for this eventuality 717 + */ 718 + if (ext4_has_inline_data(inode)) 719 + credits += ext4_writepage_trans_blocks(inode) + 1; 720 + 721 + /* We are done if ea_inode feature is not enabled. */ 722 + if (!ext4_has_feature_ea_inode(sb)) 723 + return credits; 724 + 725 + /* New ea_inode, inode map, block bitmap, group descriptor. */ 726 + credits += 4; 727 + 728 + /* Data blocks. */ 729 + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 730 + 731 + /* Indirection block or one level of extent tree. */ 732 + blocks += 1; 733 + 734 + /* Block bitmap and group descriptor updates for each block. */ 735 + credits += blocks * 2; 736 + 737 + /* Blocks themselves. */ 738 + credits += blocks; 739 + 740 + /* Dereference ea_inode holding old xattr value. 741 + * Old ea_inode, inode map, block bitmap, group descriptor. 742 + */ 743 + credits += 4; 744 + 745 + /* Data blocks for old ea_inode. */ 746 + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; 747 + 748 + /* Indirection block or one level of extent tree for old ea_inode. */ 749 + blocks += 1; 750 + 751 + /* Block bitmap and group descriptor updates for each block. */ 752 + credits += blocks * 2; 753 + 754 + /* We may need to clone the existing xattr block in which case we need 755 + * to increment ref counts for existing ea_inodes referenced by it. 756 + */ 757 + if (block_bh) { 758 + struct ext4_xattr_entry *entry = BFIRST(block_bh); 759 + 760 + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) 761 + if (entry->e_value_inum) 762 + /* Ref count update on ea_inode. 
*/ 763 + credits += 1; 764 + } 765 + return credits; 766 + } 767 + 736 768 static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, 737 769 int credits, struct buffer_head *bh, 738 770 bool dirty, bool block_csum) ··· 891 705 return 0; 892 706 } 893 707 708 + static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, 709 + int ref_change) 710 + { 711 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); 712 + struct ext4_iloc iloc; 713 + s64 ref_count; 714 + u32 hash; 715 + int ret; 716 + 717 + inode_lock(ea_inode); 718 + 719 + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); 720 + if (ret) { 721 + iloc.bh = NULL; 722 + goto out; 723 + } 724 + 725 + ref_count = ext4_xattr_inode_get_ref(ea_inode); 726 + ref_count += ref_change; 727 + ext4_xattr_inode_set_ref(ea_inode, ref_count); 728 + 729 + if (ref_change > 0) { 730 + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", 731 + ea_inode->i_ino, ref_count); 732 + 733 + if (ref_count == 1) { 734 + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", 735 + ea_inode->i_ino, ea_inode->i_nlink); 736 + 737 + set_nlink(ea_inode, 1); 738 + ext4_orphan_del(handle, ea_inode); 739 + 740 + hash = ext4_xattr_inode_get_hash(ea_inode); 741 + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash, 742 + ea_inode->i_ino, 743 + true /* reusable */); 744 + } 745 + } else { 746 + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", 747 + ea_inode->i_ino, ref_count); 748 + 749 + if (ref_count == 0) { 750 + WARN_ONCE(ea_inode->i_nlink != 1, 751 + "EA inode %lu i_nlink=%u", 752 + ea_inode->i_ino, ea_inode->i_nlink); 753 + 754 + clear_nlink(ea_inode); 755 + ext4_orphan_add(handle, ea_inode); 756 + 757 + hash = ext4_xattr_inode_get_hash(ea_inode); 758 + mb_cache_entry_delete(ea_inode_cache, hash, 759 + ea_inode->i_ino); 760 + } 761 + } 762 + 763 + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); 764 + iloc.bh = NULL; 765 + if (ret) 766 + ext4_warning_inode(ea_inode, 767 + 
"ext4_mark_iloc_dirty() failed ret=%d", ret); 768 + out: 769 + brelse(iloc.bh); 770 + inode_unlock(ea_inode); 771 + return ret; 772 + } 773 + 774 + static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) 775 + { 776 + return ext4_xattr_inode_update_ref(handle, ea_inode, 1); 777 + } 778 + 779 + static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) 780 + { 781 + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); 782 + } 783 + 784 + static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, 785 + struct ext4_xattr_entry *first) 786 + { 787 + struct inode *ea_inode; 788 + struct ext4_xattr_entry *entry; 789 + struct ext4_xattr_entry *failed_entry; 790 + unsigned int ea_ino; 791 + int err, saved_err; 792 + 793 + for (entry = first; !IS_LAST_ENTRY(entry); 794 + entry = EXT4_XATTR_NEXT(entry)) { 795 + if (!entry->e_value_inum) 796 + continue; 797 + ea_ino = le32_to_cpu(entry->e_value_inum); 798 + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); 799 + if (err) 800 + goto cleanup; 801 + err = ext4_xattr_inode_inc_ref(handle, ea_inode); 802 + if (err) { 803 + ext4_warning_inode(ea_inode, "inc ref error %d", err); 804 + iput(ea_inode); 805 + goto cleanup; 806 + } 807 + iput(ea_inode); 808 + } 809 + return 0; 810 + 811 + cleanup: 812 + saved_err = err; 813 + failed_entry = entry; 814 + 815 + for (entry = first; entry != failed_entry; 816 + entry = EXT4_XATTR_NEXT(entry)) { 817 + if (!entry->e_value_inum) 818 + continue; 819 + ea_ino = le32_to_cpu(entry->e_value_inum); 820 + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); 821 + if (err) { 822 + ext4_warning(parent->i_sb, 823 + "cleanup ea_ino %u iget error %d", ea_ino, 824 + err); 825 + continue; 826 + } 827 + err = ext4_xattr_inode_dec_ref(handle, ea_inode); 828 + if (err) 829 + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", 830 + err); 831 + iput(ea_inode); 832 + } 833 + return saved_err; 834 + } 835 + 894 836 static void 895 - 
ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent, 896 - struct buffer_head *bh, 897 - struct ext4_xattr_entry *first, bool block_csum, 898 - struct ext4_xattr_inode_array **ea_inode_array, 899 - int extra_credits) 837 + ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, 838 + struct buffer_head *bh, 839 + struct ext4_xattr_entry *first, bool block_csum, 840 + struct ext4_xattr_inode_array **ea_inode_array, 841 + int extra_credits, bool skip_quota) 900 842 { 901 843 struct inode *ea_inode; 902 844 struct ext4_xattr_entry *entry; ··· 1061 747 continue; 1062 748 } 1063 749 1064 - inode_lock(ea_inode); 1065 - clear_nlink(ea_inode); 1066 - ext4_orphan_add(handle, ea_inode); 1067 - inode_unlock(ea_inode); 750 + err = ext4_xattr_inode_dec_ref(handle, ea_inode); 751 + if (err) { 752 + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", 753 + err); 754 + continue; 755 + } 756 + 757 + if (!skip_quota) 758 + ext4_xattr_inode_free_quota(parent, 759 + le32_to_cpu(entry->e_value_size)); 1068 760 1069 761 /* 1070 762 * Forget about ea_inode within the same transaction that ··· 1104 784 */ 1105 785 static void 1106 786 ext4_xattr_release_block(handle_t *handle, struct inode *inode, 1107 - struct buffer_head *bh) 787 + struct buffer_head *bh, 788 + struct ext4_xattr_inode_array **ea_inode_array, 789 + int extra_credits) 1108 790 { 1109 791 struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); 1110 792 u32 hash, ref; ··· 1129 807 mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr); 1130 808 get_bh(bh); 1131 809 unlock_buffer(bh); 810 + 811 + if (ext4_has_feature_ea_inode(inode->i_sb)) 812 + ext4_xattr_inode_dec_ref_all(handle, inode, bh, 813 + BFIRST(bh), 814 + true /* block_csum */, 815 + ea_inode_array, 816 + extra_credits, 817 + true /* skip_quota */); 1132 818 ext4_free_blocks(handle, inode, bh, 0, 1, 1133 819 EXT4_FREE_BLOCKS_METADATA | 1134 820 EXT4_FREE_BLOCKS_FORGET); ··· 1208 878 { 1209 879 struct buffer_head *bh = NULL; 
1210 880 unsigned long block = 0; 1211 - unsigned blocksize = ea_inode->i_sb->s_blocksize; 1212 - unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; 881 + int blocksize = ea_inode->i_sb->s_blocksize; 882 + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; 1213 883 int csize, wsize = 0; 1214 884 int ret = 0; 1215 885 int retries = 0; ··· 1275 945 * Create an inode to store the value of a large EA. 1276 946 */ 1277 947 static struct inode *ext4_xattr_inode_create(handle_t *handle, 1278 - struct inode *inode) 948 + struct inode *inode, u32 hash) 1279 949 { 1280 950 struct inode *ea_inode = NULL; 1281 951 uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; ··· 1293 963 ea_inode->i_fop = &ext4_file_operations; 1294 964 ext4_set_aops(ea_inode); 1295 965 ext4_xattr_inode_set_class(ea_inode); 1296 - ea_inode->i_generation = inode->i_generation; 1297 - EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; 1298 - 1299 - /* 1300 - * A back-pointer from EA inode to parent inode will be useful 1301 - * for e2fsck. 1302 - */ 1303 - EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); 1304 966 unlock_new_inode(ea_inode); 1305 - err = ext4_inode_attach_jinode(ea_inode); 967 + ext4_xattr_inode_set_ref(ea_inode, 1); 968 + ext4_xattr_inode_set_hash(ea_inode, hash); 969 + err = ext4_mark_inode_dirty(handle, ea_inode); 970 + if (!err) 971 + err = ext4_inode_attach_jinode(ea_inode); 1306 972 if (err) { 1307 973 iput(ea_inode); 1308 974 return ERR_PTR(err); 1309 975 } 976 + 977 + /* 978 + * Xattr inodes are shared therefore quota charging is performed 979 + * at a higher level. 980 + */ 981 + dquot_free_inode(ea_inode); 982 + dquot_drop(ea_inode); 983 + inode_lock(ea_inode); 984 + ea_inode->i_flags |= S_NOQUOTA; 985 + inode_unlock(ea_inode); 1310 986 } 1311 987 1312 988 return ea_inode; 1313 989 } 1314 990 1315 - /* 1316 - * Unlink the inode storing the value of the EA. 
1317 - */ 1318 - int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) 991 + static struct inode * 992 + ext4_xattr_inode_cache_find(struct inode *inode, const void *value, 993 + size_t value_len, u32 hash) 1319 994 { 1320 - struct inode *ea_inode = NULL; 1321 - int err; 995 + struct inode *ea_inode; 996 + struct mb_cache_entry *ce; 997 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); 998 + void *ea_data; 1322 999 1323 - err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); 1324 - if (err) 1325 - return err; 1000 + ce = mb_cache_entry_find_first(ea_inode_cache, hash); 1001 + if (!ce) 1002 + return NULL; 1326 1003 1327 - clear_nlink(ea_inode); 1328 - iput(ea_inode); 1004 + ea_data = ext4_kvmalloc(value_len, GFP_NOFS); 1005 + if (!ea_data) { 1006 + mb_cache_entry_put(ea_inode_cache, ce); 1007 + return NULL; 1008 + } 1329 1009 1330 - return 0; 1010 + while (ce) { 1011 + ea_inode = ext4_iget(inode->i_sb, ce->e_value); 1012 + if (!IS_ERR(ea_inode) && 1013 + !is_bad_inode(ea_inode) && 1014 + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && 1015 + i_size_read(ea_inode) == value_len && 1016 + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && 1017 + !ext4_xattr_inode_verify_hash(ea_inode, ea_data, 1018 + value_len) && 1019 + !memcmp(value, ea_data, value_len)) { 1020 + mb_cache_entry_touch(ea_inode_cache, ce); 1021 + mb_cache_entry_put(ea_inode_cache, ce); 1022 + kvfree(ea_data); 1023 + return ea_inode; 1024 + } 1025 + 1026 + if (!IS_ERR(ea_inode)) 1027 + iput(ea_inode); 1028 + ce = mb_cache_entry_find_next(ea_inode_cache, ce); 1029 + } 1030 + kvfree(ea_data); 1031 + return NULL; 1331 1032 } 1332 1033 1333 1034 /* 1334 1035 * Add value of the EA in an inode. 
1335 1036 */ 1336 - static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode, 1337 - unsigned long *ea_ino, const void *value, 1338 - size_t value_len) 1037 + static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, 1038 + const void *value, size_t value_len, 1039 + struct inode **ret_inode) 1339 1040 { 1340 1041 struct inode *ea_inode; 1042 + u32 hash; 1341 1043 int err; 1342 1044 1045 + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); 1046 + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); 1047 + if (ea_inode) { 1048 + err = ext4_xattr_inode_inc_ref(handle, ea_inode); 1049 + if (err) { 1050 + iput(ea_inode); 1051 + return err; 1052 + } 1053 + 1054 + *ret_inode = ea_inode; 1055 + return 0; 1056 + } 1057 + 1343 1058 /* Create an inode for the EA value */ 1344 - ea_inode = ext4_xattr_inode_create(handle, inode); 1059 + ea_inode = ext4_xattr_inode_create(handle, inode, hash); 1345 1060 if (IS_ERR(ea_inode)) 1346 1061 return PTR_ERR(ea_inode); 1347 1062 1348 1063 err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); 1349 - if (err) 1350 - clear_nlink(ea_inode); 1351 - else 1352 - *ea_ino = ea_inode->i_ino; 1064 + if (err) { 1065 + ext4_xattr_inode_dec_ref(handle, ea_inode); 1066 + iput(ea_inode); 1067 + return err; 1068 + } 1353 1069 1354 - iput(ea_inode); 1070 + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, 1071 + ea_inode->i_ino, true /* reusable */); 1355 1072 1356 - return err; 1073 + *ret_inode = ea_inode; 1074 + return 0; 1357 1075 } 1358 1076 1359 1077 static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ··· 1409 1031 handle_t *handle, struct inode *inode) 1410 1032 { 1411 1033 struct ext4_xattr_entry *last; 1412 - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); 1034 + struct ext4_xattr_entry *here = s->here; 1035 + size_t min_offs = s->end - s->base, name_len = strlen(i->name); 1413 1036 int in_inode = i->in_inode; 1414 - int 
rc; 1037 + struct inode *old_ea_inode = NULL; 1038 + struct inode *new_ea_inode = NULL; 1039 + size_t old_size, new_size; 1040 + int ret; 1041 + 1042 + /* Space used by old and new values. */ 1043 + old_size = (!s->not_found && !here->e_value_inum) ? 1044 + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; 1045 + new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; 1046 + 1047 + /* 1048 + * Optimization for the simple case when old and new values have the 1049 + * same padded sizes. Not applicable if external inodes are involved. 1050 + */ 1051 + if (new_size && new_size == old_size) { 1052 + size_t offs = le16_to_cpu(here->e_value_offs); 1053 + void *val = s->base + offs; 1054 + 1055 + here->e_value_size = cpu_to_le32(i->value_len); 1056 + if (i->value == EXT4_ZERO_XATTR_VALUE) { 1057 + memset(val, 0, new_size); 1058 + } else { 1059 + memcpy(val, i->value, i->value_len); 1060 + /* Clear padding bytes. */ 1061 + memset(val + i->value_len, 0, new_size - i->value_len); 1062 + } 1063 + return 0; 1064 + } 1415 1065 1416 1066 /* Compute min_offs and last. */ 1417 1067 last = s->first; ··· 1450 1044 min_offs = offs; 1451 1045 } 1452 1046 } 1453 - free = min_offs - ((void *)last - s->base) - sizeof(__u32); 1454 - if (!s->not_found) { 1455 - if (!in_inode && 1456 - !s->here->e_value_inum && s->here->e_value_size) { 1457 - size_t size = le32_to_cpu(s->here->e_value_size); 1458 - free += EXT4_XATTR_SIZE(size); 1459 - } 1460 - free += EXT4_XATTR_LEN(name_len); 1461 - } 1047 + 1048 + /* Check whether we have enough space. 
*/ 1462 1049 if (i->value) { 1463 - size_t value_len = EXT4_XATTR_SIZE(i->value_len); 1050 + size_t free; 1464 1051 1465 - if (in_inode) 1466 - value_len = 0; 1052 + free = min_offs - ((void *)last - s->base) - sizeof(__u32); 1053 + if (!s->not_found) 1054 + free += EXT4_XATTR_LEN(name_len) + old_size; 1467 1055 1468 - if (free < EXT4_XATTR_LEN(name_len) + value_len) 1469 - return -ENOSPC; 1056 + if (free < EXT4_XATTR_LEN(name_len) + new_size) { 1057 + ret = -ENOSPC; 1058 + goto out; 1059 + } 1470 1060 } 1471 1061 1472 - if (i->value && s->not_found) { 1473 - /* Insert the new name. */ 1062 + /* 1063 + * Getting access to old and new ea inodes is subject to failures. 1064 + * Finish that work before doing any modifications to the xattr data. 1065 + */ 1066 + if (!s->not_found && here->e_value_inum) { 1067 + ret = ext4_xattr_inode_iget(inode, 1068 + le32_to_cpu(here->e_value_inum), 1069 + &old_ea_inode); 1070 + if (ret) { 1071 + old_ea_inode = NULL; 1072 + goto out; 1073 + } 1074 + } 1075 + if (i->value && in_inode) { 1076 + WARN_ON_ONCE(!i->value_len); 1077 + 1078 + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); 1079 + if (ret) 1080 + goto out; 1081 + 1082 + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, 1083 + i->value_len, 1084 + &new_ea_inode); 1085 + if (ret) { 1086 + new_ea_inode = NULL; 1087 + ext4_xattr_inode_free_quota(inode, i->value_len); 1088 + goto out; 1089 + } 1090 + } 1091 + 1092 + if (old_ea_inode) { 1093 + /* We are ready to release ref count on the old_ea_inode. */ 1094 + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); 1095 + if (ret) { 1096 + /* Release newly required ref count on new_ea_inode. 
*/ 1097 + if (new_ea_inode) { 1098 + int err; 1099 + 1100 + err = ext4_xattr_inode_dec_ref(handle, 1101 + new_ea_inode); 1102 + if (err) 1103 + ext4_warning_inode(new_ea_inode, 1104 + "dec ref new_ea_inode err=%d", 1105 + err); 1106 + ext4_xattr_inode_free_quota(inode, 1107 + i->value_len); 1108 + } 1109 + goto out; 1110 + } 1111 + 1112 + ext4_xattr_inode_free_quota(inode, 1113 + le32_to_cpu(here->e_value_size)); 1114 + } 1115 + 1116 + /* No failures allowed past this point. */ 1117 + 1118 + if (!s->not_found && here->e_value_offs) { 1119 + /* Remove the old value. */ 1120 + void *first_val = s->base + min_offs; 1121 + size_t offs = le16_to_cpu(here->e_value_offs); 1122 + void *val = s->base + offs; 1123 + 1124 + memmove(first_val + old_size, first_val, val - first_val); 1125 + memset(first_val, 0, old_size); 1126 + min_offs += old_size; 1127 + 1128 + /* Adjust all value offsets. */ 1129 + last = s->first; 1130 + while (!IS_LAST_ENTRY(last)) { 1131 + size_t o = le16_to_cpu(last->e_value_offs); 1132 + 1133 + if (!last->e_value_inum && 1134 + last->e_value_size && o < offs) 1135 + last->e_value_offs = cpu_to_le16(o + old_size); 1136 + last = EXT4_XATTR_NEXT(last); 1137 + } 1138 + } 1139 + 1140 + if (!i->value) { 1141 + /* Remove old name. */ 1474 1142 size_t size = EXT4_XATTR_LEN(name_len); 1475 - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); 1476 - memmove((void *)s->here + size, s->here, rest); 1477 - memset(s->here, 0, size); 1478 - s->here->e_name_index = i->name_index; 1479 - s->here->e_name_len = name_len; 1480 - memcpy(s->here->e_name, i->name, name_len); 1143 + 1144 + last = ENTRY((void *)last - size); 1145 + memmove(here, (void *)here + size, 1146 + (void *)last - (void *)here + sizeof(__u32)); 1147 + memset(last, 0, size); 1148 + } else if (s->not_found) { 1149 + /* Insert new name. 
*/ 1150 + size_t size = EXT4_XATTR_LEN(name_len); 1151 + size_t rest = (void *)last - (void *)here + sizeof(__u32); 1152 + 1153 + memmove((void *)here + size, here, rest); 1154 + memset(here, 0, size); 1155 + here->e_name_index = i->name_index; 1156 + here->e_name_len = name_len; 1157 + memcpy(here->e_name, i->name, name_len); 1481 1158 } else { 1482 - if (!s->here->e_value_inum && s->here->e_value_size && 1483 - s->here->e_value_offs > 0) { 1484 - void *first_val = s->base + min_offs; 1485 - size_t offs = le16_to_cpu(s->here->e_value_offs); 1486 - void *val = s->base + offs; 1487 - size_t size = EXT4_XATTR_SIZE( 1488 - le32_to_cpu(s->here->e_value_size)); 1489 - 1490 - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { 1491 - /* The old and the new value have the same 1492 - size. Just replace. */ 1493 - s->here->e_value_size = 1494 - cpu_to_le32(i->value_len); 1495 - if (i->value == EXT4_ZERO_XATTR_VALUE) { 1496 - memset(val, 0, size); 1497 - } else { 1498 - /* Clear pad bytes first. */ 1499 - memset(val + size - EXT4_XATTR_PAD, 0, 1500 - EXT4_XATTR_PAD); 1501 - memcpy(val, i->value, i->value_len); 1502 - } 1503 - return 0; 1504 - } 1505 - 1506 - /* Remove the old value. */ 1507 - memmove(first_val + size, first_val, val - first_val); 1508 - memset(first_val, 0, size); 1509 - s->here->e_value_size = 0; 1510 - s->here->e_value_offs = 0; 1511 - min_offs += size; 1512 - 1513 - /* Adjust all value offsets. */ 1514 - last = s->first; 1515 - while (!IS_LAST_ENTRY(last)) { 1516 - size_t o = le16_to_cpu(last->e_value_offs); 1517 - if (!last->e_value_inum && 1518 - last->e_value_size && o < offs) 1519 - last->e_value_offs = 1520 - cpu_to_le16(o + size); 1521 - last = EXT4_XATTR_NEXT(last); 1522 - } 1523 - } 1524 - if (s->here->e_value_inum) { 1525 - ext4_xattr_inode_unlink(inode, 1526 - le32_to_cpu(s->here->e_value_inum)); 1527 - s->here->e_value_inum = 0; 1528 - } 1529 - if (!i->value) { 1530 - /* Remove the old name. 
*/ 1531 - size_t size = EXT4_XATTR_LEN(name_len); 1532 - last = ENTRY((void *)last - size); 1533 - memmove(s->here, (void *)s->here + size, 1534 - (void *)last - (void *)s->here + sizeof(__u32)); 1535 - memset(last, 0, size); 1536 - } 1159 + /* This is an update, reset value info. */ 1160 + here->e_value_inum = 0; 1161 + here->e_value_offs = 0; 1162 + here->e_value_size = 0; 1537 1163 } 1538 1164 1539 1165 if (i->value) { 1540 - /* Insert the new value. */ 1166 + /* Insert new value. */ 1541 1167 if (in_inode) { 1542 - unsigned long ea_ino = 1543 - le32_to_cpu(s->here->e_value_inum); 1544 - rc = ext4_xattr_inode_set(handle, inode, &ea_ino, 1545 - i->value, i->value_len); 1546 - if (rc) 1547 - goto out; 1548 - s->here->e_value_inum = cpu_to_le32(ea_ino); 1549 - s->here->e_value_offs = 0; 1168 + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); 1550 1169 } else if (i->value_len) { 1551 - size_t size = EXT4_XATTR_SIZE(i->value_len); 1552 - void *val = s->base + min_offs - size; 1553 - s->here->e_value_offs = cpu_to_le16(min_offs - size); 1554 - s->here->e_value_inum = 0; 1170 + void *val = s->base + min_offs - new_size; 1171 + 1172 + here->e_value_offs = cpu_to_le16(min_offs - new_size); 1555 1173 if (i->value == EXT4_ZERO_XATTR_VALUE) { 1556 - memset(val, 0, size); 1174 + memset(val, 0, new_size); 1557 1175 } else { 1558 - /* Clear the pad bytes first. */ 1559 - memset(val + size - EXT4_XATTR_PAD, 0, 1560 - EXT4_XATTR_PAD); 1561 1176 memcpy(val, i->value, i->value_len); 1177 + /* Clear padding bytes. 
*/ 1178 + memset(val + i->value_len, 0, 1179 + new_size - i->value_len); 1562 1180 } 1563 1181 } 1564 - s->here->e_value_size = cpu_to_le32(i->value_len); 1182 + here->e_value_size = cpu_to_le32(i->value_len); 1565 1183 } 1566 - 1184 + ret = 0; 1567 1185 out: 1568 - return rc; 1186 + iput(old_ea_inode); 1187 + iput(new_ea_inode); 1188 + return ret; 1569 1189 } 1570 1190 1571 1191 struct ext4_xattr_block_find { ··· 1653 1221 struct mb_cache_entry *ce = NULL; 1654 1222 int error = 0; 1655 1223 struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); 1224 + struct inode *ea_inode = NULL; 1225 + size_t old_ea_inode_size = 0; 1656 1226 1657 1227 #define header(x) ((struct ext4_xattr_header *)(x)) 1658 1228 ··· 1709 1275 header(s->base)->h_refcount = cpu_to_le32(1); 1710 1276 s->here = ENTRY(s->base + offset); 1711 1277 s->end = s->base + bs->bh->b_size; 1278 + 1279 + /* 1280 + * If existing entry points to an xattr inode, we need 1281 + * to prevent ext4_xattr_set_entry() from decrementing 1282 + * ref count on it because the reference belongs to the 1283 + * original block. In this case, make the entry look 1284 + * like it has an empty value. 1285 + */ 1286 + if (!s->not_found && s->here->e_value_inum) { 1287 + /* 1288 + * Defer quota free call for previous inode 1289 + * until success is guaranteed. 1290 + */ 1291 + old_ea_inode_size = le32_to_cpu( 1292 + s->here->e_value_size); 1293 + s->here->e_value_inum = 0; 1294 + s->here->e_value_size = 0; 1295 + } 1712 1296 } 1713 1297 } else { 1714 1298 /* Allocate a buffer where we construct the new block. */ ··· 1748 1296 goto bad_block; 1749 1297 if (error) 1750 1298 goto cleanup; 1299 + 1300 + if (i->value && s->here->e_value_inum) { 1301 + unsigned int ea_ino; 1302 + 1303 + /* 1304 + * A ref count on ea_inode has been taken as part of the call to 1305 + * ext4_xattr_set_entry() above. 
We would like to drop this 1306 + * extra ref but we have to wait until the xattr block is 1307 + * initialized and has its own ref count on the ea_inode. 1308 + */ 1309 + ea_ino = le32_to_cpu(s->here->e_value_inum); 1310 + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); 1311 + if (error) { 1312 + ea_inode = NULL; 1313 + goto cleanup; 1314 + } 1315 + } 1316 + 1751 1317 if (!IS_LAST_ENTRY(s->first)) 1752 1318 ext4_xattr_rehash(header(s->base), s->here); 1753 1319 ··· 1876 1406 EXT4_FREE_BLOCKS_METADATA); 1877 1407 goto cleanup; 1878 1408 } 1409 + error = ext4_xattr_inode_inc_ref_all(handle, inode, 1410 + ENTRY(header(s->base)+1)); 1411 + if (error) 1412 + goto getblk_failed; 1413 + if (ea_inode) { 1414 + /* Drop the extra ref on ea_inode. */ 1415 + error = ext4_xattr_inode_dec_ref(handle, 1416 + ea_inode); 1417 + if (error) 1418 + ext4_warning_inode(ea_inode, 1419 + "dec ref error=%d", 1420 + error); 1421 + iput(ea_inode); 1422 + ea_inode = NULL; 1423 + } 1424 + 1879 1425 lock_buffer(new_bh); 1880 1426 error = ext4_journal_get_create_access(handle, new_bh); 1881 1427 if (error) { ··· 1911 1425 } 1912 1426 } 1913 1427 1428 + if (old_ea_inode_size) 1429 + ext4_xattr_inode_free_quota(inode, old_ea_inode_size); 1430 + 1914 1431 /* Update the inode. */ 1915 1432 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 1916 1433 1917 1434 /* Drop the previous xattr block. 
*/ 1918 - if (bs->bh && bs->bh != new_bh) 1919 - ext4_xattr_release_block(handle, inode, bs->bh); 1435 + if (bs->bh && bs->bh != new_bh) { 1436 + struct ext4_xattr_inode_array *ea_inode_array = NULL; 1437 + 1438 + ext4_xattr_release_block(handle, inode, bs->bh, 1439 + &ea_inode_array, 1440 + 0 /* extra_credits */); 1441 + ext4_xattr_inode_array_free(ea_inode_array); 1442 + } 1920 1443 error = 0; 1921 1444 1922 1445 cleanup: 1446 + if (ea_inode) { 1447 + int error2; 1448 + 1449 + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); 1450 + if (error2) 1451 + ext4_warning_inode(ea_inode, "dec ref error=%d", 1452 + error2); 1453 + 1454 + /* If there was an error, revert the quota charge. */ 1455 + if (error) 1456 + ext4_xattr_inode_free_quota(inode, 1457 + i_size_read(ea_inode)); 1458 + iput(ea_inode); 1459 + } 1923 1460 if (ce) 1924 1461 mb_cache_entry_put(ea_block_cache, ce); 1925 1462 brelse(new_bh); ··· 2067 1558 return !memcmp(value, i->value, i->value_len); 2068 1559 } 2069 1560 1561 + static struct buffer_head *ext4_xattr_get_block(struct inode *inode) 1562 + { 1563 + struct buffer_head *bh; 1564 + int error; 1565 + 1566 + if (!EXT4_I(inode)->i_file_acl) 1567 + return NULL; 1568 + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1569 + if (!bh) 1570 + return ERR_PTR(-EIO); 1571 + error = ext4_xattr_check_block(inode, bh); 1572 + if (error) 1573 + return ERR_PTR(error); 1574 + return bh; 1575 + } 1576 + 2070 1577 /* 2071 1578 * ext4_xattr_set_handle() 2072 1579 * ··· 2125 1600 2126 1601 /* Check journal credits under write lock. 
*/ 2127 1602 if (ext4_handle_valid(handle)) { 1603 + struct buffer_head *bh; 2128 1604 int credits; 2129 1605 2130 - credits = ext4_xattr_set_credits(inode, value_len); 1606 + bh = ext4_xattr_get_block(inode); 1607 + if (IS_ERR(bh)) { 1608 + error = PTR_ERR(bh); 1609 + goto cleanup; 1610 + } 1611 + 1612 + credits = __ext4_xattr_set_credits(inode, bh, value_len); 1613 + brelse(bh); 1614 + 2131 1615 if (!ext4_handle_has_enough_credits(handle, credits)) { 2132 1616 error = -ENOSPC; 2133 1617 goto cleanup; ··· 2172 1638 if (flags & XATTR_CREATE) 2173 1639 goto cleanup; 2174 1640 } 1641 + 2175 1642 if (!value) { 2176 1643 if (!is.s.not_found) 2177 1644 error = ext4_xattr_ibody_set(handle, inode, &i, &is); ··· 2241 1706 return error; 2242 1707 } 2243 1708 2244 - int ext4_xattr_set_credits(struct inode *inode, size_t value_len) 1709 + int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits) 2245 1710 { 2246 - struct super_block *sb = inode->i_sb; 2247 - int credits; 1711 + struct buffer_head *bh; 1712 + int err; 2248 1713 2249 - if (!EXT4_SB(sb)->s_journal) 1714 + *credits = 0; 1715 + 1716 + if (!EXT4_SB(inode->i_sb)->s_journal) 2250 1717 return 0; 2251 1718 2252 - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); 1719 + down_read(&EXT4_I(inode)->xattr_sem); 2253 1720 2254 - /* 2255 - * In case of inline data, we may push out the data to a block, 2256 - * so we need to reserve credits for this eventuality 2257 - */ 2258 - if (ext4_has_inline_data(inode)) 2259 - credits += ext4_writepage_trans_blocks(inode) + 1; 2260 - 2261 - if (ext4_has_feature_ea_inode(sb)) { 2262 - int nrblocks = (value_len + sb->s_blocksize - 1) >> 2263 - sb->s_blocksize_bits; 2264 - 2265 - /* For new inode */ 2266 - credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; 2267 - 2268 - /* For data blocks of EA inode */ 2269 - credits += ext4_meta_trans_blocks(inode, nrblocks, 0); 1721 + bh = ext4_xattr_get_block(inode); 1722 + if (IS_ERR(bh)) { 1723 + err = PTR_ERR(bh); 1724 + } else { 
1725 + *credits = __ext4_xattr_set_credits(inode, bh, value_len); 1726 + brelse(bh); 1727 + err = 0; 2270 1728 } 2271 - return credits; 1729 + 1730 + up_read(&EXT4_I(inode)->xattr_sem); 1731 + return err; 2272 1732 } 2273 1733 2274 1734 /* ··· 2288 1758 return error; 2289 1759 2290 1760 retry: 2291 - credits = ext4_xattr_set_credits(inode, value_len); 1761 + error = ext4_xattr_set_credits(inode, value_len, &credits); 1762 + if (error) 1763 + return error; 1764 + 2292 1765 handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); 2293 1766 if (IS_ERR(handle)) { 2294 1767 error = PTR_ERR(handle); ··· 2597 2064 return error; 2598 2065 } 2599 2066 2600 - 2601 2067 #define EIA_INCR 16 /* must be 2^n */ 2602 2068 #define EIA_MASK (EIA_INCR - 1) 2603 - /* Add the large xattr @inode into @ea_inode_array for later deletion. 2069 + 2070 + /* Add the large xattr @inode into @ea_inode_array for deferred iput(). 2604 2071 * If @ea_inode_array is new or full it will be grown and the old 2605 2072 * contents copied over. 2606 2073 */ ··· 2645 2112 * ext4_xattr_delete_inode() 2646 2113 * 2647 2114 * Free extended attribute resources associated with this inode. Traverse 2648 - * all entries and unlink any xattr inodes associated with this inode. This 2649 - * is called immediately before an inode is freed. We have exclusive 2650 - * access to the inode. If an orphan inode is deleted it will also delete any 2651 - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() 2652 - * to ensure they belong to the parent inode and were not deleted already. 2115 + * all entries and decrement reference on any xattr inodes associated with this 2116 + * inode. This is called immediately before an inode is freed. We have exclusive 2117 + * access to the inode. If an orphan inode is deleted it will also release its 2118 + * references on xattr block and xattr inodes. 
2653 2119 */ 2654 - int 2655 - ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, 2656 - struct ext4_xattr_inode_array **ea_inode_array, 2657 - int extra_credits) 2120 + int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, 2121 + struct ext4_xattr_inode_array **ea_inode_array, 2122 + int extra_credits) 2658 2123 { 2659 2124 struct buffer_head *bh = NULL; 2660 2125 struct ext4_xattr_ibody_header *header; 2661 - struct ext4_inode *raw_inode; 2662 2126 struct ext4_iloc iloc = { .bh = NULL }; 2127 + struct ext4_xattr_entry *entry; 2663 2128 int error; 2664 2129 2665 2130 error = ext4_xattr_ensure_credits(handle, inode, extra_credits, ··· 2669 2138 goto cleanup; 2670 2139 } 2671 2140 2672 - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) 2673 - goto delete_external_ea; 2141 + if (ext4_has_feature_ea_inode(inode->i_sb) && 2142 + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 2674 2143 2675 - error = ext4_get_inode_loc(inode, &iloc); 2676 - if (error) 2677 - goto cleanup; 2678 - 2679 - error = ext4_journal_get_write_access(handle, iloc.bh); 2680 - if (error) 2681 - goto cleanup; 2682 - 2683 - raw_inode = ext4_raw_inode(&iloc); 2684 - header = IHDR(inode, raw_inode); 2685 - ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header), 2686 - false /* block_csum */, ea_inode_array, 2687 - extra_credits); 2688 - 2689 - delete_external_ea: 2690 - if (!EXT4_I(inode)->i_file_acl) { 2691 - error = 0; 2692 - goto cleanup; 2693 - } 2694 - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 2695 - if (!bh) { 2696 - EXT4_ERROR_INODE(inode, "block %llu read error", 2697 - EXT4_I(inode)->i_file_acl); 2698 - error = -EIO; 2699 - goto cleanup; 2700 - } 2701 - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 2702 - BHDR(bh)->h_blocks != cpu_to_le32(1)) { 2703 - EXT4_ERROR_INODE(inode, "bad block %llu", 2704 - EXT4_I(inode)->i_file_acl); 2705 - error = -EFSCORRUPTED; 2706 - goto cleanup; 2707 - } 2708 - 2709 - if 
(ext4_has_feature_ea_inode(inode->i_sb)) { 2710 - error = ext4_journal_get_write_access(handle, bh); 2144 + error = ext4_get_inode_loc(inode, &iloc); 2711 2145 if (error) { 2712 - EXT4_ERROR_INODE(inode, "write access %llu", 2713 - EXT4_I(inode)->i_file_acl); 2146 + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); 2714 2147 goto cleanup; 2715 2148 } 2716 - ext4_xattr_inode_remove_all(handle, inode, bh, 2717 - BFIRST(bh), 2718 - true /* block_csum */, 2719 - ea_inode_array, 2720 - extra_credits); 2149 + 2150 + error = ext4_journal_get_write_access(handle, iloc.bh); 2151 + if (error) { 2152 + EXT4_ERROR_INODE(inode, "write access (error %d)", 2153 + error); 2154 + goto cleanup; 2155 + } 2156 + 2157 + header = IHDR(inode, ext4_raw_inode(&iloc)); 2158 + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 2159 + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, 2160 + IFIRST(header), 2161 + false /* block_csum */, 2162 + ea_inode_array, 2163 + extra_credits, 2164 + false /* skip_quota */); 2721 2165 } 2722 2166 2723 - ext4_xattr_release_block(handle, inode, bh); 2724 - /* Update i_file_acl within the same transaction that releases block. 
*/ 2725 - EXT4_I(inode)->i_file_acl = 0; 2726 - error = ext4_mark_inode_dirty(handle, inode); 2727 - if (error) { 2728 - EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", 2729 - error); 2730 - goto cleanup; 2167 + if (EXT4_I(inode)->i_file_acl) { 2168 + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 2169 + if (!bh) { 2170 + EXT4_ERROR_INODE(inode, "block %llu read error", 2171 + EXT4_I(inode)->i_file_acl); 2172 + error = -EIO; 2173 + goto cleanup; 2174 + } 2175 + error = ext4_xattr_check_block(inode, bh); 2176 + if (error) { 2177 + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", 2178 + EXT4_I(inode)->i_file_acl, error); 2179 + goto cleanup; 2180 + } 2181 + 2182 + if (ext4_has_feature_ea_inode(inode->i_sb)) { 2183 + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); 2184 + entry = EXT4_XATTR_NEXT(entry)) 2185 + if (entry->e_value_inum) 2186 + ext4_xattr_inode_free_quota(inode, 2187 + le32_to_cpu(entry->e_value_size)); 2188 + 2189 + } 2190 + 2191 + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, 2192 + extra_credits); 2193 + /* 2194 + * Update i_file_acl value in the same transaction that releases 2195 + * block. 
2196 + */ 2197 + EXT4_I(inode)->i_file_acl = 0; 2198 + error = ext4_mark_inode_dirty(handle, inode); 2199 + if (error) { 2200 + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", 2201 + error); 2202 + goto cleanup; 2203 + } 2731 2204 } 2205 + error = 0; 2732 2206 cleanup: 2733 2207 brelse(iloc.bh); 2734 2208 brelse(bh); ··· 2742 2206 2743 2207 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) 2744 2208 { 2745 - struct inode *ea_inode; 2746 - int idx = 0; 2209 + int idx; 2747 2210 2748 2211 if (ea_inode_array == NULL) 2749 2212 return; 2750 2213 2751 - for (; idx < ea_inode_array->count; ++idx) { 2752 - ea_inode = ea_inode_array->inodes[idx]; 2753 - clear_nlink(ea_inode); 2754 - iput(ea_inode); 2755 - } 2214 + for (idx = 0; idx < ea_inode_array->count; ++idx) 2215 + iput(ea_inode_array->inodes[idx]); 2756 2216 kfree(ea_inode_array); 2757 2217 } 2758 2218
+2 -15
fs/ext4/xattr.h
··· 70 70 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 71 71 72 72 /* 73 - * Link EA inode back to parent one using i_mtime field. 74 - * Extra integer type conversion added to ignore higher 75 - * bits in i_mtime.tv_sec which might be set by ext4_get() 76 - */ 77 - #define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \ 78 - do { \ 79 - (inode)->i_mtime.tv_sec = inum; \ 80 - } while(0) 81 - 82 - #define EXT4_XATTR_INODE_GET_PARENT(inode) \ 83 - ((__u32)(inode)->i_mtime.tv_sec) 84 - 85 - /* 86 73 * The minimum size of EA value when you start storing it in an external inode 87 74 * size of block - size of header - size of 1 entry - 4 null bytes 88 75 */ ··· 152 165 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); 153 166 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); 154 167 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); 155 - extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len); 168 + extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, 169 + int *credits); 156 170 157 - extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); 158 171 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, 159 172 struct ext4_xattr_inode_array **array, 160 173 int extra_credits);
+5 -4
fs/mbcache.c
··· 13 13 * mb_cache_entry_delete()). 14 14 * 15 15 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. 16 - * They use hash of a block contents as a key and block number as a value. 17 - * That's why keys need not be unique (different xattr blocks may end up having 18 - * the same hash). However block number always uniquely identifies a cache 19 - * entry. 16 + * Ext4 also uses it for deduplication of xattr values stored in inodes. 17 + * They use hash of data as a key and provide a value that may represent a 18 + * block or inode number. That's why keys need not be unique (hash of different 19 + * data may be the same). However user provided value always uniquely 20 + * identifies a cache entry. 20 21 * 21 22 * We provide functions for creation and removal of entries, search by key, 22 23 * and a special "delete entry with given key-value pair" operation. Fixed