Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Some locking and page fault bug fixes from Jan Kara, some ext4
encryption fixes from me, and Li Xi's Project Quota commits"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
fs: clean up the flags definition in uapi/linux/fs.h
ext4: add FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support
ext4: add project quota support
ext4: adds project ID support
ext4 crypto: simplify interfaces to directory entry insert functions
ext4 crypto: add missing locking for keyring_key access
ext4: use pre-zeroed blocks for DAX page faults
ext4: implement allocation of pre-zeroed blocks
ext4: provide ext4_issue_zeroout()
ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag
ext4: document lock ordering
ext4: fix races of writeback with punch hole and zero range
ext4: fix races between buffered IO and collapse / insert range
ext4: move unlocked dio protection from ext4_alloc_file_blocks()
ext4: fix races between page faults and hole punching

+895 -276
+2 -4
fs/ext4/crypto.c
··· 384 384 EXT4_DECRYPT, page->index, page, page); 385 385 } 386 386 387 - int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) 387 + int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, 388 + ext4_fsblk_t pblk, ext4_lblk_t len) 388 389 { 389 390 struct ext4_crypto_ctx *ctx; 390 391 struct page *ciphertext_page = NULL; 391 392 struct bio *bio; 392 - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); 393 - ext4_fsblk_t pblk = ext4_ext_pblock(ex); 394 - unsigned int len = ext4_ext_get_actual_len(ex); 395 393 int ret, err = 0; 396 394 397 395 #if 0
+4
fs/ext4/crypto_key.c
··· 213 213 res = -ENOKEY; 214 214 goto out; 215 215 } 216 + down_read(&keyring_key->sem); 216 217 ukp = user_key_payload(keyring_key); 217 218 if (ukp->datalen != sizeof(struct ext4_encryption_key)) { 218 219 res = -EINVAL; 220 + up_read(&keyring_key->sem); 219 221 goto out; 220 222 } 221 223 master_key = (struct ext4_encryption_key *)ukp->data; ··· 228 226 "ext4: key size incorrect: %d\n", 229 227 master_key->size); 230 228 res = -ENOKEY; 229 + up_read(&keyring_key->sem); 231 230 goto out; 232 231 } 233 232 res = ext4_derive_key_aes(ctx.nonce, master_key->raw, 234 233 raw_key); 234 + up_read(&keyring_key->sem); 235 235 if (res) 236 236 goto out; 237 237 got_key:
+86 -13
fs/ext4/ext4.h
··· 378 378 #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ 379 379 #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 380 380 381 - #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ 382 - #define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ 381 + #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ 382 + #define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ 383 + 384 + #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ 385 + EXT4_IMMUTABLE_FL | \ 386 + EXT4_APPEND_FL | \ 387 + EXT4_NODUMP_FL | \ 388 + EXT4_NOATIME_FL | \ 389 + EXT4_PROJINHERIT_FL) 383 390 384 391 /* Flags that should be inherited by new inodes from their parent. */ 385 392 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 386 393 EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ 387 394 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ 388 - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) 395 + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ 396 + EXT4_PROJINHERIT_FL) 389 397 390 398 /* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ 391 399 #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) ··· 563 555 #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 564 556 /* Request will not result in inode size update (user for fallocate) */ 565 557 #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 566 - /* Do not take i_data_sem locking in ext4_map_blocks */ 567 - #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 568 558 /* Convert written extents to unwritten */ 569 - #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 559 + #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 560 + /* Write zeros to newly created written extents */ 561 + #define EXT4_GET_BLOCKS_ZERO 0x0200 562 + #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ 563 + EXT4_GET_BLOCKS_ZERO) 570 564 571 565 /* 572 566 * The bit position of these flags must not overlap with any of the ··· 625 615 #define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) 626 616 #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) 627 617 #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) 618 + 619 + #ifndef FS_IOC_FSGETXATTR 620 + /* Until the uapi changes get merged for project quota... */ 621 + 622 + #define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) 623 + #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) 624 + 625 + /* 626 + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. 
627 + */ 628 + struct fsxattr { 629 + __u32 fsx_xflags; /* xflags field value (get/set) */ 630 + __u32 fsx_extsize; /* extsize field value (get/set)*/ 631 + __u32 fsx_nextents; /* nextents field value (get) */ 632 + __u32 fsx_projid; /* project identifier (get/set) */ 633 + unsigned char fsx_pad[12]; 634 + }; 635 + 636 + /* 637 + * Flags for the fsx_xflags field 638 + */ 639 + #define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ 640 + #define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ 641 + #define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ 642 + #define FS_XFLAG_APPEND 0x00000010 /* all writes append */ 643 + #define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ 644 + #define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ 645 + #define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ 646 + #define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ 647 + #define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ 648 + #define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ 649 + #define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ 650 + #define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ 651 + #define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ 652 + #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ 653 + #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ 654 + #endif /* !defined(FS_IOC_FSGETXATTR) */ 655 + 656 + #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR 657 + #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR 628 658 629 659 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 630 660 /* ··· 960 910 * by other means, so we have i_data_sem. 961 911 */ 962 912 struct rw_semaphore i_data_sem; 913 + /* 914 + * i_mmap_sem is for serializing page faults with truncate / punch hole 915 + * operations. 
We have to make sure that new page cannot be faulted in 916 + * a section of the inode that is being punched. We cannot easily use 917 + * i_data_sem for this since we need protection for the whole punch 918 + * operation and i_data_sem ranks below transaction start so we have 919 + * to occasionally drop it. 920 + */ 921 + struct rw_semaphore i_mmap_sem; 963 922 struct inode vfs_inode; 964 923 struct jbd2_inode *jinode; 965 924 ··· 1052 993 /* Encryption params */ 1053 994 struct ext4_crypt_info *i_crypt_info; 1054 995 #endif 996 + kprojid_t i_projid; 1055 997 }; 1056 998 1057 999 /* ··· 1308 1248 #endif 1309 1249 1310 1250 /* Number of quota types we support */ 1311 - #define EXT4_MAXQUOTAS 2 1251 + #define EXT4_MAXQUOTAS 3 1312 1252 1313 1253 /* 1314 1254 * fourth extended-fs super-block data in memory ··· 1814 1754 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ 1815 1755 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ 1816 1756 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ 1817 - EXT4_FEATURE_RO_COMPAT_QUOTA) 1757 + EXT4_FEATURE_RO_COMPAT_QUOTA |\ 1758 + EXT4_FEATURE_RO_COMPAT_PROJECT) 1818 1759 1819 1760 #define EXTN_FEATURE_FUNCS(ver) \ 1820 1761 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ ··· 1856 1795 */ 1857 1796 #define EXT4_DEF_RESUID 0 1858 1797 #define EXT4_DEF_RESGID 0 1798 + 1799 + /* 1800 + * Default project ID 1801 + */ 1802 + #define EXT4_DEF_PROJID 0 1859 1803 1860 1804 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 1861 1805 ··· 2300 2234 struct page *ext4_encrypt(struct inode *inode, 2301 2235 struct page *plaintext_page); 2302 2236 int ext4_decrypt(struct page *page); 2303 - int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); 2237 + int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, 2238 + ext4_fsblk_t pblk, ext4_lblk_t len); 2304 2239 2305 2240 #ifdef CONFIG_EXT4_FS_ENCRYPTION 2306 2241 int ext4_init_crypto(void); ··· 2507 2440 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, 
int); 2508 2441 int ext4_get_block_write(struct inode *inode, sector_t iblock, 2509 2442 struct buffer_head *bh_result, int create); 2510 - int ext4_get_block_dax(struct inode *inode, sector_t iblock, 2511 - struct buffer_head *bh_result, int create); 2443 + int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2444 + struct buffer_head *bh_result, int create); 2512 2445 int ext4_get_block(struct inode *inode, sector_t iblock, 2513 2446 struct buffer_head *bh_result, int create); 2514 2447 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, ··· 2551 2484 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2552 2485 loff_t lstart, loff_t lend); 2553 2486 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2487 + extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); 2554 2488 extern qsize_t *ext4_get_reserved_space(struct inode *inode); 2489 + extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); 2555 2490 extern void ext4_da_update_reserve_space(struct inode *inode, 2556 2491 int used, int quota_claim); 2492 + extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, 2493 + ext4_fsblk_t pblk, ext4_lblk_t len); 2557 2494 2558 2495 /* indirect.c */ 2559 2496 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ··· 2919 2848 return changed; 2920 2849 } 2921 2850 2851 + int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, 2852 + loff_t len); 2853 + 2922 2854 struct ext4_group_info { 2923 2855 unsigned long bb_state; 2924 2856 struct rb_root bb_free_root; ··· 3060 2986 struct page *page); 3061 2987 extern int ext4_try_add_inline_entry(handle_t *handle, 3062 2988 struct ext4_filename *fname, 3063 - struct dentry *dentry, 3064 - struct inode *inode); 2989 + struct inode *dir, struct inode *inode); 3065 2990 extern int ext4_try_create_inline_dir(handle_t *handle, 3066 2991 struct inode *parent, 3067 2992 struct 
inode *inode);
+83 -70
fs/ext4/extents.c
··· 3119 3119 { 3120 3120 ext4_fsblk_t ee_pblock; 3121 3121 unsigned int ee_len; 3122 - int ret; 3123 3122 3124 3123 ee_len = ext4_ext_get_actual_len(ex); 3125 3124 ee_pblock = ext4_ext_pblock(ex); 3126 - 3127 - if (ext4_encrypted_inode(inode)) 3128 - return ext4_encrypted_zeroout(inode, ex); 3129 - 3130 - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 3131 - if (ret > 0) 3132 - ret = 0; 3133 - 3134 - return ret; 3125 + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, 3126 + ee_len); 3135 3127 } 3136 3128 3137 3129 /* ··· 4044 4052 } 4045 4053 /* IO end_io complete, convert the filled extent to written */ 4046 4054 if (flags & EXT4_GET_BLOCKS_CONVERT) { 4055 + if (flags & EXT4_GET_BLOCKS_ZERO) { 4056 + if (allocated > map->m_len) 4057 + allocated = map->m_len; 4058 + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, 4059 + allocated); 4060 + if (err < 0) 4061 + goto out2; 4062 + } 4047 4063 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 4048 4064 ppath); 4049 4065 if (ret >= 0) { ··· 4685 4685 if (len <= EXT_UNWRITTEN_MAX_LEN) 4686 4686 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4687 4687 4688 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 4689 - ext4_inode_block_unlocked_dio(inode); 4690 - inode_dio_wait(inode); 4691 - 4692 4688 /* 4693 4689 * credits to insert 1 extent into extent tree 4694 4690 */ ··· 4748 4752 goto retry; 4749 4753 } 4750 4754 4751 - ext4_inode_resume_unlocked_dio(inode); 4752 - 4753 4755 return ret > 0 ? ret2 : ret; 4754 4756 } 4755 4757 ··· 4764 4770 int partial_begin, partial_end; 4765 4771 loff_t start, end; 4766 4772 ext4_lblk_t lblk; 4767 - struct address_space *mapping = inode->i_mapping; 4768 4773 unsigned int blkbits = inode->i_blkbits; 4769 4774 4770 4775 trace_ext4_zero_range(inode, offset, len, mode); ··· 4774 4781 /* Call ext4_force_commit to flush all data in case of data=journal. 
*/ 4775 4782 if (ext4_should_journal_data(inode)) { 4776 4783 ret = ext4_force_commit(inode->i_sb); 4777 - if (ret) 4778 - return ret; 4779 - } 4780 - 4781 - /* 4782 - * Write out all dirty pages to avoid race conditions 4783 - * Then release them. 4784 - */ 4785 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4786 - ret = filemap_write_and_wait_range(mapping, offset, 4787 - offset + len - 1); 4788 4784 if (ret) 4789 4785 return ret; 4790 4786 } ··· 4821 4839 if (mode & FALLOC_FL_KEEP_SIZE) 4822 4840 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4823 4841 4842 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 4843 + ext4_inode_block_unlocked_dio(inode); 4844 + inode_dio_wait(inode); 4845 + 4824 4846 /* Preallocate the range including the unaligned edges */ 4825 4847 if (partial_begin || partial_end) { 4826 4848 ret = ext4_alloc_file_blocks(file, ··· 4833 4847 round_down(offset, 1 << blkbits)) >> blkbits, 4834 4848 new_size, flags, mode); 4835 4849 if (ret) 4836 - goto out_mutex; 4850 + goto out_dio; 4837 4851 4838 4852 } 4839 4853 ··· 4842 4856 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4843 4857 EXT4_EX_NOCACHE); 4844 4858 4845 - /* Now release the pages and zero block aligned part of pages*/ 4859 + /* 4860 + * Prevent page faults from reinstantiating pages we have 4861 + * released from page cache. 
4862 + */ 4863 + down_write(&EXT4_I(inode)->i_mmap_sem); 4864 + ret = ext4_update_disksize_before_punch(inode, offset, len); 4865 + if (ret) { 4866 + up_write(&EXT4_I(inode)->i_mmap_sem); 4867 + goto out_dio; 4868 + } 4869 + /* Now release the pages and zero block aligned part of pages */ 4846 4870 truncate_pagecache_range(inode, start, end - 1); 4847 4871 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4848 4872 4849 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 4850 - ext4_inode_block_unlocked_dio(inode); 4851 - inode_dio_wait(inode); 4852 - 4853 4873 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4854 4874 flags, mode); 4875 + up_write(&EXT4_I(inode)->i_mmap_sem); 4855 4876 if (ret) 4856 4877 goto out_dio; 4857 4878 } ··· 4991 4998 goto out; 4992 4999 } 4993 5000 5001 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 5002 + ext4_inode_block_unlocked_dio(inode); 5003 + inode_dio_wait(inode); 5004 + 4994 5005 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4995 5006 flags, mode); 5007 + ext4_inode_resume_unlocked_dio(inode); 4996 5008 if (ret) 4997 5009 goto out; 4998 5010 ··· 5492 5494 return ret; 5493 5495 } 5494 5496 5495 - /* 5496 - * Need to round down offset to be aligned with page size boundary 5497 - * for page size > block size. 
5498 - */ 5499 - ioffset = round_down(offset, PAGE_SIZE); 5500 - 5501 - /* Write out all dirty pages */ 5502 - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5503 - LLONG_MAX); 5504 - if (ret) 5505 - return ret; 5506 - 5507 - /* Take mutex lock */ 5508 5497 mutex_lock(&inode->i_mutex); 5509 - 5510 5498 /* 5511 5499 * There is no need to overlap collapse range with EOF, in which case 5512 5500 * it is effectively a truncate operation ··· 5508 5524 goto out_mutex; 5509 5525 } 5510 5526 5511 - truncate_pagecache(inode, ioffset); 5512 - 5513 5527 /* Wait for existing dio to complete */ 5514 5528 ext4_inode_block_unlocked_dio(inode); 5515 5529 inode_dio_wait(inode); 5530 + 5531 + /* 5532 + * Prevent page faults from reinstantiating pages we have released from 5533 + * page cache. 5534 + */ 5535 + down_write(&EXT4_I(inode)->i_mmap_sem); 5536 + /* 5537 + * Need to round down offset to be aligned with page size boundary 5538 + * for page size > block size. 5539 + */ 5540 + ioffset = round_down(offset, PAGE_SIZE); 5541 + /* 5542 + * Write tail of the last page before removed range since it will get 5543 + * removed from the page cache below. 5544 + */ 5545 + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); 5546 + if (ret) 5547 + goto out_mmap; 5548 + /* 5549 + * Write data that will be shifted to preserve them when discarding 5550 + * page cache below. We are also protected from pages becoming dirty 5551 + * by i_mmap_sem. 
5552 + */ 5553 + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, 5554 + LLONG_MAX); 5555 + if (ret) 5556 + goto out_mmap; 5557 + truncate_pagecache(inode, ioffset); 5516 5558 5517 5559 credits = ext4_writepage_trans_blocks(inode); 5518 5560 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5519 5561 if (IS_ERR(handle)) { 5520 5562 ret = PTR_ERR(handle); 5521 - goto out_dio; 5563 + goto out_mmap; 5522 5564 } 5523 5565 5524 5566 down_write(&EXT4_I(inode)->i_data_sem); ··· 5583 5573 5584 5574 out_stop: 5585 5575 ext4_journal_stop(handle); 5586 - out_dio: 5576 + out_mmap: 5577 + up_write(&EXT4_I(inode)->i_mmap_sem); 5587 5578 ext4_inode_resume_unlocked_dio(inode); 5588 5579 out_mutex: 5589 5580 mutex_unlock(&inode->i_mutex); ··· 5638 5627 return ret; 5639 5628 } 5640 5629 5641 - /* 5642 - * Need to round down to align start offset to page size boundary 5643 - * for page size > block size. 5644 - */ 5645 - ioffset = round_down(offset, PAGE_SIZE); 5646 - 5647 - /* Write out all dirty pages */ 5648 - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5649 - LLONG_MAX); 5650 - if (ret) 5651 - return ret; 5652 - 5653 - /* Take mutex lock */ 5654 5630 mutex_lock(&inode->i_mutex); 5655 - 5656 5631 /* Currently just for extent based files */ 5657 5632 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5658 5633 ret = -EOPNOTSUPP; ··· 5657 5660 goto out_mutex; 5658 5661 } 5659 5662 5660 - truncate_pagecache(inode, ioffset); 5661 - 5662 5663 /* Wait for existing dio to complete */ 5663 5664 ext4_inode_block_unlocked_dio(inode); 5664 5665 inode_dio_wait(inode); 5666 + 5667 + /* 5668 + * Prevent page faults from reinstantiating pages we have released from 5669 + * page cache. 5670 + */ 5671 + down_write(&EXT4_I(inode)->i_mmap_sem); 5672 + /* 5673 + * Need to round down to align start offset to page size boundary 5674 + * for page size > block size. 
5675 + */ 5676 + ioffset = round_down(offset, PAGE_SIZE); 5677 + /* Write out all dirty pages */ 5678 + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5679 + LLONG_MAX); 5680 + if (ret) 5681 + goto out_mmap; 5682 + truncate_pagecache(inode, ioffset); 5665 5683 5666 5684 credits = ext4_writepage_trans_blocks(inode); 5667 5685 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5668 5686 if (IS_ERR(handle)) { 5669 5687 ret = PTR_ERR(handle); 5670 - goto out_dio; 5688 + goto out_mmap; 5671 5689 } 5672 5690 5673 5691 /* Expand file to avoid data loss if there is error while shifting */ ··· 5753 5741 5754 5742 out_stop: 5755 5743 ext4_journal_stop(handle); 5756 - out_dio: 5744 + out_mmap: 5745 + up_write(&EXT4_I(inode)->i_mmap_sem); 5757 5746 ext4_inode_resume_unlocked_dio(inode); 5758 5747 out_mutex: 5759 5748 mutex_unlock(&inode->i_mutex);
+58 -24
fs/ext4/file.c
··· 193 193 } 194 194 195 195 #ifdef CONFIG_FS_DAX 196 - static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) 197 - { 198 - struct inode *inode = bh->b_assoc_map->host; 199 - /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ 200 - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; 201 - int err; 202 - if (!uptodate) 203 - return; 204 - WARN_ON(!buffer_unwritten(bh)); 205 - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); 206 - } 207 - 208 196 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 209 197 { 210 198 int result; 211 199 handle_t *handle = NULL; 212 - struct super_block *sb = file_inode(vma->vm_file)->i_sb; 200 + struct inode *inode = file_inode(vma->vm_file); 201 + struct super_block *sb = inode->i_sb; 213 202 bool write = vmf->flags & FAULT_FLAG_WRITE; 214 203 215 204 if (write) { 216 205 sb_start_pagefault(sb); 217 206 file_update_time(vma->vm_file); 207 + down_read(&EXT4_I(inode)->i_mmap_sem); 218 208 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 219 209 EXT4_DATA_TRANS_BLOCKS(sb)); 220 - } 210 + } else 211 + down_read(&EXT4_I(inode)->i_mmap_sem); 221 212 222 213 if (IS_ERR(handle)) 223 214 result = VM_FAULT_SIGBUS; 224 215 else 225 - result = __dax_fault(vma, vmf, ext4_get_block_dax, 226 - ext4_end_io_unwritten); 216 + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); 227 217 228 218 if (write) { 229 219 if (!IS_ERR(handle)) 230 220 ext4_journal_stop(handle); 221 + up_read(&EXT4_I(inode)->i_mmap_sem); 231 222 sb_end_pagefault(sb); 232 - } 223 + } else 224 + up_read(&EXT4_I(inode)->i_mmap_sem); 233 225 234 226 return result; 235 227 } ··· 238 246 if (write) { 239 247 sb_start_pagefault(sb); 240 248 file_update_time(vma->vm_file); 249 + down_read(&EXT4_I(inode)->i_mmap_sem); 241 250 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 242 251 ext4_chunk_trans_blocks(inode, 243 252 PMD_SIZE / PAGE_SIZE)); 244 - } 253 + } else 254 + 
down_read(&EXT4_I(inode)->i_mmap_sem); 245 255 246 256 if (IS_ERR(handle)) 247 257 result = VM_FAULT_SIGBUS; 248 258 else 249 259 result = __dax_pmd_fault(vma, addr, pmd, flags, 250 - ext4_get_block_dax, ext4_end_io_unwritten); 260 + ext4_dax_mmap_get_block, NULL); 251 261 252 262 if (write) { 253 263 if (!IS_ERR(handle)) 254 264 ext4_journal_stop(handle); 265 + up_read(&EXT4_I(inode)->i_mmap_sem); 255 266 sb_end_pagefault(sb); 256 - } 267 + } else 268 + up_read(&EXT4_I(inode)->i_mmap_sem); 257 269 258 270 return result; 259 271 } 260 272 261 273 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 262 274 { 263 - return dax_mkwrite(vma, vmf, ext4_get_block_dax, 264 - ext4_end_io_unwritten); 275 + int err; 276 + struct inode *inode = file_inode(vma->vm_file); 277 + 278 + sb_start_pagefault(inode->i_sb); 279 + file_update_time(vma->vm_file); 280 + down_read(&EXT4_I(inode)->i_mmap_sem); 281 + err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); 282 + up_read(&EXT4_I(inode)->i_mmap_sem); 283 + sb_end_pagefault(inode->i_sb); 284 + 285 + return err; 286 + } 287 + 288 + /* 289 + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() 290 + * handler we check for races against truncate. Note that since we cycle through 291 + * i_mmap_sem, we are sure that also any hole punching that began before we 292 + * were called is finished by now and so if it included part of the file we 293 + * are working on, our pte will get unmapped and the check for pte_same() in 294 + * wp_pfn_shared() fails. Thus fault gets retried and things work out as 295 + * desired. 
296 + */ 297 + static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, 298 + struct vm_fault *vmf) 299 + { 300 + struct inode *inode = file_inode(vma->vm_file); 301 + struct super_block *sb = inode->i_sb; 302 + int ret = VM_FAULT_NOPAGE; 303 + loff_t size; 304 + 305 + sb_start_pagefault(sb); 306 + file_update_time(vma->vm_file); 307 + down_read(&EXT4_I(inode)->i_mmap_sem); 308 + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 309 + if (vmf->pgoff >= size) 310 + ret = VM_FAULT_SIGBUS; 311 + up_read(&EXT4_I(inode)->i_mmap_sem); 312 + sb_end_pagefault(sb); 313 + 314 + return ret; 265 315 } 266 316 267 317 static const struct vm_operations_struct ext4_dax_vm_ops = { 268 318 .fault = ext4_dax_fault, 269 319 .pmd_fault = ext4_dax_pmd_fault, 270 320 .page_mkwrite = ext4_dax_mkwrite, 271 - .pfn_mkwrite = dax_pfn_mkwrite, 321 + .pfn_mkwrite = ext4_dax_pfn_mkwrite, 272 322 }; 273 323 #else 274 324 #define ext4_dax_vm_ops ext4_file_vm_ops 275 325 #endif 276 326 277 327 static const struct vm_operations_struct ext4_file_vm_ops = { 278 - .fault = filemap_fault, 328 + .fault = ext4_filemap_fault, 279 329 .map_pages = filemap_map_pages, 280 330 .page_mkwrite = ext4_page_mkwrite, 281 331 };
+7
fs/ext4/ialloc.c
··· 799 799 inode->i_gid = dir->i_gid; 800 800 } else 801 801 inode_init_owner(inode, dir, mode); 802 + 803 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && 804 + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) 805 + ei->i_projid = EXT4_I(dir)->i_projid; 806 + else 807 + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); 808 + 802 809 err = dquot_initialize(inode); 803 810 if (err) 804 811 goto out;
+4 -6
fs/ext4/inline.c
··· 995 995 */ 996 996 static int ext4_add_dirent_to_inline(handle_t *handle, 997 997 struct ext4_filename *fname, 998 - struct dentry *dentry, 998 + struct inode *dir, 999 999 struct inode *inode, 1000 1000 struct ext4_iloc *iloc, 1001 1001 void *inline_start, int inline_size) 1002 1002 { 1003 - struct inode *dir = d_inode(dentry->d_parent); 1004 1003 int err; 1005 1004 struct ext4_dir_entry_2 *de; 1006 1005 ··· 1244 1245 * the new created block. 1245 1246 */ 1246 1247 int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, 1247 - struct dentry *dentry, struct inode *inode) 1248 + struct inode *dir, struct inode *inode) 1248 1249 { 1249 1250 int ret, inline_size; 1250 1251 void *inline_start; 1251 1252 struct ext4_iloc iloc; 1252 - struct inode *dir = d_inode(dentry->d_parent); 1253 1253 1254 1254 ret = ext4_get_inode_loc(dir, &iloc); 1255 1255 if (ret) ··· 1262 1264 EXT4_INLINE_DOTDOT_SIZE; 1263 1265 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; 1264 1266 1265 - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, 1267 + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, 1266 1268 inline_start, inline_size); 1267 1269 if (ret != -ENOSPC) 1268 1270 goto out; ··· 1283 1285 if (inline_size) { 1284 1286 inline_start = ext4_get_inline_xattr_pos(dir, &iloc); 1285 1287 1286 - ret = ext4_add_dirent_to_inline(handle, fname, dentry, 1288 + ret = ext4_add_dirent_to_inline(handle, fname, dir, 1287 1289 inode, &iloc, inline_start, 1288 1290 inline_size); 1289 1291
+215 -53
fs/ext4/inode.c
··· 383 383 return 0; 384 384 } 385 385 386 + int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, 387 + ext4_lblk_t len) 388 + { 389 + int ret; 390 + 391 + if (ext4_encrypted_inode(inode)) 392 + return ext4_encrypted_zeroout(inode, lblk, pblk, len); 393 + 394 + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); 395 + if (ret > 0) 396 + ret = 0; 397 + 398 + return ret; 399 + } 400 + 386 401 #define check_block_validity(inode, map) \ 387 402 __check_block_validity((inode), __func__, __LINE__, (map)) 388 403 ··· 418 403 * out taking i_data_sem. So at the time the unwritten extent 419 404 * could be converted. 420 405 */ 421 - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 422 - down_read(&EXT4_I(inode)->i_data_sem); 406 + down_read(&EXT4_I(inode)->i_data_sem); 423 407 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 424 408 retval = ext4_ext_map_blocks(handle, inode, map, flags & 425 409 EXT4_GET_BLOCKS_KEEP_SIZE); ··· 426 412 retval = ext4_ind_map_blocks(handle, inode, map, flags & 427 413 EXT4_GET_BLOCKS_KEEP_SIZE); 428 414 } 429 - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 430 - up_read((&EXT4_I(inode)->i_data_sem)); 415 + up_read((&EXT4_I(inode)->i_data_sem)); 431 416 432 417 /* 433 418 * We don't check m_len because extent will be collapsed in status ··· 522 509 * Try to see if we can get the block without requesting a new 523 510 * file system block. 
524 511 */ 525 - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 526 - down_read(&EXT4_I(inode)->i_data_sem); 512 + down_read(&EXT4_I(inode)->i_data_sem); 527 513 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 528 514 retval = ext4_ext_map_blocks(handle, inode, map, flags & 529 515 EXT4_GET_BLOCKS_KEEP_SIZE); ··· 553 541 if (ret < 0) 554 542 retval = ret; 555 543 } 556 - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 557 - up_read((&EXT4_I(inode)->i_data_sem)); 544 + up_read((&EXT4_I(inode)->i_data_sem)); 558 545 559 546 found: 560 547 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ··· 637 626 } 638 627 639 628 /* 629 + * We have to zeroout blocks before inserting them into extent 630 + * status tree. Otherwise someone could look them up there and 631 + * use them before they are really zeroed. 632 + */ 633 + if (flags & EXT4_GET_BLOCKS_ZERO && 634 + map->m_flags & EXT4_MAP_MAPPED && 635 + map->m_flags & EXT4_MAP_NEW) { 636 + ret = ext4_issue_zeroout(inode, map->m_lblk, 637 + map->m_pblk, map->m_len); 638 + if (ret) { 639 + retval = ret; 640 + goto out_sem; 641 + } 642 + } 643 + 644 + /* 640 645 * If the extent has been zeroed out, we don't need to update 641 646 * extent status tree. 642 647 */ 643 648 if ((flags & EXT4_GET_BLOCKS_PRE_IO) && 644 649 ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 645 650 if (ext4_es_is_written(&es)) 646 - goto has_zeroout; 651 + goto out_sem; 647 652 } 648 653 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
649 654 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ··· 670 643 status |= EXTENT_STATUS_DELAYED; 671 644 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 672 645 map->m_pblk, status); 673 - if (ret < 0) 646 + if (ret < 0) { 674 647 retval = ret; 648 + goto out_sem; 649 + } 675 650 } 676 651 677 - has_zeroout: 652 + out_sem: 678 653 up_write((&EXT4_I(inode)->i_data_sem)); 679 654 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 680 655 ret = check_block_validity(inode, map); ··· 703 674 map.m_lblk = iblock; 704 675 map.m_len = bh->b_size >> inode->i_blkbits; 705 676 706 - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { 677 + if (flags && !handle) { 707 678 /* Direct IO write... */ 708 679 if (map.m_len > DIO_MAX_BLOCKS) 709 680 map.m_len = DIO_MAX_BLOCKS; ··· 723 694 724 695 map_bh(bh, inode->i_sb, map.m_pblk); 725 696 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 726 - if (IS_DAX(inode) && buffer_unwritten(bh)) { 727 - /* 728 - * dgc: I suspect unwritten conversion on ext4+DAX is 729 - * fundamentally broken here when there are concurrent 730 - * read/write in progress on this inode. 
731 - */ 732 - WARN_ON_ONCE(io_end); 733 - bh->b_assoc_map = inode->i_mapping; 734 - bh->b_private = (void *)(unsigned long)iblock; 735 - } 736 697 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 737 698 set_buffer_defer_completion(bh); 738 699 bh->b_size = inode->i_sb->s_blocksize * map.m_len; ··· 897 878 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 898 879 return ret; 899 880 } 900 - 901 - static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 902 - struct buffer_head *bh_result, int create); 903 881 904 882 #ifdef CONFIG_EXT4_FS_ENCRYPTION 905 883 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, ··· 3070 3054 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3071 3055 } 3072 3056 3073 - static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 3057 + static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, 3074 3058 struct buffer_head *bh_result, int create) 3075 3059 { 3076 - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", 3060 + int ret; 3061 + 3062 + ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", 3077 3063 inode->i_ino, create); 3078 - return _ext4_get_block(inode, iblock, bh_result, 3079 - EXT4_GET_BLOCKS_NO_LOCK); 3064 + ret = _ext4_get_block(inode, iblock, bh_result, 0); 3065 + /* 3066 + * Blocks should have been preallocated! ext4_file_write_iter() checks 3067 + * that. 
3068 + */ 3069 + WARN_ON_ONCE(!buffer_mapped(bh_result)); 3070 + 3071 + return ret; 3080 3072 } 3081 3073 3082 - int ext4_get_block_dax(struct inode *inode, sector_t iblock, 3083 - struct buffer_head *bh_result, int create) 3074 + #ifdef CONFIG_FS_DAX 3075 + int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3076 + struct buffer_head *bh_result, int create) 3084 3077 { 3085 - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; 3086 - if (create) 3087 - flags |= EXT4_GET_BLOCKS_CREATE; 3088 - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", 3078 + int ret, err; 3079 + int credits; 3080 + struct ext4_map_blocks map; 3081 + handle_t *handle = NULL; 3082 + int flags = 0; 3083 + 3084 + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", 3089 3085 inode->i_ino, create); 3090 - return _ext4_get_block(inode, iblock, bh_result, flags); 3086 + map.m_lblk = iblock; 3087 + map.m_len = bh_result->b_size >> inode->i_blkbits; 3088 + credits = ext4_chunk_trans_blocks(inode, map.m_len); 3089 + if (create) { 3090 + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; 3091 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 3092 + if (IS_ERR(handle)) { 3093 + ret = PTR_ERR(handle); 3094 + return ret; 3095 + } 3096 + } 3097 + 3098 + ret = ext4_map_blocks(handle, inode, &map, flags); 3099 + if (create) { 3100 + err = ext4_journal_stop(handle); 3101 + if (ret >= 0 && err < 0) 3102 + ret = err; 3103 + } 3104 + if (ret <= 0) 3105 + goto out; 3106 + if (map.m_flags & EXT4_MAP_UNWRITTEN) { 3107 + int err2; 3108 + 3109 + /* 3110 + * We are protected by i_mmap_sem so we know block cannot go 3111 + * away from under us even though we dropped i_data_sem. 3112 + * Convert extent to written and write zeros there. 3113 + * 3114 + * Note: We may get here even when create == 0. 
3115 + */ 3116 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 3117 + if (IS_ERR(handle)) { 3118 + ret = PTR_ERR(handle); 3119 + goto out; 3120 + } 3121 + 3122 + err = ext4_map_blocks(handle, inode, &map, 3123 + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); 3124 + if (err < 0) 3125 + ret = err; 3126 + err2 = ext4_journal_stop(handle); 3127 + if (err2 < 0 && ret > 0) 3128 + ret = err2; 3129 + } 3130 + out: 3131 + WARN_ON_ONCE(ret == 0 && create); 3132 + if (ret > 0) { 3133 + map_bh(bh_result, inode->i_sb, map.m_pblk); 3134 + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | 3135 + map.m_flags; 3136 + /* 3137 + * At least for now we have to clear BH_New so that DAX code 3138 + * doesn't attempt to zero blocks again in a racy way. 3139 + */ 3140 + bh_result->b_state &= ~(1 << BH_New); 3141 + bh_result->b_size = map.m_len << inode->i_blkbits; 3142 + ret = 0; 3143 + } 3144 + return ret; 3091 3145 } 3146 + #endif 3092 3147 3093 3148 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3094 3149 ssize_t size, void *private) ··· 3230 3143 /* If we do a overwrite dio, i_mutex locking can be released */ 3231 3144 overwrite = *((int *)iocb->private); 3232 3145 3233 - if (overwrite) { 3234 - down_read(&EXT4_I(inode)->i_data_sem); 3146 + if (overwrite) 3235 3147 mutex_unlock(&inode->i_mutex); 3236 - } 3237 3148 3238 3149 /* 3239 3150 * We could direct write to holes and fallocate. 
··· 3274 3189 } 3275 3190 3276 3191 if (overwrite) { 3277 - get_block_func = ext4_get_block_write_nolock; 3192 + get_block_func = ext4_get_block_overwrite; 3278 3193 } else { 3279 3194 get_block_func = ext4_get_block_write; 3280 3195 dio_flags = DIO_LOCKING; ··· 3330 3245 if (iov_iter_rw(iter) == WRITE) 3331 3246 inode_dio_end(inode); 3332 3247 /* take i_mutex locking again if we do a ovewrite dio */ 3333 - if (overwrite) { 3334 - up_read(&EXT4_I(inode)->i_data_sem); 3248 + if (overwrite) 3335 3249 mutex_lock(&inode->i_mutex); 3336 - } 3337 3250 3338 3251 return ret; 3339 3252 } ··· 3642 3559 } 3643 3560 3644 3561 /* 3562 + * We have to make sure i_disksize gets properly updated before we truncate 3563 + * page cache due to hole punching or zero range. Otherwise i_disksize update 3564 + * can get lost as it may have been postponed to submission of writeback but 3565 + * that will never happen after we truncate page cache. 3566 + */ 3567 + int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, 3568 + loff_t len) 3569 + { 3570 + handle_t *handle; 3571 + loff_t size = i_size_read(inode); 3572 + 3573 + WARN_ON(!mutex_is_locked(&inode->i_mutex)); 3574 + if (offset > size || offset + len < size) 3575 + return 0; 3576 + 3577 + if (EXT4_I(inode)->i_disksize >= size) 3578 + return 0; 3579 + 3580 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 3581 + if (IS_ERR(handle)) 3582 + return PTR_ERR(handle); 3583 + ext4_update_i_disksize(inode, size); 3584 + ext4_mark_inode_dirty(handle, inode); 3585 + ext4_journal_stop(handle); 3586 + 3587 + return 0; 3588 + } 3589 + 3590 + /* 3645 3591 * ext4_punch_hole: punches a hole in a file by releaseing the blocks 3646 3592 * associated with the given offset and length 3647 3593 * ··· 3735 3623 3736 3624 } 3737 3625 3626 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 3627 + ext4_inode_block_unlocked_dio(inode); 3628 + inode_dio_wait(inode); 3629 + 3630 + /* 3631 + * Prevent page faults from 
reinstantiating pages we have released from 3632 + * page cache. 3633 + */ 3634 + down_write(&EXT4_I(inode)->i_mmap_sem); 3738 3635 first_block_offset = round_up(offset, sb->s_blocksize); 3739 3636 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 3740 3637 3741 3638 /* Now release the pages and zero block aligned part of pages*/ 3742 - if (last_block_offset > first_block_offset) 3639 + if (last_block_offset > first_block_offset) { 3640 + ret = ext4_update_disksize_before_punch(inode, offset, length); 3641 + if (ret) 3642 + goto out_dio; 3743 3643 truncate_pagecache_range(inode, first_block_offset, 3744 3644 last_block_offset); 3745 - 3746 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 3747 - ext4_inode_block_unlocked_dio(inode); 3748 - inode_dio_wait(inode); 3645 + } 3749 3646 3750 3647 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3751 3648 credits = ext4_writepage_trans_blocks(inode); ··· 3801 3680 if (IS_SYNC(inode)) 3802 3681 ext4_handle_sync(handle); 3803 3682 3804 - /* Now release the pages again to reduce race window */ 3805 - if (last_block_offset > first_block_offset) 3806 - truncate_pagecache_range(inode, first_block_offset, 3807 - last_block_offset); 3808 - 3809 3683 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3810 3684 ext4_mark_inode_dirty(handle, inode); 3811 3685 out_stop: 3812 3686 ext4_journal_stop(handle); 3813 3687 out_dio: 3688 + up_write(&EXT4_I(inode)->i_mmap_sem); 3814 3689 ext4_inode_resume_unlocked_dio(inode); 3815 3690 out_mutex: 3816 3691 mutex_unlock(&inode->i_mutex); ··· 4193 4076 EXT4_I(inode)->i_inline_off = 0; 4194 4077 } 4195 4078 4079 + int ext4_get_projid(struct inode *inode, kprojid_t *projid) 4080 + { 4081 + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) 4082 + return -EOPNOTSUPP; 4083 + *projid = EXT4_I(inode)->i_projid; 4084 + return 0; 4085 + } 4086 + 4196 4087 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 
4197 4088 { 4198 4089 struct ext4_iloc iloc; ··· 4212 4087 int block; 4213 4088 uid_t i_uid; 4214 4089 gid_t i_gid; 4090 + projid_t i_projid; 4215 4091 4216 4092 inode = iget_locked(sb, ino); 4217 4093 if (!inode) ··· 4262 4136 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4263 4137 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4264 4138 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4139 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && 4140 + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && 4141 + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4142 + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); 4143 + else 4144 + i_projid = EXT4_DEF_PROJID; 4145 + 4265 4146 if (!(test_opt(inode->i_sb, NO_UID32))) { 4266 4147 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4267 4148 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4268 4149 } 4269 4150 i_uid_write(inode, i_uid); 4270 4151 i_gid_write(inode, i_gid); 4152 + ei->i_projid = make_kprojid(&init_user_ns, i_projid); 4271 4153 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 4272 4154 4273 4155 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ··· 4574 4440 int need_datasync = 0, set_large_file = 0; 4575 4441 uid_t i_uid; 4576 4442 gid_t i_gid; 4443 + projid_t i_projid; 4577 4444 4578 4445 spin_lock(&ei->i_raw_lock); 4579 4446 ··· 4587 4452 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4588 4453 i_uid = i_uid_read(inode); 4589 4454 i_gid = i_gid_read(inode); 4455 + i_projid = from_kprojid(&init_user_ns, ei->i_projid); 4590 4456 if (!(test_opt(inode->i_sb, NO_UID32))) { 4591 4457 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); 4592 4458 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); ··· 4665 4529 cpu_to_le16(ei->i_extra_isize); 4666 4530 } 4667 4531 } 4532 + 4533 + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 4534 + EXT4_FEATURE_RO_COMPAT_PROJECT) && 4535 + i_projid != EXT4_DEF_PROJID); 4536 + 4537 + if (EXT4_INODE_SIZE(inode->i_sb) > 
EXT4_GOOD_OLD_INODE_SIZE && 4538 + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4539 + raw_inode->i_projid = cpu_to_le32(i_projid); 4540 + 4668 4541 ext4_inode_csum_set(inode, raw_inode, ei); 4669 4542 spin_unlock(&ei->i_raw_lock); 4670 4543 if (inode->i_sb->s_flags & MS_LAZYTIME) ··· 4969 4824 } else 4970 4825 ext4_wait_for_tail_page_commit(inode); 4971 4826 } 4827 + down_write(&EXT4_I(inode)->i_mmap_sem); 4972 4828 /* 4973 4829 * Truncate pagecache after we've waited for commit 4974 4830 * in data=journal mode to make pages freeable. ··· 4977 4831 truncate_pagecache(inode, inode->i_size); 4978 4832 if (shrink) 4979 4833 ext4_truncate(inode); 4834 + up_write(&EXT4_I(inode)->i_mmap_sem); 4980 4835 } 4981 4836 4982 4837 if (!rc) { ··· 5426 5279 5427 5280 sb_start_pagefault(inode->i_sb); 5428 5281 file_update_time(vma->vm_file); 5282 + 5283 + down_read(&EXT4_I(inode)->i_mmap_sem); 5429 5284 /* Delalloc case is easy... */ 5430 5285 if (test_opt(inode->i_sb, DELALLOC) && 5431 5286 !ext4_should_journal_data(inode) && ··· 5497 5348 out_ret: 5498 5349 ret = block_page_mkwrite_return(ret); 5499 5350 out: 5351 + up_read(&EXT4_I(inode)->i_mmap_sem); 5500 5352 sb_end_pagefault(inode->i_sb); 5501 5353 return ret; 5354 + } 5355 + 5356 + int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 5357 + { 5358 + struct inode *inode = file_inode(vma->vm_file); 5359 + int err; 5360 + 5361 + down_read(&EXT4_I(inode)->i_mmap_sem); 5362 + err = filemap_fault(vma, vmf); 5363 + up_read(&EXT4_I(inode)->i_mmap_sem); 5364 + 5365 + return err; 5502 5366 }
+289 -87
fs/ext4/ioctl.c
··· 14 14 #include <linux/mount.h> 15 15 #include <linux/file.h> 16 16 #include <linux/random.h> 17 + #include <linux/quotaops.h> 17 18 #include <asm/uaccess.h> 18 19 #include "ext4_jbd2.h" 19 20 #include "ext4.h" ··· 203 202 return 1; 204 203 } 205 204 205 + static int ext4_ioctl_setflags(struct inode *inode, 206 + unsigned int flags) 207 + { 208 + struct ext4_inode_info *ei = EXT4_I(inode); 209 + handle_t *handle = NULL; 210 + int err = EPERM, migrate = 0; 211 + struct ext4_iloc iloc; 212 + unsigned int oldflags, mask, i; 213 + unsigned int jflag; 214 + 215 + /* Is it quota file? Do not allow user to mess with it */ 216 + if (IS_NOQUOTA(inode)) 217 + goto flags_out; 218 + 219 + oldflags = ei->i_flags; 220 + 221 + /* The JOURNAL_DATA flag is modifiable only by root */ 222 + jflag = flags & EXT4_JOURNAL_DATA_FL; 223 + 224 + /* 225 + * The IMMUTABLE and APPEND_ONLY flags can only be changed by 226 + * the relevant capability. 227 + * 228 + * This test looks nicer. Thanks to Pauline Middelink 229 + */ 230 + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { 231 + if (!capable(CAP_LINUX_IMMUTABLE)) 232 + goto flags_out; 233 + } 234 + 235 + /* 236 + * The JOURNAL_DATA flag can only be changed by 237 + * the relevant capability. 
238 + */ 239 + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { 240 + if (!capable(CAP_SYS_RESOURCE)) 241 + goto flags_out; 242 + } 243 + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) 244 + migrate = 1; 245 + 246 + if (flags & EXT4_EOFBLOCKS_FL) { 247 + /* we don't support adding EOFBLOCKS flag */ 248 + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { 249 + err = -EOPNOTSUPP; 250 + goto flags_out; 251 + } 252 + } else if (oldflags & EXT4_EOFBLOCKS_FL) 253 + ext4_truncate(inode); 254 + 255 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 256 + if (IS_ERR(handle)) { 257 + err = PTR_ERR(handle); 258 + goto flags_out; 259 + } 260 + if (IS_SYNC(inode)) 261 + ext4_handle_sync(handle); 262 + err = ext4_reserve_inode_write(handle, inode, &iloc); 263 + if (err) 264 + goto flags_err; 265 + 266 + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { 267 + if (!(mask & EXT4_FL_USER_MODIFIABLE)) 268 + continue; 269 + if (mask & flags) 270 + ext4_set_inode_flag(inode, i); 271 + else 272 + ext4_clear_inode_flag(inode, i); 273 + } 274 + 275 + ext4_set_inode_flags(inode); 276 + inode->i_ctime = ext4_current_time(inode); 277 + 278 + err = ext4_mark_iloc_dirty(handle, inode, &iloc); 279 + flags_err: 280 + ext4_journal_stop(handle); 281 + if (err) 282 + goto flags_out; 283 + 284 + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 285 + err = ext4_change_inode_journal_flag(inode, jflag); 286 + if (err) 287 + goto flags_out; 288 + if (migrate) { 289 + if (flags & EXT4_EXTENTS_FL) 290 + err = ext4_ext_migrate(inode); 291 + else 292 + err = ext4_ind_migrate(inode); 293 + } 294 + 295 + flags_out: 296 + return err; 297 + } 298 + 299 + #ifdef CONFIG_QUOTA 300 + static int ext4_ioctl_setproject(struct file *filp, __u32 projid) 301 + { 302 + struct inode *inode = file_inode(filp); 303 + struct super_block *sb = inode->i_sb; 304 + struct ext4_inode_info *ei = EXT4_I(inode); 305 + int err, rc; 306 + handle_t *handle; 307 + kprojid_t kprojid; 308 + struct ext4_iloc iloc; 309 + struct ext4_inode *raw_inode; 
310 + 311 + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 312 + EXT4_FEATURE_RO_COMPAT_PROJECT)) { 313 + if (projid != EXT4_DEF_PROJID) 314 + return -EOPNOTSUPP; 315 + else 316 + return 0; 317 + } 318 + 319 + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) 320 + return -EOPNOTSUPP; 321 + 322 + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); 323 + 324 + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) 325 + return 0; 326 + 327 + err = mnt_want_write_file(filp); 328 + if (err) 329 + return err; 330 + 331 + err = -EPERM; 332 + mutex_lock(&inode->i_mutex); 333 + /* Is it quota file? Do not allow user to mess with it */ 334 + if (IS_NOQUOTA(inode)) 335 + goto out_unlock; 336 + 337 + err = ext4_get_inode_loc(inode, &iloc); 338 + if (err) 339 + goto out_unlock; 340 + 341 + raw_inode = ext4_raw_inode(&iloc); 342 + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { 343 + err = -EOVERFLOW; 344 + brelse(iloc.bh); 345 + goto out_unlock; 346 + } 347 + brelse(iloc.bh); 348 + 349 + dquot_initialize(inode); 350 + 351 + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 352 + EXT4_QUOTA_INIT_BLOCKS(sb) + 353 + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); 354 + if (IS_ERR(handle)) { 355 + err = PTR_ERR(handle); 356 + goto out_unlock; 357 + } 358 + 359 + err = ext4_reserve_inode_write(handle, inode, &iloc); 360 + if (err) 361 + goto out_stop; 362 + 363 + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { 364 + struct dquot *transfer_to[MAXQUOTAS] = { }; 365 + 366 + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); 367 + if (transfer_to[PRJQUOTA]) { 368 + err = __dquot_transfer(inode, transfer_to); 369 + dqput(transfer_to[PRJQUOTA]); 370 + if (err) 371 + goto out_dirty; 372 + } 373 + } 374 + EXT4_I(inode)->i_projid = kprojid; 375 + inode->i_ctime = ext4_current_time(inode); 376 + out_dirty: 377 + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 378 + if (!err) 379 + err = rc; 380 + out_stop: 381 + ext4_journal_stop(handle); 382 + out_unlock: 383 + mutex_unlock(&inode->i_mutex); 384 + 
mnt_drop_write_file(filp); 385 + return err; 386 + } 387 + #else 388 + static int ext4_ioctl_setproject(struct file *filp, __u32 projid) 389 + { 390 + if (projid != EXT4_DEF_PROJID) 391 + return -EOPNOTSUPP; 392 + return 0; 393 + } 394 + #endif 395 + 396 + /* Transfer internal flags to xflags */ 397 + static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) 398 + { 399 + __u32 xflags = 0; 400 + 401 + if (iflags & EXT4_SYNC_FL) 402 + xflags |= FS_XFLAG_SYNC; 403 + if (iflags & EXT4_IMMUTABLE_FL) 404 + xflags |= FS_XFLAG_IMMUTABLE; 405 + if (iflags & EXT4_APPEND_FL) 406 + xflags |= FS_XFLAG_APPEND; 407 + if (iflags & EXT4_NODUMP_FL) 408 + xflags |= FS_XFLAG_NODUMP; 409 + if (iflags & EXT4_NOATIME_FL) 410 + xflags |= FS_XFLAG_NOATIME; 411 + if (iflags & EXT4_PROJINHERIT_FL) 412 + xflags |= FS_XFLAG_PROJINHERIT; 413 + return xflags; 414 + } 415 + 416 + /* Transfer xflags flags to internal */ 417 + static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) 418 + { 419 + unsigned long iflags = 0; 420 + 421 + if (xflags & FS_XFLAG_SYNC) 422 + iflags |= EXT4_SYNC_FL; 423 + if (xflags & FS_XFLAG_IMMUTABLE) 424 + iflags |= EXT4_IMMUTABLE_FL; 425 + if (xflags & FS_XFLAG_APPEND) 426 + iflags |= EXT4_APPEND_FL; 427 + if (xflags & FS_XFLAG_NODUMP) 428 + iflags |= EXT4_NODUMP_FL; 429 + if (xflags & FS_XFLAG_NOATIME) 430 + iflags |= EXT4_NOATIME_FL; 431 + if (xflags & FS_XFLAG_PROJINHERIT) 432 + iflags |= EXT4_PROJINHERIT_FL; 433 + 434 + return iflags; 435 + } 436 + 206 437 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 207 438 { 208 439 struct inode *inode = file_inode(filp); ··· 450 217 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; 451 218 return put_user(flags, (int __user *) arg); 452 219 case EXT4_IOC_SETFLAGS: { 453 - handle_t *handle = NULL; 454 - int err, migrate = 0; 455 - struct ext4_iloc iloc; 456 - unsigned int oldflags, mask, i; 457 - unsigned int jflag; 220 + int err; 458 221 459 222 if (!inode_owner_or_capable(inode)) 460 223 
return -EACCES; ··· 464 235 465 236 flags = ext4_mask_flags(inode->i_mode, flags); 466 237 467 - err = -EPERM; 468 238 mutex_lock(&inode->i_mutex); 469 - /* Is it quota file? Do not allow user to mess with it */ 470 - if (IS_NOQUOTA(inode)) 471 - goto flags_out; 472 - 473 - oldflags = ei->i_flags; 474 - 475 - /* The JOURNAL_DATA flag is modifiable only by root */ 476 - jflag = flags & EXT4_JOURNAL_DATA_FL; 477 - 478 - /* 479 - * The IMMUTABLE and APPEND_ONLY flags can only be changed by 480 - * the relevant capability. 481 - * 482 - * This test looks nicer. Thanks to Pauline Middelink 483 - */ 484 - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { 485 - if (!capable(CAP_LINUX_IMMUTABLE)) 486 - goto flags_out; 487 - } 488 - 489 - /* 490 - * The JOURNAL_DATA flag can only be changed by 491 - * the relevant capability. 492 - */ 493 - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { 494 - if (!capable(CAP_SYS_RESOURCE)) 495 - goto flags_out; 496 - } 497 - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) 498 - migrate = 1; 499 - 500 - if (flags & EXT4_EOFBLOCKS_FL) { 501 - /* we don't support adding EOFBLOCKS flag */ 502 - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { 503 - err = -EOPNOTSUPP; 504 - goto flags_out; 505 - } 506 - } else if (oldflags & EXT4_EOFBLOCKS_FL) 507 - ext4_truncate(inode); 508 - 509 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 510 - if (IS_ERR(handle)) { 511 - err = PTR_ERR(handle); 512 - goto flags_out; 513 - } 514 - if (IS_SYNC(inode)) 515 - ext4_handle_sync(handle); 516 - err = ext4_reserve_inode_write(handle, inode, &iloc); 517 - if (err) 518 - goto flags_err; 519 - 520 - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { 521 - if (!(mask & EXT4_FL_USER_MODIFIABLE)) 522 - continue; 523 - if (mask & flags) 524 - ext4_set_inode_flag(inode, i); 525 - else 526 - ext4_clear_inode_flag(inode, i); 527 - } 528 - 529 - ext4_set_inode_flags(inode); 530 - inode->i_ctime = ext4_current_time(inode); 531 - 532 - err = 
ext4_mark_iloc_dirty(handle, inode, &iloc); 533 - flags_err: 534 - ext4_journal_stop(handle); 535 - if (err) 536 - goto flags_out; 537 - 538 - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 539 - err = ext4_change_inode_journal_flag(inode, jflag); 540 - if (err) 541 - goto flags_out; 542 - if (migrate) { 543 - if (flags & EXT4_EXTENTS_FL) 544 - err = ext4_ext_migrate(inode); 545 - else 546 - err = ext4_ind_migrate(inode); 547 - } 548 - 549 - flags_out: 239 + err = ext4_ioctl_setflags(inode, flags); 550 240 mutex_unlock(&inode->i_mutex); 551 241 mnt_drop_write_file(filp); 552 242 return err; ··· 836 688 #else 837 689 return -EOPNOTSUPP; 838 690 #endif 691 + } 692 + case EXT4_IOC_FSGETXATTR: 693 + { 694 + struct fsxattr fa; 695 + 696 + memset(&fa, 0, sizeof(struct fsxattr)); 697 + ext4_get_inode_flags(ei); 698 + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); 699 + 700 + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 701 + EXT4_FEATURE_RO_COMPAT_PROJECT)) { 702 + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, 703 + EXT4_I(inode)->i_projid); 704 + } 705 + 706 + if (copy_to_user((struct fsxattr __user *)arg, 707 + &fa, sizeof(fa))) 708 + return -EFAULT; 709 + return 0; 710 + } 711 + case EXT4_IOC_FSSETXATTR: 712 + { 713 + struct fsxattr fa; 714 + int err; 715 + 716 + if (copy_from_user(&fa, (struct fsxattr __user *)arg, 717 + sizeof(fa))) 718 + return -EFAULT; 719 + 720 + /* Make sure caller has proper permission */ 721 + if (!inode_owner_or_capable(inode)) 722 + return -EACCES; 723 + 724 + err = mnt_want_write_file(filp); 725 + if (err) 726 + return err; 727 + 728 + flags = ext4_xflags_to_iflags(fa.fsx_xflags); 729 + flags = ext4_mask_flags(inode->i_mode, flags); 730 + 731 + mutex_lock(&inode->i_mutex); 732 + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | 733 + (flags & EXT4_FL_XFLAG_VISIBLE); 734 + err = ext4_ioctl_setflags(inode, flags); 735 + mutex_unlock(&inode->i_mutex); 736 + mnt_drop_write_file(filp); 737 + if (err) 738 + return 
err; 739 + 740 + err = ext4_ioctl_setproject(filp, fa.fsx_projid); 741 + if (err) 742 + return err; 743 + 744 + return 0; 839 745 } 840 746 default: 841 747 return -ENOTTY;
+25 -9
fs/ext4/namei.c
··· 273 273 struct ext4_filename *fname, 274 274 struct ext4_dir_entry_2 **res_dir); 275 275 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, 276 - struct dentry *dentry, struct inode *inode); 276 + struct inode *dir, struct inode *inode); 277 277 278 278 /* checksumming functions */ 279 279 void initialize_dirent_tail(struct ext4_dir_entry_tail *t, ··· 1928 1928 * directory, and adds the dentry to the indexed directory. 1929 1929 */ 1930 1930 static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, 1931 - struct dentry *dentry, 1931 + struct inode *dir, 1932 1932 struct inode *inode, struct buffer_head *bh) 1933 1933 { 1934 - struct inode *dir = d_inode(dentry->d_parent); 1935 1934 struct buffer_head *bh2; 1936 1935 struct dx_root *root; 1937 1936 struct dx_frame frames[2], *frame; ··· 2085 2086 return retval; 2086 2087 2087 2088 if (ext4_has_inline_data(dir)) { 2088 - retval = ext4_try_add_inline_entry(handle, &fname, 2089 - dentry, inode); 2089 + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); 2090 2090 if (retval < 0) 2091 2091 goto out; 2092 2092 if (retval == 1) { ··· 2095 2097 } 2096 2098 2097 2099 if (is_dx(dir)) { 2098 - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); 2100 + retval = ext4_dx_add_entry(handle, &fname, dir, inode); 2099 2101 if (!retval || (retval != ERR_BAD_DX_DIR)) 2100 2102 goto out; 2101 2103 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); ··· 2117 2119 2118 2120 if (blocks == 1 && !dx_fallback && 2119 2121 ext4_has_feature_dir_index(sb)) { 2120 - retval = make_indexed_dir(handle, &fname, dentry, 2122 + retval = make_indexed_dir(handle, &fname, dir, 2121 2123 inode, bh); 2122 2124 bh = NULL; /* make_indexed_dir releases bh */ 2123 2125 goto out; ··· 2152 2154 * Returns 0 for success, or a negative error value 2153 2155 */ 2154 2156 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, 2155 - struct dentry *dentry, struct inode *inode) 2157 + 
struct inode *dir, struct inode *inode) 2156 2158 { 2157 2159 struct dx_frame frames[2], *frame; 2158 2160 struct dx_entry *entries, *at; 2159 2161 struct buffer_head *bh; 2160 - struct inode *dir = d_inode(dentry->d_parent); 2161 2162 struct super_block *sb = dir->i_sb; 2162 2163 struct ext4_dir_entry_2 *de; 2163 2164 int err; ··· 3209 3212 if (ext4_encrypted_inode(dir) && 3210 3213 !ext4_is_child_context_consistent_with_parent(dir, inode)) 3211 3214 return -EPERM; 3215 + 3216 + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && 3217 + (!projid_eq(EXT4_I(dir)->i_projid, 3218 + EXT4_I(old_dentry->d_inode)->i_projid))) 3219 + return -EXDEV; 3220 + 3212 3221 err = dquot_initialize(dir); 3213 3222 if (err) 3214 3223 return err; ··· 3495 3492 int credits; 3496 3493 u8 old_file_type; 3497 3494 3495 + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && 3496 + (!projid_eq(EXT4_I(new_dir)->i_projid, 3497 + EXT4_I(old_dentry->d_inode)->i_projid))) 3498 + return -EXDEV; 3499 + 3498 3500 retval = dquot_initialize(old.dir); 3499 3501 if (retval) 3500 3502 return retval; ··· 3708 3700 !ext4_is_child_context_consistent_with_parent(old_dir, 3709 3701 new.inode))) 3710 3702 return -EPERM; 3703 + 3704 + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && 3705 + !projid_eq(EXT4_I(new_dir)->i_projid, 3706 + EXT4_I(old_dentry->d_inode)->i_projid)) || 3707 + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && 3708 + !projid_eq(EXT4_I(old_dir)->i_projid, 3709 + EXT4_I(new_dentry->d_inode)->i_projid))) 3710 + return -EXDEV; 3711 3711 3712 3712 retval = dquot_initialize(old.dir); 3713 3713 if (retval)
+92 -5
fs/ext4/super.c
··· 80 80 static void ext4_unregister_li_request(struct super_block *sb); 81 81 static void ext4_clear_request_list(void); 82 82 83 + /* 84 + * Lock ordering 85 + * 86 + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and 87 + * i_mmap_rwsem (inode->i_mmap_rwsem)! 88 + * 89 + * page fault path: 90 + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> 91 + * page lock -> i_data_sem (rw) 92 + * 93 + * buffered write path: 94 + * sb_start_write -> i_mutex -> mmap_sem 95 + * sb_start_write -> i_mutex -> transaction start -> page lock -> 96 + * i_data_sem (rw) 97 + * 98 + * truncate: 99 + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> 100 + * i_mmap_rwsem (w) -> page lock 101 + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> 102 + * transaction start -> i_data_sem (rw) 103 + * 104 + * direct IO: 105 + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem 106 + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> 107 + * transaction start -> i_data_sem (rw) 108 + * 109 + * writepages: 110 + * transaction start -> page lock(s) -> i_data_sem (rw) 111 + */ 112 + 83 113 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) 84 114 static struct file_system_type ext2_fs_type = { 85 115 .owner = THIS_MODULE, ··· 988 958 INIT_LIST_HEAD(&ei->i_orphan); 989 959 init_rwsem(&ei->xattr_sem); 990 960 init_rwsem(&ei->i_data_sem); 961 + init_rwsem(&ei->i_mmap_sem); 991 962 inode_init_once(&ei->vfs_inode); 992 963 } 993 964 ··· 1097 1066 } 1098 1067 1099 1068 #ifdef CONFIG_QUOTA 1100 - #define QTYPE2NAME(t) ((t) == USRQUOTA ? 
"user" : "group") 1101 - #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 1069 + static char *quotatypes[] = INITQFNAMES; 1070 + #define QTYPE2NAME(t) (quotatypes[t]) 1102 1071 1103 1072 static int ext4_write_dquot(struct dquot *dquot); 1104 1073 static int ext4_acquire_dquot(struct dquot *dquot); ··· 1131 1100 .write_info = ext4_write_info, 1132 1101 .alloc_dquot = dquot_alloc, 1133 1102 .destroy_dquot = dquot_destroy, 1103 + .get_projid = ext4_get_projid, 1134 1104 }; 1135 1105 1136 1106 static const struct quotactl_ops ext4_qctl_operations = { ··· 2558 2526 "without CONFIG_QUOTA"); 2559 2527 return 0; 2560 2528 } 2529 + if (ext4_has_feature_project(sb) && !readonly) { 2530 + ext4_msg(sb, KERN_ERR, 2531 + "Filesystem with project quota feature cannot be mounted RDWR " 2532 + "without CONFIG_QUOTA"); 2533 + return 0; 2534 + } 2561 2535 #endif /* CONFIG_QUOTA */ 2562 2536 return 1; 2563 2537 } ··· 3692 3654 sb->s_qcop = &dquot_quotactl_sysfile_ops; 3693 3655 else 3694 3656 sb->s_qcop = &ext4_qctl_operations; 3695 - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 3657 + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; 3696 3658 #endif 3697 3659 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); 3698 3660 ··· 4828 4790 return err; 4829 4791 } 4830 4792 4793 + #ifdef CONFIG_QUOTA 4794 + static int ext4_statfs_project(struct super_block *sb, 4795 + kprojid_t projid, struct kstatfs *buf) 4796 + { 4797 + struct kqid qid; 4798 + struct dquot *dquot; 4799 + u64 limit; 4800 + u64 curblock; 4801 + 4802 + qid = make_kqid_projid(projid); 4803 + dquot = dqget(sb, qid); 4804 + if (IS_ERR(dquot)) 4805 + return PTR_ERR(dquot); 4806 + spin_lock(&dq_data_lock); 4807 + 4808 + limit = (dquot->dq_dqb.dqb_bsoftlimit ? 
4809 + dquot->dq_dqb.dqb_bsoftlimit : 4810 + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; 4811 + if (limit && buf->f_blocks > limit) { 4812 + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; 4813 + buf->f_blocks = limit; 4814 + buf->f_bfree = buf->f_bavail = 4815 + (buf->f_blocks > curblock) ? 4816 + (buf->f_blocks - curblock) : 0; 4817 + } 4818 + 4819 + limit = dquot->dq_dqb.dqb_isoftlimit ? 4820 + dquot->dq_dqb.dqb_isoftlimit : 4821 + dquot->dq_dqb.dqb_ihardlimit; 4822 + if (limit && buf->f_files > limit) { 4823 + buf->f_files = limit; 4824 + buf->f_ffree = 4825 + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 4826 + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; 4827 + } 4828 + 4829 + spin_unlock(&dq_data_lock); 4830 + dqput(dquot); 4831 + return 0; 4832 + } 4833 + #endif 4834 + 4831 4835 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) 4832 4836 { 4833 4837 struct super_block *sb = dentry->d_sb; ··· 4902 4822 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 4903 4823 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 4904 4824 4825 + #ifdef CONFIG_QUOTA 4826 + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && 4827 + sb_has_quota_limits_enabled(sb, PRJQUOTA)) 4828 + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); 4829 + #endif 4905 4830 return 0; 4906 4831 } 4907 4832 ··· 5071 4986 struct inode *qf_inode; 5072 4987 unsigned long qf_inums[EXT4_MAXQUOTAS] = { 5073 4988 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5074 - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 4989 + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), 4990 + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) 5075 4991 }; 5076 4992 5077 4993 BUG_ON(!ext4_has_feature_quota(sb)); ··· 5100 5014 int type, err = 0; 5101 5015 unsigned long qf_inums[EXT4_MAXQUOTAS] = { 5102 5016 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5103 - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 5017 + 
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), 5018 + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) 5104 5019 }; 5105 5020 5106 5021 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+2
fs/ext4/truncate.h
··· 10 10 */ 11 11 static inline void ext4_truncate_failed_write(struct inode *inode) 12 12 { 13 + down_write(&EXT4_I(inode)->i_mmap_sem); 13 14 truncate_inode_pages(inode->i_mapping, inode->i_size); 14 15 ext4_truncate(inode); 16 + up_write(&EXT4_I(inode)->i_mmap_sem); 15 17 } 16 18 17 19 /*
+1 -1
include/trace/events/ext4.h
··· 43 43 { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ 44 44 { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ 45 45 { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ 46 - { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }) 46 + { EXT4_GET_BLOCKS_ZERO, "ZERO" }) 47 47 48 48 #define show_mflags(flags) __print_flags(flags, "", \ 49 49 { EXT4_MAP_NEW, "N" }, \
+27 -4
include/uapi/linux/fs.h
··· 2 2 #define _UAPI_LINUX_FS_H 3 3 4 4 /* 5 - * This file has definitions for some important file table 6 - * structures etc. 5 + * This file has definitions for some important file table structures 6 + * and constants and structures used by various generic file system 7 + * ioctl's. Please do not make any changes in this file before 8 + * sending patches for review to linux-fsdevel@vger.kernel.org and 9 + * linux-api@vger.kernel.org. 7 10 */ 8 11 9 12 #include <linux/limits.h> ··· 249 246 250 247 /* 251 248 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) 249 + * 250 + * Note: for historical reasons, these flags were originally used and 251 + * defined for use by ext2/ext3, and then other file systems started 252 + * using these flags so they wouldn't need to write their own version 253 + * of chattr/lsattr (which was shipped as part of e2fsprogs). You 254 + * should think twice before trying to use these flags in new 255 + * contexts, or trying to assign these flags, since they are used both 256 + * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are 257 + * almost out of 32-bit flags. :-) 258 + * 259 + * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from 260 + * XFS to the generic FS level interface. This uses a structure that 261 + * has padding and hence has more room to grow, so it may be more 262 + * appropriate for many new use cases. 263 + * 264 + * Please do not change these flags or interfaces before checking with 265 + * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org. 
252 266 */ 253 267 #define FS_SECRM_FL 0x00000001 /* Secure deletion */ 254 268 #define FS_UNRM_FL 0x00000002 /* Undelete */ ··· 279 259 #define FS_DIRTY_FL 0x00000100 280 260 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ 281 261 #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ 282 - #define FS_ECOMPR_FL 0x00000800 /* Compression error */ 283 262 /* End compression flags --- maybe not all used */ 263 + #define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */ 284 264 #define FS_BTREE_FL 0x00001000 /* btree format dir */ 285 265 #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ 286 266 #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ ··· 288 268 #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ 289 269 #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ 290 270 #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 271 + #define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ 291 272 #define FS_EXTENT_FL 0x00080000 /* Extents */ 292 - #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ 273 + #define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ 274 + #define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ 293 275 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 276 + #define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ 294 277 #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ 295 278 #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ 296 279