Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 bug fixes from Ted Ts'o:
"Various bug fixes for ext4. Perhaps the most serious bug fixed is one
which could cause file system corruptions when performing file punch
operations."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: avoid hang when mounting non-journal filesystems with orphan list
ext4: lock i_mutex when truncating orphan inodes
ext4: do not try to write superblock on ro remount w/o journal
ext4: include journal blocks in df overhead calcs
ext4: remove unaligned AIO warning printk
ext4: fix an incorrect comment about i_mutex
ext4: fix deadlock in journal_unmap_buffer()
ext4: split off ext4_journalled_invalidatepage()
jbd2: fix assertion failure in jbd2_journal_flush()
ext4: check dioread_nolock on remount
ext4: fix extent tree corruption caused by hole punch

+152 -58
+18 -4
fs/ext4/extents.c
··· 2226 2226 * removes index from the index block. 2227 2227 */ 2228 2228 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2229 - struct ext4_ext_path *path) 2229 + struct ext4_ext_path *path, int depth) 2230 2230 { 2231 2231 int err; 2232 2232 ext4_fsblk_t leaf; 2233 2233 2234 2234 /* free index block */ 2235 - path--; 2235 + depth--; 2236 + path = path + depth; 2236 2237 leaf = ext4_idx_pblock(path->p_idx); 2237 2238 if (unlikely(path->p_hdr->eh_entries == 0)) { 2238 2239 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); ··· 2258 2257 2259 2258 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2260 2259 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2260 + 2261 + while (--depth >= 0) { 2262 + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) 2263 + break; 2264 + path--; 2265 + err = ext4_ext_get_access(handle, inode, path); 2266 + if (err) 2267 + break; 2268 + path->p_idx->ei_block = (path+1)->p_idx->ei_block; 2269 + err = ext4_ext_dirty(handle, inode, path); 2270 + if (err) 2271 + break; 2272 + } 2261 2273 return err; 2262 2274 } 2263 2275 ··· 2613 2599 /* if this leaf is free, then we should 2614 2600 * remove it from index block above */ 2615 2601 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2616 - err = ext4_ext_rm_idx(handle, inode, path + depth); 2602 + err = ext4_ext_rm_idx(handle, inode, path, depth); 2617 2603 2618 2604 out: 2619 2605 return err; ··· 2816 2802 /* index is empty, remove it; 2817 2803 * handle must be already prepared by the 2818 2804 * truncatei_leaf() */ 2819 - err = ext4_ext_rm_idx(handle, inode, path + i); 2805 + err = ext4_ext_rm_idx(handle, inode, path, i); 2820 2806 } 2821 2807 /* root level has p_bh == NULL, brelse() eats this */ 2822 2808 brelse(path[i].p_bh);
-8
fs/ext4/file.c
··· 108 108 109 109 /* Unaligned direct AIO must be serialized; see comment above */ 110 110 if (unaligned_aio) { 111 - static unsigned long unaligned_warn_time; 112 - 113 - /* Warn about this once per day */ 114 - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) 115 - ext4_msg(inode->i_sb, KERN_WARNING, 116 - "Unaligned AIO/DIO on inode %ld by %s; " 117 - "performance will be poor.", 118 - inode->i_ino, current->comm); 119 111 mutex_lock(ext4_aio_mutex(inode)); 120 112 ext4_unwritten_wait(inode); 121 113 }
-2
fs/ext4/fsync.c
··· 109 109 * 110 110 * What we do is just kick off a commit and wait on it. This will snapshot the 111 111 * inode to disk. 112 - * 113 - * i_mutex lock is held when entering and exiting this function 114 112 */ 115 113 116 114 int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+84 -15
fs/ext4/inode.c
··· 2880 2880 2881 2881 static void ext4_invalidatepage(struct page *page, unsigned long offset) 2882 2882 { 2883 - journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2884 - 2885 2883 trace_ext4_invalidatepage(page, offset); 2886 2884 2887 2885 /* ··· 2887 2889 */ 2888 2890 if (ext4_should_dioread_nolock(page->mapping->host)) 2889 2891 ext4_invalidatepage_free_endio(page, offset); 2892 + 2893 + /* No journalling happens on data buffers when this function is used */ 2894 + WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2895 + 2896 + block_invalidatepage(page, offset); 2897 + } 2898 + 2899 + static int __ext4_journalled_invalidatepage(struct page *page, 2900 + unsigned long offset) 2901 + { 2902 + journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2903 + 2904 + trace_ext4_journalled_invalidatepage(page, offset); 2905 + 2890 2906 /* 2891 2907 * If it's a full truncate we just forget about the pending dirtying 2892 2908 */ 2893 2909 if (offset == 0) 2894 2910 ClearPageChecked(page); 2895 2911 2896 - if (journal) 2897 - jbd2_journal_invalidatepage(journal, page, offset); 2898 - else 2899 - block_invalidatepage(page, offset); 2912 + return jbd2_journal_invalidatepage(journal, page, offset); 2913 + } 2914 + 2915 + /* Wrapper for aops... 
*/ 2916 + static void ext4_journalled_invalidatepage(struct page *page, 2917 + unsigned long offset) 2918 + { 2919 + WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2900 2920 } 2901 2921 2902 2922 static int ext4_releasepage(struct page *page, gfp_t wait) ··· 3280 3264 .write_end = ext4_journalled_write_end, 3281 3265 .set_page_dirty = ext4_journalled_set_page_dirty, 3282 3266 .bmap = ext4_bmap, 3283 - .invalidatepage = ext4_invalidatepage, 3267 + .invalidatepage = ext4_journalled_invalidatepage, 3284 3268 .releasepage = ext4_releasepage, 3285 3269 .direct_IO = ext4_direct_IO, 3286 3270 .is_partially_uptodate = block_is_partially_uptodate, ··· 4321 4305 } 4322 4306 4323 4307 /* 4308 + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate 4309 + * buffers that are attached to a page straddling i_size and are undergoing 4310 + * commit. In that case we have to wait for commit to finish and try again. 4311 + */ 4312 + static void ext4_wait_for_tail_page_commit(struct inode *inode) 4313 + { 4314 + struct page *page; 4315 + unsigned offset; 4316 + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 4317 + tid_t commit_tid = 0; 4318 + int ret; 4319 + 4320 + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); 4321 + /* 4322 + * All buffers in the last page remain valid? Then there's nothing to 4323 + * do. 
We do the check mainly to optimize the common PAGE_CACHE_SIZE == 4324 + * blocksize case 4325 + */ 4326 + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) 4327 + return; 4328 + while (1) { 4329 + page = find_lock_page(inode->i_mapping, 4330 + inode->i_size >> PAGE_CACHE_SHIFT); 4331 + if (!page) 4332 + return; 4333 + ret = __ext4_journalled_invalidatepage(page, offset); 4334 + unlock_page(page); 4335 + page_cache_release(page); 4336 + if (ret != -EBUSY) 4337 + return; 4338 + commit_tid = 0; 4339 + read_lock(&journal->j_state_lock); 4340 + if (journal->j_committing_transaction) 4341 + commit_tid = journal->j_committing_transaction->t_tid; 4342 + read_unlock(&journal->j_state_lock); 4343 + if (commit_tid) 4344 + jbd2_log_wait_commit(journal, commit_tid); 4345 + } 4346 + } 4347 + 4348 + /* 4324 4349 * ext4_setattr() 4325 4350 * 4326 4351 * Called from notify_change. ··· 4474 4417 } 4475 4418 4476 4419 if (attr->ia_valid & ATTR_SIZE) { 4477 - if (attr->ia_size != i_size_read(inode)) { 4478 - truncate_setsize(inode, attr->ia_size); 4479 - /* Inode size will be reduced, wait for dio in flight. 4480 - * Temporarily disable dioread_nolock to prevent 4481 - * livelock. */ 4420 + if (attr->ia_size != inode->i_size) { 4421 + loff_t oldsize = inode->i_size; 4422 + 4423 + i_size_write(inode, attr->ia_size); 4424 + /* 4425 + * Blocks are going to be removed from the inode. Wait 4426 + * for dio in flight. Temporarily disable 4427 + * dioread_nolock to prevent livelock. 
4428 + */ 4482 4429 if (orphan) { 4483 - ext4_inode_block_unlocked_dio(inode); 4484 - inode_dio_wait(inode); 4485 - ext4_inode_resume_unlocked_dio(inode); 4430 + if (!ext4_should_journal_data(inode)) { 4431 + ext4_inode_block_unlocked_dio(inode); 4432 + inode_dio_wait(inode); 4433 + ext4_inode_resume_unlocked_dio(inode); 4434 + } else 4435 + ext4_wait_for_tail_page_commit(inode); 4486 4436 } 4437 + /* 4438 + * Truncate pagecache after we've waited for commit 4439 + * in data=journal mode to make pages freeable. 4440 + */ 4441 + truncate_pagecache(inode, oldsize, inode->i_size); 4487 4442 } 4488 4443 ext4_truncate(inode); 4489 4444 }
+2 -1
fs/ext4/namei.c
··· 2648 2648 struct ext4_iloc iloc; 2649 2649 int err = 0; 2650 2650 2651 - if (!EXT4_SB(inode->i_sb)->s_journal) 2651 + if ((!EXT4_SB(inode->i_sb)->s_journal) && 2652 + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) 2652 2653 return 0; 2653 2654 2654 2655 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+18 -12
fs/ext4/super.c
··· 1645 1645 unsigned int *journal_ioprio, 1646 1646 int is_remount) 1647 1647 { 1648 - #ifdef CONFIG_QUOTA 1649 1648 struct ext4_sb_info *sbi = EXT4_SB(sb); 1650 - #endif 1651 1649 char *p; 1652 1650 substring_t args[MAX_OPT_ARGS]; 1653 1651 int token; ··· 1694 1696 } 1695 1697 } 1696 1698 #endif 1699 + if (test_opt(sb, DIOREAD_NOLOCK)) { 1700 + int blocksize = 1701 + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 1702 + 1703 + if (blocksize < PAGE_CACHE_SIZE) { 1704 + ext4_msg(sb, KERN_ERR, "can't mount with " 1705 + "dioread_nolock if block size != PAGE_SIZE"); 1706 + return 0; 1707 + } 1708 + } 1697 1709 return 1; 1698 1710 } 1699 1711 ··· 2220 2212 __func__, inode->i_ino, inode->i_size); 2221 2213 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2222 2214 inode->i_ino, inode->i_size); 2215 + mutex_lock(&inode->i_mutex); 2223 2216 ext4_truncate(inode); 2217 + mutex_unlock(&inode->i_mutex); 2224 2218 nr_truncates++; 2225 2219 } else { 2226 2220 ext4_msg(sb, KERN_DEBUG, ··· 3233 3223 memset(buf, 0, PAGE_SIZE); 3234 3224 cond_resched(); 3235 3225 } 3226 + /* Add the journal blocks as well */ 3227 + if (sbi->s_journal) 3228 + overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen); 3229 + 3236 3230 sbi->s_overhead = overhead; 3237 3231 smp_wmb(); 3238 3232 free_page((unsigned long) buf); ··· 3450 3436 clear_opt(sb, DELALLOC); 3451 3437 } 3452 3438 3453 - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 3454 - if (test_opt(sb, DIOREAD_NOLOCK)) { 3455 - if (blocksize < PAGE_SIZE) { 3456 - ext4_msg(sb, KERN_ERR, "can't mount with " 3457 - "dioread_nolock if block size != PAGE_SIZE"); 3458 - goto failed_mount; 3459 - } 3460 - } 3461 - 3462 3439 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3463 3440 (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); 3464 3441 ··· 3491 3486 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) 3492 3487 goto failed_mount; 3493 3488 3489 + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 3494 3490 if (blocksize < EXT4_MIN_BLOCK_SIZE || 3495 3491 blocksize > EXT4_MAX_BLOCK_SIZE) { 3496 3492 ext4_msg(sb, KERN_ERR, ··· 4731 4725 } 4732 4726 4733 4727 ext4_setup_system_zone(sb); 4734 - if (sbi->s_journal == NULL) 4728 + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) 4735 4729 ext4_commit_super(sb, 1); 4736 4730 4737 4731 #ifdef CONFIG_QUOTA
+16 -14
fs/jbd2/transaction.c
··· 209 209 if (!new_transaction) 210 210 goto alloc_transaction; 211 211 write_lock(&journal->j_state_lock); 212 - if (!journal->j_running_transaction) { 212 + if (!journal->j_running_transaction && 213 + !journal->j_barrier_count) { 213 214 jbd2_get_transaction(journal, new_transaction); 214 215 new_transaction = NULL; 215 216 } ··· 1840 1839 1841 1840 BUFFER_TRACE(bh, "entry"); 1842 1841 1843 - retry: 1844 1842 /* 1845 1843 * It is safe to proceed here without the j_list_lock because the 1846 1844 * buffers cannot be stolen by try_to_free_buffers as long as we are ··· 1934 1934 * for commit and try again. 1935 1935 */ 1936 1936 if (partial_page) { 1937 - tid_t tid = journal->j_committing_transaction->t_tid; 1938 - 1939 1937 jbd2_journal_put_journal_head(jh); 1940 1938 spin_unlock(&journal->j_list_lock); 1941 1939 jbd_unlock_bh_state(bh); 1942 1940 write_unlock(&journal->j_state_lock); 1943 - jbd2_log_wait_commit(journal, tid); 1944 - goto retry; 1941 + return -EBUSY; 1945 1942 } 1946 1943 /* 1947 1944 * OK, buffer won't be reachable after truncate. We just set ··· 1999 2002 * @page: page to flush 2000 2003 * @offset: length of page to invalidate. 2001 2004 * 2002 - * Reap page buffers containing data after offset in page. 2003 - * 2005 + * Reap page buffers containing data after offset in page. Can return -EBUSY 2006 + * if buffers are part of the committing transaction and the page is straddling 2007 + * i_size. Caller then has to wait for current commit and try again. 
2004 2008 */ 2005 - void jbd2_journal_invalidatepage(journal_t *journal, 2006 - struct page *page, 2007 - unsigned long offset) 2009 + int jbd2_journal_invalidatepage(journal_t *journal, 2010 + struct page *page, 2011 + unsigned long offset) 2008 2012 { 2009 2013 struct buffer_head *head, *bh, *next; 2010 2014 unsigned int curr_off = 0; 2011 2015 int may_free = 1; 2016 + int ret = 0; 2012 2017 2013 2018 if (!PageLocked(page)) 2014 2019 BUG(); 2015 2020 if (!page_has_buffers(page)) 2016 - return; 2021 + return 0; 2017 2022 2018 2023 /* We will potentially be playing with lists other than just the 2019 2024 * data lists (especially for journaled data mode), so be ··· 2029 2030 if (offset <= curr_off) { 2030 2031 /* This block is wholly outside the truncation point */ 2031 2032 lock_buffer(bh); 2032 - may_free &= journal_unmap_buffer(journal, bh, 2033 - offset > 0); 2033 + ret = journal_unmap_buffer(journal, bh, offset > 0); 2034 2034 unlock_buffer(bh); 2035 + if (ret < 0) 2036 + return ret; 2037 + may_free &= ret; 2035 2038 } 2036 2039 curr_off = next_off; 2037 2040 bh = next; ··· 2044 2043 if (may_free && try_to_free_buffers(page)) 2045 2044 J_ASSERT(!page_has_buffers(page)); 2046 2045 } 2046 + return 0; 2047 2047 } 2048 2048 2049 2049 /*
+1 -1
include/linux/jbd2.h
··· 1098 1098 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); 1099 1099 extern int jbd2_journal_forget (handle_t *, struct buffer_head *); 1100 1100 extern void journal_sync_buffer (struct buffer_head *); 1101 - extern void jbd2_journal_invalidatepage(journal_t *, 1101 + extern int jbd2_journal_invalidatepage(journal_t *, 1102 1102 struct page *, unsigned long); 1103 1103 extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); 1104 1104 extern int jbd2_journal_stop(handle_t *);
+13 -1
include/trace/events/ext4.h
··· 451 451 TP_ARGS(page) 452 452 ); 453 453 454 - TRACE_EVENT(ext4_invalidatepage, 454 + DECLARE_EVENT_CLASS(ext4_invalidatepage_op, 455 455 TP_PROTO(struct page *page, unsigned long offset), 456 456 457 457 TP_ARGS(page, offset), ··· 475 475 MAJOR(__entry->dev), MINOR(__entry->dev), 476 476 (unsigned long) __entry->ino, 477 477 (unsigned long) __entry->index, __entry->offset) 478 + ); 479 + 480 + DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, 481 + TP_PROTO(struct page *page, unsigned long offset), 482 + 483 + TP_ARGS(page, offset) 484 + ); 485 + 486 + DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, 487 + TP_PROTO(struct page *page, unsigned long offset), 488 + 489 + TP_ARGS(page, offset) 478 490 ); 479 491 480 492 TRACE_EVENT(ext4_discard_blocks,