Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Add new ext4 inode tracepoints
  ext4: Don't call sb_issue_discard() in ext4_free_blocks()
  ext4: do not try to grab the s_umount semaphore in ext4_quota_off
  ext4: fix potential race when freeing ext4_io_page structures
  ext4: handle writeback of inodes which are being freed
  ext4: initialize the percpu counters before replaying the journal
  ext4: "ret" may be used uninitialized in ext4_lazyinit_thread()
  ext4: fix lazyinit hang after removing request

 6 files changed, 214 insertions(+), 91 deletions(-)

fs/ext4/ext4.h (+3 -1)
···
 
 struct ext4_io_page {
     struct page    *p_page;
-    int            p_count;
+    atomic_t       p_count;
 };
 
 #define MAX_IO_PAGES    128
···
     spinlock_t i_completed_io_lock;
     /* current io_end structure for async DIO write*/
     ext4_io_end_t *cur_aio_dio;
+    atomic_t i_ioend_count; /* Number of outstanding io_end structs */
 
     /*
      * Transactions that contain inode's metadata needed to complete
···
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern int ext4_end_io_nolock(ext4_io_end_t *io);
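The p_count conversion above is the heart of "ext4: fix potential race when freeing ext4_io_page structures": with a plain int, two CPUs running `--p_count == 0` concurrently (bio completion in interrupt context vs. the submission-path cleanup) can both read 2 and store 1, leaking the page, or both observe 0 and double-free it. Here is a minimal standalone C11 demo, not kernel code, of the guarantee that atomic_dec_and_test() provides — exactly one caller sees the 1 -> 0 transition; the put_ref/frees names are illustrative:

/* Build: cc -std=c11 -pthread demo.c */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int p_count = 2;   /* two owners of one "io_page" */
static atomic_int frees;         /* how many threads performed the free */

static void *put_ref(void *arg)
{
    (void)arg;
    /* atomic_fetch_sub returns the OLD value, so old == 1 means this
     * thread performed the final put and must free exactly once --
     * the userspace equivalent of atomic_dec_and_test() */
    if (atomic_fetch_sub(&p_count, 1) == 1)
        atomic_fetch_add(&frees, 1);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, put_ref, NULL);
    pthread_create(&b, NULL, put_ref, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    assert(atomic_load(&frees) == 1);   /* never 0, never 2 */
    printf("freed exactly once\n");
    return 0;
}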
fs/ext4/inode.c (+3)
···
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
+    trace_ext4_begin_ordered_truncate(inode, new_size);
     return jbd2_journal_begin_ordered_truncate(
             EXT4_SB(inode->i_sb)->s_journal,
             &EXT4_I(inode)->jinode,
···
     handle_t *handle;
     int err;
 
+    trace_ext4_evict_inode(inode);
     if (inode->i_nlink) {
         truncate_inode_pages(&inode->i_data, 0);
         goto no_delete;
···
     int err, ret;
 
     might_sleep();
+    trace_ext4_mark_inode_dirty(inode, _RET_IP_);
     err = ext4_reserve_inode_write(handle, inode, &iloc);
     if (ext4_handle_valid(handle) &&
         EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
fs/ext4/mballoc.c (-2)
···
      * with group lock held. generate_buddy look at
      * them with group lock_held
      */
-    if (test_opt(sb, DISCARD))
-        ext4_issue_discard(sb, block_group, bit, count);
     ext4_lock_group(sb, block_group);
     mb_clear_bits(bitmap_bh->b_data, bit, count);
     mb_free_blocks(inode, &e4b, bit, count);
fs/ext4/page-io.c (+49 -48)
···
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
+#define WQ_HASH_SZ        37
+#define to_ioend_wq(v)    (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
 int __init ext4_init_pageio(void)
 {
+    int i;
+
     io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
     if (io_page_cachep == NULL)
         return -ENOMEM;
···
         kmem_cache_destroy(io_page_cachep);
         return -ENOMEM;
     }
+    for (i = 0; i < WQ_HASH_SZ; i++)
+        init_waitqueue_head(&ioend_wq[i]);
 
     return 0;
 }
···
     kmem_cache_destroy(io_page_cachep);
 }
 
+void ext4_ioend_wait(struct inode *inode)
+{
+    wait_queue_head_t *wq = to_ioend_wq(inode);
+
+    wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
+static void put_io_page(struct ext4_io_page *io_page)
+{
+    if (atomic_dec_and_test(&io_page->p_count)) {
+        end_page_writeback(io_page->p_page);
+        put_page(io_page->p_page);
+        kmem_cache_free(io_page_cachep, io_page);
+    }
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
     int i;
+    wait_queue_head_t *wq;
 
     BUG_ON(!io);
     if (io->page)
         put_page(io->page);
-    for (i = 0; i < io->num_io_pages; i++) {
-        if (--io->pages[i]->p_count == 0) {
-            struct page *page = io->pages[i]->p_page;
-
-            end_page_writeback(page);
-            put_page(page);
-            kmem_cache_free(io_page_cachep, io->pages[i]);
-        }
-    }
+    for (i = 0; i < io->num_io_pages; i++)
+        put_io_page(io->pages[i]);
     io->num_io_pages = 0;
-    iput(io->inode);
+    wq = to_ioend_wq(io->inode);
+    if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+        waitqueue_active(wq))
+        wake_up_all(wq);
     kmem_cache_free(io_end_cachep, io);
 }
···
     io = kmem_cache_alloc(io_end_cachep, flags);
     if (io) {
         memset(io, 0, sizeof(*io));
-        io->inode = igrab(inode);
-        BUG_ON(!io->inode);
+        atomic_inc(&EXT4_I(inode)->i_ioend_count);
+        io->inode = inode;
         INIT_WORK(&io->work, ext4_end_io_work);
         INIT_LIST_HEAD(&io->list);
     }
···
     struct workqueue_struct *wq;
     struct inode *inode;
     unsigned long flags;
-    ext4_fsblk_t err_block;
     int i;
 
     BUG_ON(!io_end);
-    inode = io_end->inode;
     bio->bi_private = NULL;
     bio->bi_end_io = NULL;
     if (test_bit(BIO_UPTODATE, &bio->bi_flags))
         error = 0;
-    err_block = bio->bi_sector >> (inode->i_blkbits - 9);
     bio_put(bio);
-
-    if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-        pr_err("sb umounted, discard end_io request for inode %lu\n",
-               io_end->inode->i_ino);
-        ext4_free_io_end(io_end);
-        return;
-    }
-
-    if (error) {
-        io_end->flag |= EXT4_IO_END_ERROR;
-        ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-                     "(offset %llu size %ld starting block %llu)",
-                     inode->i_ino,
-                     (unsigned long long) io_end->offset,
-                     (long) io_end->size,
-                     (unsigned long long) err_block);
-    }
 
     for (i = 0; i < io_end->num_io_pages; i++) {
         struct page *page = io_end->pages[i]->p_page;
···
         } while (bh != head);
     }
 
-    if (--io_end->pages[i]->p_count == 0) {
-        struct page *page = io_end->pages[i]->p_page;
-
-        end_page_writeback(page);
-        put_page(page);
-        kmem_cache_free(io_page_cachep, io_end->pages[i]);
-    }
+    put_io_page(io_end->pages[i]);
 
     /*
      * If this is a partial write which happened to make
···
         if (!partial_write)
             SetPageUptodate(page);
     }
-
     io_end->num_io_pages = 0;
+    inode = io_end->inode;
+
+    if (error) {
+        io_end->flag |= EXT4_IO_END_ERROR;
+        ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                     "(offset %llu size %ld starting block %llu)",
+                     inode->i_ino,
+                     (unsigned long long) io_end->offset,
+                     (long) io_end->size,
+                     (unsigned long long)
+                     bio->bi_sector >> (inode->i_blkbits - 9));
+    }
 
     /* Add the io_end to per-inode completed io list*/
     spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
···
     bio->bi_private = io->io_end = io_end;
     bio->bi_end_io = ext4_end_bio;
 
-    io_end->inode = inode;
     io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
     io->io_bio = bio;
···
     if ((io_end->num_io_pages == 0) ||
         (io_end->pages[io_end->num_io_pages-1] != io_page)) {
         io_end->pages[io_end->num_io_pages++] = io_page;
-        io_page->p_count++;
+        atomic_inc(&io_page->p_count);
     }
     return 0;
 }
···
         return -ENOMEM;
     }
     io_page->p_page = page;
-    io_page->p_count = 0;
+    atomic_set(&io_page->p_count, 1);
     get_page(page);
 
     for (bh = head = page_buffers(page), block_start = 0;
···
      * PageWriteback bit from the page to prevent the system from
      * wedging later on.
      */
-    if (io_page->p_count == 0) {
-        put_page(page);
-        end_page_writeback(page);
-        kmem_cache_free(io_page_cachep, io_page);
-    }
+    put_io_page(io_page);
     return ret;
 }
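The rework above ("ext4: handle writeback of inodes which are being freed") drops the igrab()/iput() pinning of the inode for every io_end. Instead, outstanding io_ends are counted in i_ioend_count and ext4_destroy_inode() waits on a small hash of wait queues keyed by inode address. A rough userspace sketch of the same counter-plus-hashed-waitqueue pattern using pthreads in place of the kernel primitives; the waitq struct and the op_*/obj_* names are illustrative, not kernel APIs:

/* Build: cc -std=gnu11 -pthread sketch.c -c */
#include <pthread.h>
#include <stdatomic.h>

#define WQ_HASH_SZ 37

struct waitq {
    pthread_mutex_t lock;
    pthread_cond_t cond;
};

/* GNU range initializer, as the kernel itself uses */
static struct waitq wq_hash[WQ_HASH_SZ] = {
    [0 ... WQ_HASH_SZ - 1] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
    }
};

/* hash by object address, like to_ioend_wq() above */
static struct waitq *to_wq(const void *obj)
{
    return &wq_hash[(unsigned long)obj % WQ_HASH_SZ];
}

struct object {
    atomic_int inflight;    /* plays the role of i_ioend_count */
};

/* submission path: account one more in-flight operation */
static void op_start(struct object *o)
{
    atomic_fetch_add(&o->inflight, 1);
}

/* completion path: the final in-flight op wakes teardown waiters */
static void op_end(struct object *o)
{
    struct waitq *wq = to_wq(o);

    if (atomic_fetch_sub(&o->inflight, 1) == 1) {
        pthread_mutex_lock(&wq->lock);
        pthread_cond_broadcast(&wq->cond);
        pthread_mutex_unlock(&wq->lock);
    }
}

/* like ext4_ioend_wait(): block until nothing is in flight, so the
 * object can then be freed without operations having to pin it */
static void obj_wait(struct object *o)
{
    struct waitq *wq = to_wq(o);

    pthread_mutex_lock(&wq->lock);
    while (atomic_load(&o->inflight) != 0)
        pthread_cond_wait(&wq->cond, &wq->lock);
    pthread_mutex_unlock(&wq->lock);
}

Because distinct objects can hash to the same wait queue, a woken waiter always rechecks its own counter before returning; that recheck is also what makes the kernel's waitqueue_active()/wait_event() pairing above safe.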
fs/ext4/super.c (+62 -40)
···
     ei->cur_aio_dio = NULL;
     ei->i_sync_tid = 0;
     ei->i_datasync_tid = 0;
+    atomic_set(&ei->i_ioend_count, 0);
 
     return &ei->vfs_inode;
 }
 
+static int ext4_drop_inode(struct inode *inode)
+{
+    int drop = generic_drop_inode(inode);
+
+    trace_ext4_drop_inode(inode, drop);
+    return drop;
+}
+
 static void ext4_destroy_inode(struct inode *inode)
 {
+    ext4_ioend_wait(inode);
     if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
         ext4_msg(inode->i_sb, KERN_ERR,
                  "Inode %lu (%p): orphan list check failed!",
···
     .destroy_inode  = ext4_destroy_inode,
     .write_inode    = ext4_write_inode,
     .dirty_inode    = ext4_dirty_inode,
+    .drop_inode     = ext4_drop_inode,
     .evict_inode    = ext4_evict_inode,
     .put_super      = ext4_put_super,
     .sync_fs        = ext4_sync_fs,
···
     .destroy_inode  = ext4_destroy_inode,
     .write_inode    = ext4_write_inode,
     .dirty_inode    = ext4_dirty_inode,
+    .drop_inode     = ext4_drop_inode,
     .evict_inode    = ext4_evict_inode,
     .write_super    = ext4_write_super,
     .put_super      = ext4_put_super,
···
     struct ext4_li_request *elr;
     unsigned long next_wakeup;
     DEFINE_WAIT(wait);
-    int ret;
 
     BUG_ON(NULL == eli);
 
···
         elr = list_entry(pos, struct ext4_li_request,
                          lr_request);
 
-        if (time_after_eq(jiffies, elr->lr_next_sched))
-            ret = ext4_run_li_request(elr);
-
-        if (ret) {
-            ret = 0;
-            ext4_remove_li_request(elr);
-            continue;
+        if (time_after_eq(jiffies, elr->lr_next_sched)) {
+            if (ext4_run_li_request(elr) != 0) {
+                /* error, remove the lazy_init job */
+                ext4_remove_li_request(elr);
+                continue;
+            }
         }
 
         if (time_before(elr->lr_next_sched, next_wakeup))
···
         if (freezing(current))
             refrigerator();
 
-        if (time_after_eq(jiffies, next_wakeup)) {
+        if ((time_after_eq(jiffies, next_wakeup)) ||
+            (MAX_JIFFY_OFFSET == next_wakeup)) {
             cond_resched();
             continue;
         }
···
     get_random_bytes(&sbi->s_next_generation, sizeof(u32));
     spin_lock_init(&sbi->s_next_gen_lock);
 
+    err = percpu_counter_init(&sbi->s_freeblocks_counter,
+                              ext4_count_free_blocks(sb));
+    if (!err) {
+        err = percpu_counter_init(&sbi->s_freeinodes_counter,
+                                  ext4_count_free_inodes(sb));
+    }
+    if (!err) {
+        err = percpu_counter_init(&sbi->s_dirs_counter,
+                                  ext4_count_dirs(sb));
+    }
+    if (!err) {
+        err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+    }
+    if (err) {
+        ext4_msg(sb, KERN_ERR, "insufficient memory");
+        goto failed_mount3;
+    }
+
     sbi->s_stripe = ext4_get_stripe_size(sbi);
     sbi->s_max_writeback_mb_bump = 128;
···
     }
     set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
-no_journal:
-    err = percpu_counter_init(&sbi->s_freeblocks_counter,
-                              ext4_count_free_blocks(sb));
-    if (!err)
-        err = percpu_counter_init(&sbi->s_freeinodes_counter,
-                                  ext4_count_free_inodes(sb));
-    if (!err)
-        err = percpu_counter_init(&sbi->s_dirs_counter,
-                                  ext4_count_dirs(sb));
-    if (!err)
-        err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-    if (err) {
-        ext4_msg(sb, KERN_ERR, "insufficient memory");
-        goto failed_mount_wq;
-    }
+    /*
+     * The journal may have updated the bg summary counts, so we
+     * need to update the global counters.
+     */
+    percpu_counter_set(&sbi->s_freeblocks_counter,
+                       ext4_count_free_blocks(sb));
+    percpu_counter_set(&sbi->s_freeinodes_counter,
+                       ext4_count_free_inodes(sb));
+    percpu_counter_set(&sbi->s_dirs_counter,
+                       ext4_count_dirs(sb));
+    percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
 
+no_journal:
     EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
     if (!EXT4_SB(sb)->dio_unwritten_wq) {
         printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
···
         jbd2_journal_destroy(sbi->s_journal);
         sbi->s_journal = NULL;
     }
-    percpu_counter_destroy(&sbi->s_freeblocks_counter);
-    percpu_counter_destroy(&sbi->s_freeinodes_counter);
-    percpu_counter_destroy(&sbi->s_dirs_counter);
-    percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
     if (sbi->s_flex_groups) {
         if (is_vmalloc_addr(sbi->s_flex_groups))
···
         else
             kfree(sbi->s_flex_groups);
     }
+    percpu_counter_destroy(&sbi->s_freeblocks_counter);
+    percpu_counter_destroy(&sbi->s_freeinodes_counter);
+    percpu_counter_destroy(&sbi->s_dirs_counter);
+    percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
     for (i = 0; i < db_count; i++)
         brelse(sbi->s_group_desc[i]);
···
     else
         es->s_kbytes_written =
             cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-    if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
-        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-            &EXT4_SB(sb)->s_freeblocks_counter));
-    if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
-        es->s_free_inodes_count =
-            cpu_to_le32(percpu_counter_sum_positive(
-                &EXT4_SB(sb)->s_freeinodes_counter));
+    ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+        &EXT4_SB(sb)->s_freeblocks_counter));
+    es->s_free_inodes_count =
+        cpu_to_le32(percpu_counter_sum_positive(
+            &EXT4_SB(sb)->s_freeinodes_counter));
     sb->s_dirt = 0;
     BUFFER_TRACE(sbh, "marking dirty");
     mark_buffer_dirty(sbh);
···
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
-    /* Force all delayed allocation blocks to be allocated */
-    if (test_opt(sb, DELALLOC)) {
-        down_read(&sb->s_umount);
+    /* Force all delayed allocation blocks to be allocated.
+     * Caller already holds s_umount sem */
+    if (test_opt(sb, DELALLOC))
         sync_filesystem(sb);
-        up_read(&sb->s_umount);
-    }
 
     return dquot_quota_off(sb, type);
 }
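The mount-path change above ("ext4: initialize the percpu counters before replaying the journal") creates the percpu counters before jbd2 journal recovery runs, so recovery-time updates have somewhere to land, and then refreshes them with percpu_counter_set() afterwards, since replay may change the on-disk free-block/free-inode summary counts. As a rough userspace model of the percpu_counter semantics relied on here — assuming a single counter and a pthreads mutex in place of the real per-CPU machinery in lib/percpu_counter.c; the pc_* names are illustrative:

#include <pthread.h>
#include <stdint.h>

#define BATCH 32

struct pc_counter {
    pthread_mutex_t lock;
    int64_t count;                   /* folded, globally visible value */
};

/* simplification: one process-wide delta, so this models exactly one
 * counter; the kernel keeps a per-CPU delta per counter */
static _Thread_local int64_t pc_delta;

static void pc_init(struct pc_counter *c, int64_t v)
{
    pthread_mutex_init(&c->lock, NULL);
    c->count = v;
}

/* cheap fast path: batch updates locally, fold in occasionally */
static void pc_add(struct pc_counter *c, int64_t d)
{
    pc_delta += d;
    if (pc_delta >= BATCH || pc_delta <= -BATCH) {
        pthread_mutex_lock(&c->lock);
        c->count += pc_delta;
        pthread_mutex_unlock(&c->lock);
        pc_delta = 0;
    }
}

/* like percpu_counter_set(): rewrite the value wholesale after an
 * external event (here: journal replay) has invalidated it; the real
 * version also zeroes every per-CPU delta, this model assumes the
 * caller has quiesced other threads first */
static void pc_set(struct pc_counter *c, int64_t v)
{
    pthread_mutex_lock(&c->lock);
    c->count = v;
    pthread_mutex_unlock(&c->lock);
    pc_delta = 0;
}

/* approximate read; an exact percpu_counter_sum() would also have to
 * visit every thread's unfolded delta */
static int64_t pc_read(struct pc_counter *c)
{
    int64_t v;

    pthread_mutex_lock(&c->lock);
    v = c->count;
    pthread_mutex_unlock(&c->lock);
    return v;
}

This ordering is also why the ext4_commit_super() hunk above can drop its percpu_counter_initialized() guards: once init happens unconditionally before recovery, the counters are always valid by the time the superblock is written.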
include/trace/events/ext4.h (+97)
···
           (unsigned long) __entry->dir, __entry->mode)
 );
 
+TRACE_EVENT(ext4_evict_inode,
+    TP_PROTO(struct inode *inode),
+
+    TP_ARGS(inode),
+
+    TP_STRUCT__entry(
+        __field( int,   dev_major )
+        __field( int,   dev_minor )
+        __field( ino_t, ino       )
+        __field( int,   nlink     )
+    ),
+
+    TP_fast_assign(
+        __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+        __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+        __entry->ino   = inode->i_ino;
+        __entry->nlink = inode->i_nlink;
+    ),
+
+    TP_printk("dev %d,%d ino %lu nlink %d",
+              __entry->dev_major, __entry->dev_minor,
+              (unsigned long) __entry->ino, __entry->nlink)
+);
+
+TRACE_EVENT(ext4_drop_inode,
+    TP_PROTO(struct inode *inode, int drop),
+
+    TP_ARGS(inode, drop),
+
+    TP_STRUCT__entry(
+        __field( int,   dev_major )
+        __field( int,   dev_minor )
+        __field( ino_t, ino       )
+        __field( int,   drop      )
+    ),
+
+    TP_fast_assign(
+        __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+        __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+        __entry->ino  = inode->i_ino;
+        __entry->drop = drop;
+    ),
+
+    TP_printk("dev %d,%d ino %lu drop %d",
+              __entry->dev_major, __entry->dev_minor,
+              (unsigned long) __entry->ino, __entry->drop)
+);
+
+TRACE_EVENT(ext4_mark_inode_dirty,
+    TP_PROTO(struct inode *inode, unsigned long IP),
+
+    TP_ARGS(inode, IP),
+
+    TP_STRUCT__entry(
+        __field( int,           dev_major )
+        __field( int,           dev_minor )
+        __field( ino_t,         ino       )
+        __field( unsigned long, ip        )
+    ),
+
+    TP_fast_assign(
+        __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+        __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+        __entry->ino = inode->i_ino;
+        __entry->ip  = IP;
+    ),
+
+    TP_printk("dev %d,%d ino %lu caller %pF",
+              __entry->dev_major, __entry->dev_minor,
+              (unsigned long) __entry->ino, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_begin_ordered_truncate,
+    TP_PROTO(struct inode *inode, loff_t new_size),
+
+    TP_ARGS(inode, new_size),
+
+    TP_STRUCT__entry(
+        __field( int,    dev_major )
+        __field( int,    dev_minor )
+        __field( ino_t,  ino       )
+        __field( loff_t, new_size  )
+    ),
+
+    TP_fast_assign(
+        __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+        __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+        __entry->ino      = inode->i_ino;
+        __entry->new_size = new_size;
+    ),
+
+    TP_printk("dev %d,%d ino %lu new_size %lld",
+              __entry->dev_major, __entry->dev_minor,
+              (unsigned long) __entry->ino,
+              (long long) __entry->new_size)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_begin,
 
     TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,