Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: Remove automatic enabling of the HUGE_FILE feature flag
ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback
ext4: Update Documentation/filesystems/ext4.txt
ext4: Remove unused mount options: nomballoc, mballoc, nocheck
ext4: Remove compile warnings when building w/o CONFIG_PROC_FS
ext4: Add missing newlines to printk messages
ext4: Fix file fragmentation during large file write.
vfs: Add no_nrwrite_index_update writeback control flag
vfs: Remove the range_cont writeback mode.
ext4: Use tag dirty lookup during mpage_da_submit_io
ext4: let the block device know when unused blocks can be discarded
ext4: Don't reuse released data blocks until transaction commits
ext4: Use an rbtree for tracking blocks freed during transaction.
ext4: Do mballoc init before doing filesystem recovery
ext4: Free ext4_prealloc_space using kmem_cache_free
ext4: Fix Kconfig typo for ext4dev
ext4: Remove an old reference to ext4dev in Makefile comment

+322 -338
+15 -17
Documentation/filesystems/ext4.txt
···
  Ext4 Filesystem
  ===============

- This is a development version of the ext4 filesystem, an advanced level
- of the ext3 filesystem which incorporates scalability and reliability
- enhancements for supporting large filesystems (64 bit) in keeping with
- increasing disk capacities and state-of-the-art feature requirements.
+ Ext4 is an advanced level of the ext3 filesystem which incorporates
+ scalability and reliability enhancements for supporting large filesystems
+ (64 bit) in keeping with increasing disk capacities and state-of-the-art
+ feature requirements.

- Mailing list: linux-ext4@vger.kernel.org
+ Mailing list: linux-ext4@vger.kernel.org
+ Web site:     http://ext4.wiki.kernel.org


  1. Quick usage instructions:
  ===========================

+ Note: More extensive information for getting started with ext4 can be
+       found at the ext4 wiki site at the URL:
+       http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+
    - Compile and install the latest version of e2fsprogs (as of this
-     writing version 1.41) from:
+     writing version 1.41.3) from:

        http://sourceforge.net/project/showfiles.php?group_id=2406

···

      # mke2fs -t ext4 /dev/hda1

- Or configure an existing ext3 filesystem to support extents and set
- the test_fs flag to indicate that it's ok for an in-development
- filesystem to touch this filesystem:
+ Or to configure an existing ext3 filesystem to support extents:

-     # tune2fs -O extents -E test_fs /dev/hda1
+     # tune2fs -O extents /dev/hda1

  If the filesystem was created with 128 byte inodes, it can be
  converted to use 256 byte for greater efficiency via:
···
  The big performance win will come with mballoc, delalloc and flex_bg
  grouping of bitmaps and inode tables. Some test results available here:

- - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
- - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html

  3. Options
  ==========
···
  bsddf		(*)	Make 'df' act like BSD.
  minixdf		Make 'df' act like Minix.

- check=none		Don't do extra checking of bitmaps on mount.
- nocheck
-
  debug			Extra debugging information is sent to syslog.

  errors=remount-ro(*)	Remount the filesystem read-only on an error.
···
  "nobh" option tries to avoid associating buffer
  heads (supported only for "writeback" mode).

- mballoc	(*)	Use the multiple block allocator for block allocation
- nomballoc		disabled multiple block allocator for block allocation.
  stripe=n	Number of filesystem blocks that mballoc will try
  		to use for allocation size and alignment. For RAID5/6
  		systems this should be the number of data
+1 -1
fs/Kconfig
···
  	  filesystem initially.

  	  To compile this file system support as a module, choose M here. The
- 	  module will be called ext4dev.
+ 	  module will be called ext4.

  	  If unsure, say N.

+1 -1
fs/Makefile
···
  # Do not add any filesystems before this line
  obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
  obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
- obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4dev
+ obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
  obj-$(CONFIG_JBD)		+= jbd/
  obj-$(CONFIG_JBD2)		+= jbd2/
  obj-$(CONFIG_EXT2_FS)		+= ext2/
+10 -2
fs/ext4/balloc.c
···

  	/* this isn't the right place to decide whether block is metadata
  	 * inode.c/extents.c knows better, but for safety ... */
- 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- 	    ext4_should_journal_data(inode))
+ 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ 		metadata = 1;
+
+ 	/* We need to make sure we don't reuse blocks released
+ 	 * until the transaction commits. Writeback mode has weak
+ 	 * data consistency, so don't force data blocks to be treated
+ 	 * as metadata when freeing them in writeback mode.
+ 	 */
+ 	if (metadata == 0 && !ext4_should_writeback_data(inode))
  		metadata = 1;

  	sb = inode->i_sb;
-1
fs/ext4/ext4.h
···
  /*
   * Mount flags
   */
- #define EXT4_MOUNT_CHECK		0x00001	/* Do mount-time checks */
  #define EXT4_MOUNT_OLDALLOC		0x00002	/* Don't use the new Orlov allocator */
  #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
  #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
-3
fs/ext4/ext4_sb.h
···
  	struct inode *s_buddy_cache;
  	long s_blocks_reserved;
  	spinlock_t s_reserve_lock;
- 	struct list_head s_active_transaction;
- 	struct list_head s_closed_transaction;
- 	struct list_head s_committed_transaction;
  	spinlock_t s_md_lock;
  	tid_t s_last_transaction;
  	unsigned short *s_mb_offsets, *s_mb_maxs;
+75 -66
fs/ext4/inode.c
···
  	int ret = 0, err, nr_pages, i;
  	unsigned long index, end;
  	struct pagevec pvec;
+ 	long pages_skipped;

  	BUG_ON(mpd->next_page <= mpd->first_page);
  	pagevec_init(&pvec, 0);
···
  	end = mpd->next_page - 1;

  	while (index <= end) {
- 		/* XXX: optimize tail */
- 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ 		/*
+ 		 * We can use PAGECACHE_TAG_DIRTY lookup here because
+ 		 * even though we have cleared the dirty flag on the page,
+ 		 * we still keep the page in the radix tree with tag
+ 		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ 		 * PAGECACHE_TAG_DIRTY is cleared in set_page_writeback,
+ 		 * which is called via the writepage callback below.
+ 		 */
+ 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ 					PAGECACHE_TAG_DIRTY,
+ 					min(end - index,
+ 					    (pgoff_t)PAGEVEC_SIZE-1) + 1);
  		if (nr_pages == 0)
  			break;
  		for (i = 0; i < nr_pages; i++) {
  			struct page *page = pvec.pages[i];

- 			index = page->index;
- 			if (index > end)
- 				break;
- 			index++;
-
+ 			pages_skipped = mpd->wbc->pages_skipped;
  			err = mapping->a_ops->writepage(page, mpd->wbc);
- 			if (!err)
+ 			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ 				/*
+ 				 * the page was written successfully,
+ 				 * without being skipped
+ 				 */
  				mpd->pages_written++;
  			/*
  			 * In error case, we have to continue because
···
  					struct writeback_control *wbc,
  					struct mpage_da_data *mpd)
  {
- 	long to_write;
  	int ret;

  	if (!mpd->get_block)
···
  	mpd->pages_written = 0;
  	mpd->retval = 0;

- 	to_write = wbc->nr_to_write;
-
  	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
  	/*
  	 * Handle last extent of pages
  	 */
  	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
  		if (mpage_da_map_blocks(mpd) == 0)
  			mpage_da_submit_io(mpd);
- 	}

- 	wbc->nr_to_write = to_write - mpd->pages_written;
+ 		mpd->io_done = 1;
+ 		ret = MPAGE_DA_EXTENT_TAIL;
+ 	}
+ 	wbc->nr_to_write -= mpd->pages_written;
  	return ret;
  }
···
  static int ext4_da_writepages(struct address_space *mapping,
  			      struct writeback_control *wbc)
  {
+ 	pgoff_t index;
+ 	int range_whole = 0;
  	handle_t *handle = NULL;
- 	loff_t range_start = 0;
  	struct mpage_da_data mpd;
  	struct inode *inode = mapping->host;
+ 	int no_nrwrite_index_update;
+ 	long pages_written = 0, pages_skipped;
  	int needed_blocks, ret = 0, nr_to_writebump = 0;
- 	long to_write, pages_skipped = 0;
  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

  	/*
···
  		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
  		wbc->nr_to_write = sbi->s_mb_stream_request;
  	}
+ 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ 		range_whole = 1;

- 	if (!wbc->range_cyclic)
- 		/*
- 		 * If range_cyclic is not set force range_cont
- 		 * and save the old writeback_index
- 		 */
- 		wbc->range_cont = 1;
-
- 	range_start = wbc->range_start;
- 	pages_skipped = wbc->pages_skipped;
+ 	if (wbc->range_cyclic)
+ 		index = mapping->writeback_index;
+ 	else
+ 		index = wbc->range_start >> PAGE_CACHE_SHIFT;

  	mpd.wbc = wbc;
  	mpd.inode = mapping->host;

- restart_loop:
- 	to_write = wbc->nr_to_write;
- 	while (!ret && to_write > 0) {
+ 	/*
+ 	 * we don't want write_cache_pages to update
+ 	 * nr_to_write and writeback_index
+ 	 */
+ 	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ 	wbc->no_nrwrite_index_update = 1;
+ 	pages_skipped = wbc->pages_skipped;
+
+ 	while (!ret && wbc->nr_to_write > 0) {

  		/*
  		 * we insert one extent at a time. So we need
···
  			dump_stack();
  			goto out_writepages;
  		}
- 		to_write -= wbc->nr_to_write;
-
  		mpd.get_block = ext4_da_get_block_write;
  		ret = mpage_da_writepages(mapping, wbc, &mpd);

  		ext4_journal_stop(handle);

- 		if (mpd.retval == -ENOSPC)
+ 		if (mpd.retval == -ENOSPC) {
+ 			/* commit the transaction, which would
+ 			 * free the blocks released in the transaction,
+ 			 * and try again
+ 			 */
  			jbd2_journal_force_commit_nested(sbi->s_journal);
-
- 		/* reset the retry count */
- 		if (ret == MPAGE_DA_EXTENT_TAIL) {
+ 			wbc->pages_skipped = pages_skipped;
+ 			ret = 0;
+ 		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
  			/*
  			 * got one extent now try with
  			 * rest of the pages
  			 */
- 			to_write += wbc->nr_to_write;
+ 			pages_written += mpd.pages_written;
+ 			wbc->pages_skipped = pages_skipped;
  			ret = 0;
- 		} else if (wbc->nr_to_write) {
+ 		} else if (wbc->nr_to_write)
  			/*
  			 * There is no more writeout needed
  			 * or we requested for a noblocking writeout
  			 * and we found the device congested
  			 */
- 			to_write += wbc->nr_to_write;
  			break;
- 		}
- 		wbc->nr_to_write = to_write;
  	}
+ 	if (pages_skipped != wbc->pages_skipped)
+ 		printk(KERN_EMERG "This should not happen leaving %s "
+ 			"with nr_to_write = %ld ret = %d\n",
+ 			__func__, wbc->nr_to_write, ret);

- 	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
- 		/* We skipped pages in this loop */
- 		wbc->range_start = range_start;
- 		wbc->nr_to_write = to_write +
- 				wbc->pages_skipped - pages_skipped;
- 		wbc->pages_skipped = pages_skipped;
- 		goto restart_loop;
- 	}
+ 	/* Update index */
+ 	index += pages_written;
+ 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ 		/*
+ 		 * set the writeback_index so that range_cyclic
+ 		 * mode will write it back later
+ 		 */
+ 		mapping->writeback_index = index;

  out_writepages:
- 	wbc->nr_to_write = to_write - nr_to_writebump;
- 	wbc->range_start = range_start;
+ 	if (!no_nrwrite_index_update)
+ 		wbc->no_nrwrite_index_update = 0;
+ 	wbc->nr_to_write -= nr_to_writebump;
  	return ret;
  }
···
  	struct inode *inode = &(ei->vfs_inode);
  	u64 i_blocks = inode->i_blocks;
  	struct super_block *sb = inode->i_sb;
- 	int err = 0;

  	if (i_blocks <= ~0U) {
  		/*
···
  		raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
  		raw_inode->i_blocks_high = 0;
  		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- 	} else if (i_blocks <= 0xffffffffffffULL) {
+ 		return 0;
+ 	}
+ 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ 		return -EFBIG;
+
+ 	if (i_blocks <= 0xffffffffffffULL) {
  		/*
  		 * i_blocks can be represented in a 48 bit variable
  		 * as multiple of 512 bytes
  		 */
- 		err = ext4_update_rocompat_feature(handle, sb,
- 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- 		if (err)
- 			goto err_out;
- 		/* i_block is stored in the split 48 bit fields */
  		raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
  		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
  		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
  	} else {
- 		/*
- 		 * i_blocks should be represented in a 48 bit variable
- 		 * as multiple of file system block size
- 		 */
- 		err = ext4_update_rocompat_feature(handle, sb,
- 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- 		if (err)
- 			goto err_out;
  		ei->i_flags |= EXT4_HUGE_FILE_FL;
  		/* i_block is stored in file system block size */
  		i_blocks = i_blocks >> (inode->i_blkbits - 9);
  		raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
  		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
  	}
- err_out:
- 	return err;
+ 	return 0;
  }

  /*
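Note on the ext4_inode_blocks_set() hunk above: the function now encodes i_blocks as a 48-bit quantity split across two on-disk fields, and returns -EFBIG instead of enabling the HUGE_FILE feature flag automatically. A standalone sketch of just the 48-bit split, for illustration only (the struct and helper names here are hypothetical, not the kernel's):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the on-disk fields: ext4 stores the low
     * 32 bits in i_blocks_lo and the next 16 bits in i_blocks_high. */
    struct fake_raw_inode {
            uint32_t i_blocks_lo;
            uint16_t i_blocks_high;
    };

    /* Split a 48-bit count the way the hunk above does; returns nonzero
     * if the value does not fit in 48 bits (the kernel then switches the
     * unit from 512-byte sectors to filesystem blocks). */
    static int split_i_blocks(struct fake_raw_inode *raw, uint64_t i_blocks)
    {
            if (i_blocks > 0xffffffffffffULL)
                    return -1;
            raw->i_blocks_lo   = (uint32_t)i_blocks;
            raw->i_blocks_high = (uint16_t)(i_blocks >> 32);
            return 0;
    }

    int main(void)
    {
            struct fake_raw_inode raw;
            uint64_t blocks = (1ULL << 40) + 123;   /* needs more than 32 bits */

            if (split_i_blocks(&raw, blocks) == 0)
                    printf("lo=%u high=%u\n", raw.i_blocks_lo, raw.i_blocks_high);
            return 0;
    }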
+138 -131
fs/ext4/mballoc.c
···
  	}

  	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ 	meta_group_info[i]->bb_free_root.rb_node = NULL;

  #ifdef DOUBLE_CHECK
  	{
···
  	}

  	spin_lock_init(&sbi->s_md_lock);
- 	INIT_LIST_HEAD(&sbi->s_active_transaction);
- 	INIT_LIST_HEAD(&sbi->s_closed_transaction);
- 	INIT_LIST_HEAD(&sbi->s_committed_transaction);
  	spin_lock_init(&sbi->s_bal_lock);

  	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
···
  	ext4_mb_init_per_dev_proc(sb);
  	ext4_mb_history_init(sb);

+ 	sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
  	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
  	return 0;
  }
···
  		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
  		list_del(&pa->pa_group_list);
  		count++;
- 		kfree(pa);
+ 		kmem_cache_free(ext4_pspace_cachep, pa);
  	}
  	if (count)
  		mb_debug("mballoc: %u PAs left\n", count);
···
  	int num_meta_group_infos;
  	struct ext4_group_info *grinfo;
  	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- 	/* release freed, non-committed blocks */
- 	spin_lock(&sbi->s_md_lock);
- 	list_splice_init(&sbi->s_closed_transaction,
- 			&sbi->s_committed_transaction);
- 	list_splice_init(&sbi->s_active_transaction,
- 			&sbi->s_committed_transaction);
- 	spin_unlock(&sbi->s_md_lock);
- 	ext4_mb_free_committed_blocks(sb);

  	if (sbi->s_group_info) {
  		for (i = 0; i < sbi->s_groups_count; i++) {
···
  	return 0;
  }

- static noinline_for_stack void
- ext4_mb_free_committed_blocks(struct super_block *sb)
+ /*
+  * This function is called by the jbd2 layer once the commit has finished,
+  * so we know we can free the blocks that were released with that commit.
+  */
+ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
  {
- 	struct ext4_sb_info *sbi = EXT4_SB(sb);
- 	int err;
- 	int i;
- 	int count = 0;
- 	int count2 = 0;
- 	struct ext4_free_metadata *md;
+ 	struct super_block *sb = journal->j_private;
  	struct ext4_buddy e4b;
+ 	struct ext4_group_info *db;
+ 	int err, count = 0, count2 = 0;
+ 	struct ext4_free_data *entry;
+ 	ext4_fsblk_t discard_block;
+ 	struct list_head *l, *ltmp;

- 	if (list_empty(&sbi->s_committed_transaction))
- 		return;
-
- 	/* there is committed blocks to be freed yet */
- 	do {
- 		/* get next array of blocks */
- 		md = NULL;
- 		spin_lock(&sbi->s_md_lock);
- 		if (!list_empty(&sbi->s_committed_transaction)) {
- 			md = list_entry(sbi->s_committed_transaction.next,
- 				struct ext4_free_metadata, list);
- 			list_del(&md->list);
- 		}
- 		spin_unlock(&sbi->s_md_lock);
-
- 		if (md == NULL)
- 			break;
+ 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
+ 		entry = list_entry(l, struct ext4_free_data, list);

  		mb_debug("gonna free %u blocks in group %lu (0x%p):",
- 			md->num, md->group, md);
+ 			entry->count, entry->group, entry);

- 		err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
  		/* we expect to find existing buddy because it's pinned */
  		BUG_ON(err != 0);

+ 		db = e4b.bd_info;
  		/* there are blocks to put in buddy to make them really free */
- 		count += md->num;
+ 		count += entry->count;
  		count2++;
- 		ext4_lock_group(sb, md->group);
- 		for (i = 0; i < md->num; i++) {
- 			mb_debug(" %u", md->blocks[i]);
- 			mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ 		ext4_lock_group(sb, entry->group);
+ 		/* Take it out of the per-group rb tree */
+ 		rb_erase(&entry->node, &(db->bb_free_root));
+ 		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ 		if (!db->bb_free_root.rb_node) {
+ 			/* No more items in the per-group rb tree;
+ 			 * balance refcounts from ext4_mb_free_metadata()
+ 			 */
+ 			page_cache_release(e4b.bd_buddy_page);
+ 			page_cache_release(e4b.bd_bitmap_page);
  		}
- 		mb_debug("\n");
- 		ext4_unlock_group(sb, md->group);
+ 		ext4_unlock_group(sb, entry->group);
+ 		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ 			+ entry->start_blk
+ 			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ 		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+ 			   (unsigned long long) discard_block, entry->count);
+ 		sb_issue_discard(sb, discard_block, entry->count);

- 		/* balance refcounts from ext4_mb_free_metadata() */
- 		page_cache_release(e4b.bd_buddy_page);
- 		page_cache_release(e4b.bd_bitmap_page);
-
- 		kfree(md);
+ 		kmem_cache_free(ext4_free_ext_cachep, entry);
  		ext4_mb_release_desc(&e4b);
-
- 	} while (md);
+ 	}

  	mb_debug("freed %u blocks in %u structures\n", count, count2);
  }
···

  static int ext4_mb_init_per_dev_proc(struct super_block *sb)
  {
+ #ifdef CONFIG_PROC_FS
  	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
  	struct ext4_sb_info *sbi = EXT4_SB(sb);
  	struct proc_dir_entry *proc;
···
  	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
  	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
  	return -ENOMEM;
+ #else
+ 	return 0;
+ #endif
  }

  static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
  {
+ #ifdef CONFIG_PROC_FS
  	struct ext4_sb_info *sbi = EXT4_SB(sb);

  	if (sbi->s_proc == NULL)
···
  	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
  	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
  	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+ #endif
  	return 0;
  }
···
  		kmem_cache_destroy(ext4_pspace_cachep);
  		return -ENOMEM;
  	}
+
+ 	ext4_free_ext_cachep =
+ 		kmem_cache_create("ext4_free_block_extents",
+ 				  sizeof(struct ext4_free_data),
+ 				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+ 	if (ext4_free_ext_cachep == NULL) {
+ 		kmem_cache_destroy(ext4_pspace_cachep);
+ 		kmem_cache_destroy(ext4_ac_cachep);
+ 		return -ENOMEM;
+ 	}
  	return 0;
  }
···
  	/* XXX: synchronize_rcu(); */
  	kmem_cache_destroy(ext4_pspace_cachep);
  	kmem_cache_destroy(ext4_ac_cachep);
+ 	kmem_cache_destroy(ext4_free_ext_cachep);
  }

···
  		goto out1;
  	}

- 	ext4_mb_poll_new_transaction(sb, handle);
-
  	*errp = ext4_mb_initialize_context(ac, ar);
  	if (*errp) {
  		ar->len = 0;
···

  	return block;
  }
- static void ext4_mb_poll_new_transaction(struct super_block *sb,
- 					 handle_t *handle)
+
+ /*
+  * We can merge two free data extents only if the physical blocks
+  * are contiguous, AND the extents were freed by the same transaction,
+  * AND the blocks are associated with the same group.
+  */
+ static int can_merge(struct ext4_free_data *entry1,
+ 		     struct ext4_free_data *entry2)
  {
- 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- 	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- 		return;
-
- 	/* new transaction! time to close last one and free blocks for
- 	 * committed transaction. we know that only transaction can be
- 	 * active, so previos transaction can be being logged and we
- 	 * know that transaction before previous is known to be already
- 	 * logged. this means that now we may free blocks freed in all
- 	 * transactions before previous one. hope I'm clear enough ... */
-
- 	spin_lock(&sbi->s_md_lock);
- 	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- 		mb_debug("new transaction %lu, old %lu\n",
- 				(unsigned long) handle->h_transaction->t_tid,
- 				(unsigned long) sbi->s_last_transaction);
- 		list_splice_init(&sbi->s_closed_transaction,
- 				&sbi->s_committed_transaction);
- 		list_splice_init(&sbi->s_active_transaction,
- 				&sbi->s_closed_transaction);
- 		sbi->s_last_transaction = handle->h_transaction->t_tid;
- 	}
- 	spin_unlock(&sbi->s_md_lock);
-
- 	ext4_mb_free_committed_blocks(sb);
+ 	if ((entry1->t_tid == entry2->t_tid) &&
+ 	    (entry1->group == entry2->group) &&
+ 	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ 		return 1;
+ 	return 0;
  }

  static noinline_for_stack int
···
  	struct ext4_group_info *db = e4b->bd_info;
  	struct super_block *sb = e4b->bd_sb;
  	struct ext4_sb_info *sbi = EXT4_SB(sb);
- 	struct ext4_free_metadata *md;
- 	int i;
+ 	struct ext4_free_data *entry, *new_entry;
+ 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ 	struct rb_node *parent = NULL, *new_node;

  	BUG_ON(e4b->bd_bitmap_page == NULL);
  	BUG_ON(e4b->bd_buddy_page == NULL);

+ 	new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ 	new_entry->start_blk = block;
+ 	new_entry->group = group;
+ 	new_entry->count = count;
+ 	new_entry->t_tid = handle->h_transaction->t_tid;
+ 	new_node = &new_entry->node;
+
  	ext4_lock_group(sb, group);
- 	for (i = 0; i < count; i++) {
- 		md = db->bb_md_cur;
- 		if (md && db->bb_tid != handle->h_transaction->t_tid) {
- 			db->bb_md_cur = NULL;
- 			md = NULL;
- 		}
-
- 		if (md == NULL) {
- 			ext4_unlock_group(sb, group);
- 			md = kmalloc(sizeof(*md), GFP_NOFS);
- 			if (md == NULL)
- 				return -ENOMEM;
- 			md->num = 0;
- 			md->group = group;
-
- 			ext4_lock_group(sb, group);
- 			if (db->bb_md_cur == NULL) {
- 				spin_lock(&sbi->s_md_lock);
- 				list_add(&md->list, &sbi->s_active_transaction);
- 				spin_unlock(&sbi->s_md_lock);
- 				/* protect buddy cache from being freed,
- 				 * otherwise we'll refresh it from
- 				 * on-disk bitmap and lose not-yet-available
- 				 * blocks */
- 				page_cache_get(e4b->bd_buddy_page);
- 				page_cache_get(e4b->bd_bitmap_page);
- 				db->bb_md_cur = md;
- 				db->bb_tid = handle->h_transaction->t_tid;
- 				mb_debug("new md 0x%p for group %lu\n",
- 					 md, md->group);
- 			} else {
- 				kfree(md);
- 				md = db->bb_md_cur;
- 			}
- 		}
-
- 		BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- 		md->blocks[md->num] = block + i;
- 		md->num++;
- 		if (md->num == EXT4_BB_MAX_BLOCKS) {
- 			/* no more space, put full container on a sb's list */
- 			db->bb_md_cur = NULL;
+ 	if (!*n) {
+ 		/* first free block extent. We need to
+ 		 * protect buddy cache from being freed,
+ 		 * otherwise we'll refresh it from
+ 		 * on-disk bitmap and lose not-yet-available
+ 		 * blocks */
+ 		page_cache_get(e4b->bd_buddy_page);
+ 		page_cache_get(e4b->bd_bitmap_page);
+ 	}
+ 	while (*n) {
+ 		parent = *n;
+ 		entry = rb_entry(parent, struct ext4_free_data, node);
+ 		if (block < entry->start_blk)
+ 			n = &(*n)->rb_left;
+ 		else if (block >= (entry->start_blk + entry->count))
+ 			n = &(*n)->rb_right;
+ 		else {
+ 			ext4_error(sb, __func__,
+ 				   "Double free of blocks %d (%d %d)\n",
+ 				   block, entry->start_blk, entry->count);
+ 			return 0;
  		}
  	}
+
+ 	rb_link_node(new_node, parent, n);
+ 	rb_insert_color(new_node, &db->bb_free_root);
+
+ 	/* Now try to see if the extent can be merged to the left and right */
+ 	node = rb_prev(new_node);
+ 	if (node) {
+ 		entry = rb_entry(node, struct ext4_free_data, node);
+ 		if (can_merge(entry, new_entry)) {
+ 			new_entry->start_blk = entry->start_blk;
+ 			new_entry->count += entry->count;
+ 			rb_erase(node, &(db->bb_free_root));
+ 			spin_lock(&sbi->s_md_lock);
+ 			list_del(&entry->list);
+ 			spin_unlock(&sbi->s_md_lock);
+ 			kmem_cache_free(ext4_free_ext_cachep, entry);
+ 		}
+ 	}
+
+ 	node = rb_next(new_node);
+ 	if (node) {
+ 		entry = rb_entry(node, struct ext4_free_data, node);
+ 		if (can_merge(new_entry, entry)) {
+ 			new_entry->count += entry->count;
+ 			rb_erase(node, &(db->bb_free_root));
+ 			spin_lock(&sbi->s_md_lock);
+ 			list_del(&entry->list);
+ 			spin_unlock(&sbi->s_md_lock);
+ 			kmem_cache_free(ext4_free_ext_cachep, entry);
+ 		}
+ 	}
+ 	/* Add the extent to the transaction's private list */
+ 	spin_lock(&sbi->s_md_lock);
+ 	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+ 	spin_unlock(&sbi->s_md_lock);
  	ext4_unlock_group(sb, group);
  	return 0;
  }
···
  	int ret;

  	*freed = 0;
-
- 	ext4_mb_poll_new_transaction(sb, handle);

  	sbi = EXT4_SB(sb);
  	es = EXT4_SB(sb)->s_es;
+19 -12
fs/ext4/mballoc.h
···
  #include <linux/pagemap.h>
  #include <linux/seq_file.h>
  #include <linux/version.h>
+ #include <linux/blkdev.h>
+ #include <linux/marker.h>
  #include "ext4_jbd2.h"
  #include "ext4.h"
  #include "group.h"
···

  static struct kmem_cache *ext4_pspace_cachep;
  static struct kmem_cache *ext4_ac_cachep;
+ static struct kmem_cache *ext4_free_ext_cachep;

- #ifdef EXT4_BB_MAX_BLOCKS
- #undef EXT4_BB_MAX_BLOCKS
- #endif
- #define EXT4_BB_MAX_BLOCKS	30
+ struct ext4_free_data {
+ 	/* this links the free block information from group_info */
+ 	struct rb_node node;

- struct ext4_free_metadata {
- 	ext4_group_t group;
- 	unsigned short num;
- 	ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ 	/* this links the free block information from ext4_sb_info */
  	struct list_head list;
+
+ 	/* group which the free block extent belongs to */
+ 	ext4_group_t group;
+
+ 	/* free block extent */
+ 	ext4_grpblk_t start_blk;
+ 	ext4_grpblk_t count;
+
+ 	/* transaction which freed this extent */
+ 	tid_t t_tid;
  };

  struct ext4_group_info {
  	unsigned long bb_state;
- 	unsigned long bb_tid;
- 	struct ext4_free_metadata *bb_md_cur;
+ 	struct rb_root bb_free_root;
  	unsigned short bb_first_free;
  	unsigned short bb_free;
  	unsigned short bb_fragments;
···

  static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
  					ext4_group_t group);
- static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
- static void ext4_mb_free_committed_blocks(struct super_block *);
  static void ext4_mb_return_to_preallocation(struct inode *inode,
  					struct ext4_buddy *e4b, sector_t block,
  					int count);
···
  					struct super_block *, struct ext4_prealloc_space *pa);
  static int ext4_mb_init_per_dev_proc(struct super_block *sb);
  static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);


  static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+34 -98
fs/ext4/super.c
···
  	 */
  }

- int ext4_update_compat_feature(handle_t *handle,
- 					struct super_block *sb, __u32 compat)
- {
- 	int err = 0;
- 	if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
- 		err = ext4_journal_get_write_access(handle,
- 				EXT4_SB(sb)->s_sbh);
- 		if (err)
- 			return err;
- 		EXT4_SET_COMPAT_FEATURE(sb, compat);
- 		sb->s_dirt = 1;
- 		handle->h_sync = 1;
- 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- 				"call ext4_journal_dirty_metadata");
- 		err = ext4_journal_dirty_metadata(handle,
- 				EXT4_SB(sb)->s_sbh);
- 	}
- 	return err;
- }
-
- int ext4_update_rocompat_feature(handle_t *handle,
- 					struct super_block *sb, __u32 rocompat)
- {
- 	int err = 0;
- 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
- 		err = ext4_journal_get_write_access(handle,
- 				EXT4_SB(sb)->s_sbh);
- 		if (err)
- 			return err;
- 		EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
- 		sb->s_dirt = 1;
- 		handle->h_sync = 1;
- 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- 				"call ext4_journal_dirty_metadata");
- 		err = ext4_journal_dirty_metadata(handle,
- 				EXT4_SB(sb)->s_sbh);
- 	}
- 	return err;
- }
-
- int ext4_update_incompat_feature(handle_t *handle,
- 					struct super_block *sb, __u32 incompat)
- {
- 	int err = 0;
- 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
- 		err = ext4_journal_get_write_access(handle,
- 				EXT4_SB(sb)->s_sbh);
- 		if (err)
- 			return err;
- 		EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
- 		sb->s_dirt = 1;
- 		handle->h_sync = 1;
- 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- 				"call ext4_journal_dirty_metadata");
- 		err = ext4_journal_dirty_metadata(handle,
- 				EXT4_SB(sb)->s_sbh);
- 	}
- 	return err;
- }
-
  /*
   * Open the external journal device
   */
···
  enum {
  	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
  	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- 	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
  	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
  	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
  	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
···
  	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
  	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
  	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
  	Opt_inode_readahead_blks
  };
···
  	{Opt_err_panic, "errors=panic"},
  	{Opt_err_ro, "errors=remount-ro"},
  	{Opt_nouid32, "nouid32"},
- 	{Opt_nocheck, "nocheck"},
- 	{Opt_nocheck, "check=none"},
  	{Opt_debug, "debug"},
  	{Opt_oldalloc, "oldalloc"},
  	{Opt_orlov, "orlov"},
···
  	{Opt_extents, "extents"},
  	{Opt_noextents, "noextents"},
  	{Opt_i_version, "i_version"},
- 	{Opt_mballoc, "mballoc"},
- 	{Opt_nomballoc, "nomballoc"},
  	{Opt_stripe, "stripe=%u"},
  	{Opt_resize, "resize"},
  	{Opt_delalloc, "delalloc"},
···
  		break;
  	case Opt_nouid32:
  		set_opt(sbi->s_mount_opt, NO_UID32);
- 		break;
- 	case Opt_nocheck:
- 		clear_opt(sbi->s_mount_opt, CHECK);
  		break;
  	case Opt_debug:
  		set_opt(sbi->s_mount_opt, DEBUG);
···
  	if (block_bitmap < first_block || block_bitmap > last_block) {
  		printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
  		       "Block bitmap for group %lu not in group "
- 		       "(block %llu)!", i, block_bitmap);
+ 		       "(block %llu)!\n", i, block_bitmap);
  		return 0;
  	}
  	inode_bitmap = ext4_inode_bitmap(sb, gdp);
  	if (inode_bitmap < first_block || inode_bitmap > last_block) {
  		printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
  		       "Inode bitmap for group %lu not in group "
- 		       "(block %llu)!", i, inode_bitmap);
+ 		       "(block %llu)!\n", i, inode_bitmap);
  		return 0;
  	}
  	inode_table = ext4_inode_table(sb, gdp);
···
  	    inode_table + sbi->s_itb_per_group - 1 > last_block) {
  		printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
  		       "Inode table for group %lu not in group "
- 		       "(block %llu)!", i, inode_table);
+ 		       "(block %llu)!\n", i, inode_table);
  		return 0;
  	}
  	spin_lock(sb_bgl_lock(sbi, i));
···
   *
   * Note, this does *not* consider any metadata overhead for vfs i_blocks.
   */
- static loff_t ext4_max_size(int blkbits)
+ static loff_t ext4_max_size(int blkbits, int has_huge_files)
  {
  	loff_t res;
  	loff_t upper_limit = MAX_LFS_FILESIZE;

  	/* small i_blocks in vfs inode? */
- 	if (sizeof(blkcnt_t) < sizeof(u64)) {
+ 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
  		/*
  		 * CONFIG_LSF is not enabled implies the inode
  		 * i_block represent total blocks in 512 bytes
···
   * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
   * We need to be 1 filesystem block less than the 2^48 sector limit.
   */
- static loff_t ext4_max_bitmap_size(int bits)
+ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
  {
  	loff_t res = EXT4_NDIR_BLOCKS;
  	int meta_blocks;
···
  	 * total number of 512 bytes blocks of the file
  	 */

- 	if (sizeof(blkcnt_t) < sizeof(u64)) {
+ 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
  		/*
- 		 * CONFIG_LSF is not enabled implies the inode
- 		 * i_block represent total blocks in 512 bytes
- 		 * 32 == size of vfs inode i_blocks * 8
+ 		 * !has_huge_files or CONFIG_LSF is not enabled
+ 		 * implies the inode i_block represents total blocks in
+ 		 * 512 bytes; 32 == size of vfs inode i_blocks * 8
  		 */
  		upper_limit = (1LL << 32) - 1;

···
  	int blocksize;
  	int db_count;
  	int i;
- 	int needs_recovery;
+ 	int needs_recovery, has_huge_files;
  	__le32 features;
  	__u64 blocks_count;
  	int err;
···
  		       sb->s_id, le32_to_cpu(features));
  		goto failed_mount;
  	}
- 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ 	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ 				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ 	if (has_huge_files) {
  		/*
  		 * Large file size enabled file system can only be
  		 * mount if kernel is build with CONFIG_LSF
···
  		}
  	}

- 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
- 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+ 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ 						      has_huge_files);
+ 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);

  	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
  		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
···
  		       "available.\n");
  	}

+ 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ 		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ 				"requested data journaling mode\n");
+ 		clear_opt(sbi->s_mount_opt, DELALLOC);
+ 	} else if (test_opt(sb, DELALLOC))
+ 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+ 	ext4_ext_init(sb);
+ 	err = ext4_mb_init(sb, needs_recovery);
+ 	if (err) {
+ 		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ 		       err);
+ 		goto failed_mount4;
+ 	}
+
  	/*
  	 * akpm: core read_super() calls in here with the superblock locked.
  	 * That deadlocks, because orphan cleanup needs to lock the superblock
···
  	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
  	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
  	       "writeback");
-
- 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- 		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- 				"requested data journaling mode\n");
- 		clear_opt(sbi->s_mount_opt, DELALLOC);
- 	} else if (test_opt(sb, DELALLOC))
- 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
- 	ext4_ext_init(sb);
- 	err = ext4_mb_init(sb, needs_recovery);
- 	if (err) {
- 		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- 		       err);
- 		goto failed_mount4;
- 	}

  	lock_kernel();
  	return 0;
+3
fs/jbd2/commit.c
···
  	}
  	spin_unlock(&journal->j_list_lock);

+ 	if (journal->j_commit_callback)
+ 		journal->j_commit_callback(journal, commit_transaction);
+
  	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
  		   journal->j_devname, commit_transaction->t_tid,
  		   journal->j_tail_sequence);
+1
fs/jbd2/transaction.c
···
  	transaction->t_expires = jiffies + journal->j_commit_interval;
  	spin_lock_init(&transaction->t_handle_lock);
  	INIT_LIST_HEAD(&transaction->t_inode_list);
+ 	INIT_LIST_HEAD(&transaction->t_private_list);

  	/* Set up the commit timer for the new transaction. */
  	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+9
include/linux/jbd2.h
···
  	 */
  	int t_handle_count;

+ 	/*
+ 	 * For use by the filesystem to store fs-specific data
+ 	 * structures associated with the transaction
+ 	 */
+ 	struct list_head t_private_list;
  };

  struct transaction_run_stats_s {
···
  	int			j_wbufsize;

  	pid_t			j_last_sync_writer;
+
+ 	/* This function is called when a transaction is closed */
+ 	void			(*j_commit_callback)(journal_t *,
+ 						     transaction_t *);

  	/*
  	 * Journal statistics
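Taken together, these two additions form the new jbd2 hook: a client filesystem points j_commit_callback at a handler and queues per-transaction records on t_private_list; jbd2 then invokes the handler from the commit path shown in fs/jbd2/commit.c above, once the transaction is durably committed. A minimal sketch of such a consumer (the myfs_* names are illustrative placeholders; ext4's real consumer is release_blocks_on_commit() in fs/ext4/mballoc.c):

    #include <linux/jbd2.h>
    #include <linux/list.h>
    #include <linux/slab.h>

    struct myfs_free_entry {
    	struct list_head list;		/* linked on txn->t_private_list */
    	unsigned long block, count;	/* extent freed in this transaction */
    };

    /* Runs in the jbd2 commit path once the commit record is on disk,
     * so the extents queued on t_private_list may now be reused safely. */
    static void myfs_on_commit(journal_t *journal, transaction_t *txn)
    {
    	struct list_head *l, *tmp;

    	list_for_each_safe(l, tmp, &txn->t_private_list) {
    		struct myfs_free_entry *e =
    			list_entry(l, struct myfs_free_entry, list);
    		/* ... return e->block .. e->block + e->count - 1
    		 * to the allocator here ... */
    		list_del(&e->list);
    		kfree(e);
    	}
    }

    /* At mount time, once the journal is loaded: */
    static void myfs_setup_journal(journal_t *journal)
    {
    	journal->j_commit_callback = myfs_on_commit;
    }

Entries would be queued from a running handle with list_add(&e->list, &handle->h_transaction->t_private_list), which is exactly how ext4_mb_free_metadata() uses the list above.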
+9 -1
include/linux/writeback.h
···
  	unsigned for_writepages:1;	/* This is a writepages() call */
  	unsigned range_cyclic:1;	/* range_start is cyclic */
  	unsigned more_io:1;		/* more io to be dispatched */
- 	unsigned range_cont:1;
+ 	/*
+ 	 * write_cache_pages() won't update wbc->nr_to_write and
+ 	 * mapping->writeback_index if no_nrwrite_index_update
+ 	 * is set.  write_cache_pages() may write more than we
+ 	 * requested and we want to make sure nr_to_write and
+ 	 * writeback_index are updated in a consistent manner
+ 	 * so we use a single control to update them
+ 	 */
+ 	unsigned no_nrwrite_index_update:1;
  };

  /*
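For a caller that wants to drive write_cache_pages() in a loop and do its own accounting, the new flag is used roughly as follows. This is a sketch modeled on the ext4_da_writepages() hunk above; my_writepages() and my_writepage() are placeholder names, not real kernel symbols, and the per-iteration bookkeeping is elided:

    #include <linux/fs.h>
    #include <linux/writeback.h>
    #include <linux/pagemap.h>

    static int my_writepage(struct page *page, struct writeback_control *wbc,
    			void *data);

    static int my_writepages(struct address_space *mapping,
    			 struct writeback_control *wbc)
    {
    	/* Remember the caller's setting so we can restore it. */
    	int no_update = wbc->no_nrwrite_index_update;
    	int ret = 0;

    	/* Take over accounting: write_cache_pages() now leaves
    	 * wbc->nr_to_write and mapping->writeback_index alone. */
    	wbc->no_nrwrite_index_update = 1;

    	while (!ret && wbc->nr_to_write > 0) {
    		ret = write_cache_pages(mapping, wbc, my_writepage, NULL);
    		/* ... submit whatever was accumulated, then subtract the
    		 * pages actually written from wbc->nr_to_write so the
    		 * loop terminates ... */
    	}

    	if (!no_update)
    		wbc->no_nrwrite_index_update = 0;
    	return ret;
    }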
+7 -5
mm/page-writeback.c
···
  	pgoff_t end;		/* Inclusive */
  	int scanned = 0;
  	int range_whole = 0;
+ 	long nr_to_write = wbc->nr_to_write;

  	if (wbc->nonblocking && bdi_write_congested(bdi)) {
  		wbc->encountered_congestion = 1;
···
  				unlock_page(page);
  				ret = 0;
  			}
- 			if (ret || (--(wbc->nr_to_write) <= 0))
+ 			if (ret || (--nr_to_write <= 0))
  				done = 1;
  			if (wbc->nonblocking && bdi_write_congested(bdi)) {
  				wbc->encountered_congestion = 1;
···
  		index = 0;
  		goto retry;
  	}
- 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- 		mapping->writeback_index = index;
+ 	if (!wbc->no_nrwrite_index_update) {
+ 		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+ 			mapping->writeback_index = index;
+ 		wbc->nr_to_write = nr_to_write;
+ 	}

- 	if (wbc->range_cont)
- 		wbc->range_start = index << PAGE_CACHE_SHIFT;
  	return ret;
  }
  EXPORT_SYMBOL(write_cache_pages);