Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: Remove automatic enabling of the HUGE_FILE feature flag
ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback
ext4: Update Documentation/filesystems/ext4.txt
ext4: Remove unused mount options: nomballoc, mballoc, nocheck
ext4: Remove compile warnings when building w/o CONFIG_PROC_FS
ext4: Add missing newlines to printk messages
ext4: Fix file fragmentation during large file write.
vfs: Add no_nrwrite_index_update writeback control flag
vfs: Remove the range_cont writeback mode.
ext4: Use tag dirty lookup during mpage_da_submit_io
ext4: let the block device know when unused blocks can be discarded
ext4: Don't reuse released data blocks until transaction commits
ext4: Use an rbtree for tracking blocks freed during transaction.
ext4: Do mballoc init before doing filesystem recovery
ext4: Free ext4_prealloc_space using kmem_cache_free
ext4: Fix Kconfig typo for ext4dev
ext4: Remove an old reference to ext4dev in Makefile comment

+322 -338
+15 -17
Documentation/filesystems/ext4.txt
··· 2 Ext4 Filesystem 3 =============== 4 5 - This is a development version of the ext4 filesystem, an advanced level 6 - of the ext3 filesystem which incorporates scalability and reliability 7 - enhancements for supporting large filesystems (64 bit) in keeping with 8 - increasing disk capacities and state-of-the-art feature requirements. 9 10 - Mailing list: linux-ext4@vger.kernel.org 11 12 13 1. Quick usage instructions: 14 =========================== 15 16 - Compile and install the latest version of e2fsprogs (as of this 17 - writing version 1.41) from: 18 19 http://sourceforge.net/project/showfiles.php?group_id=2406 20 ··· 41 42 # mke2fs -t ext4 /dev/hda1 43 44 - Or configure an existing ext3 filesystem to support extents and set 45 - the test_fs flag to indicate that it's ok for an in-development 46 - filesystem to touch this filesystem: 47 48 - # tune2fs -O extents -E test_fs /dev/hda1 49 50 If the filesystem was created with 128 byte inodes, it can be 51 converted to use 256 byte for greater efficiency via: ··· 107 The big performance win will come with mballoc, delalloc and flex_bg 108 grouping of bitmaps and inode tables. Some test results available here: 109 110 - - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html 111 - - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html 112 113 3. Options 114 ========== ··· 217 bsddf (*) Make 'df' act like BSD. 218 minixdf Make 'df' act like Minix. 219 220 - check=none Don't do extra checking of bitmaps on mount. 221 - nocheck 222 - 223 debug Extra debugging information is sent to syslog. 224 225 errors=remount-ro(*) Remount the filesystem read-only on an error. ··· 253 "nobh" option tries to avoid associating buffer 254 heads (supported only for "writeback" mode). 255 256 - mballoc (*) Use the multiple block allocator for block allocation 257 - nomballoc disabled multiple block allocator for block allocation. 258 stripe=n Number of filesystem blocks that mballoc will try 259 to use for allocation size and alignment. For RAID5/6 260 systems this should be the number of data
··· 2 Ext4 Filesystem 3 =============== 4 5 + Ext4 is an advanced level of the ext3 filesystem which incorporates 6 + scalability and reliability enhancements for supporting large filesystems 7 + (64 bit) in keeping with increasing disk capacities and state-of-the-art 8 + feature requirements. 9 10 + Mailing list: linux-ext4@vger.kernel.org 11 + Web site: http://ext4.wiki.kernel.org 12 13 14 1. Quick usage instructions: 15 =========================== 16 17 + Note: More extensive information for getting started with ext4 can be 18 + found at the ext4 wiki site at the URL: 19 + http://ext4.wiki.kernel.org/index.php/Ext4_Howto 20 + 21 - Compile and install the latest version of e2fsprogs (as of this 22 + writing version 1.41.3) from: 23 24 http://sourceforge.net/project/showfiles.php?group_id=2406 25 ··· 36 37 # mke2fs -t ext4 /dev/hda1 38 39 + Or to configure an existing ext3 filesystem to support extents: 40 41 + # tune2fs -O extents /dev/hda1 42 43 If the filesystem was created with 128 byte inodes, it can be 44 converted to use 256 byte for greater efficiency via: ··· 104 The big performance win will come with mballoc, delalloc and flex_bg 105 grouping of bitmaps and inode tables. Some test results available here: 106 107 + - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html 108 + - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html 109 110 3. Options 111 ========== ··· 214 bsddf (*) Make 'df' act like BSD. 215 minixdf Make 'df' act like Minix. 216 217 debug Extra debugging information is sent to syslog. 218 219 errors=remount-ro(*) Remount the filesystem read-only on an error. ··· 253 "nobh" option tries to avoid associating buffer 254 heads (supported only for "writeback" mode). 255 256 stripe=n Number of filesystem blocks that mballoc will try 257 to use for allocation size and alignment. For RAID5/6 258 systems this should be the number of data
+1 -1
fs/Kconfig
··· 160 filesystem initially. 161 162 To compile this file system support as a module, choose M here. The 163 - module will be called ext4dev. 164 165 If unsure, say N. 166
··· 160 filesystem initially. 161 162 To compile this file system support as a module, choose M here. The 163 + module will be called ext4. 164 165 If unsure, say N. 166
+1 -1
fs/Makefile
··· 71 # Do not add any filesystems before this line 72 obj-$(CONFIG_REISERFS_FS) += reiserfs/ 73 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 74 - obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev 75 obj-$(CONFIG_JBD) += jbd/ 76 obj-$(CONFIG_JBD2) += jbd2/ 77 obj-$(CONFIG_EXT2_FS) += ext2/
··· 71 # Do not add any filesystems before this line 72 obj-$(CONFIG_REISERFS_FS) += reiserfs/ 73 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 74 + obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 75 obj-$(CONFIG_JBD) += jbd/ 76 obj-$(CONFIG_JBD2) += jbd2/ 77 obj-$(CONFIG_EXT2_FS) += ext2/
+10 -2
fs/ext4/balloc.c
··· 568 569 /* this isn't the right place to decide whether block is metadata 570 * inode.c/extents.c knows better, but for safety ... */ 571 - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || 572 - ext4_should_journal_data(inode)) 573 metadata = 1; 574 575 sb = inode->i_sb;
··· 568 569 /* this isn't the right place to decide whether block is metadata 570 * inode.c/extents.c knows better, but for safety ... */ 571 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 572 + metadata = 1; 573 + 574 + /* We need to make sure we don't reuse 575 + * blocks released until the transaction commits. 576 + * Writeback mode has weak data consistency so 577 + * don't force data as metadata when freeing blocks 578 + * for writeback mode. 579 + */ 580 + if (metadata == 0 && !ext4_should_writeback_data(inode)) 581 metadata = 1; 582 583 sb = inode->i_sb;
-1
fs/ext4/ext4.h
··· 511 /* 512 * Mount flags 513 */ 514 - #define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ 515 #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ 516 #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 517 #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
··· 511 /* 512 * Mount flags 513 */ 514 #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ 515 #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 516 #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
-3
fs/ext4/ext4_sb.h
··· 99 struct inode *s_buddy_cache; 100 long s_blocks_reserved; 101 spinlock_t s_reserve_lock; 102 - struct list_head s_active_transaction; 103 - struct list_head s_closed_transaction; 104 - struct list_head s_committed_transaction; 105 spinlock_t s_md_lock; 106 tid_t s_last_transaction; 107 unsigned short *s_mb_offsets, *s_mb_maxs;
··· 99 struct inode *s_buddy_cache; 100 long s_blocks_reserved; 101 spinlock_t s_reserve_lock; 102 spinlock_t s_md_lock; 103 tid_t s_last_transaction; 104 unsigned short *s_mb_offsets, *s_mb_maxs;
+75 -66
fs/ext4/inode.c
··· 1648 int ret = 0, err, nr_pages, i; 1649 unsigned long index, end; 1650 struct pagevec pvec; 1651 1652 BUG_ON(mpd->next_page <= mpd->first_page); 1653 pagevec_init(&pvec, 0); ··· 1656 end = mpd->next_page - 1; 1657 1658 while (index <= end) { 1659 - /* XXX: optimize tail */ 1660 - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1661 if (nr_pages == 0) 1662 break; 1663 for (i = 0; i < nr_pages; i++) { 1664 struct page *page = pvec.pages[i]; 1665 1666 - index = page->index; 1667 - if (index > end) 1668 - break; 1669 - index++; 1670 - 1671 err = mapping->a_ops->writepage(page, mpd->wbc); 1672 - if (!err) 1673 mpd->pages_written++; 1674 /* 1675 * In error case, we have to continue because ··· 2115 struct writeback_control *wbc, 2116 struct mpage_da_data *mpd) 2117 { 2118 - long to_write; 2119 int ret; 2120 2121 if (!mpd->get_block) ··· 2129 mpd->pages_written = 0; 2130 mpd->retval = 0; 2131 2132 - to_write = wbc->nr_to_write; 2133 - 2134 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2135 - 2136 /* 2137 * Handle last extent of pages 2138 */ 2139 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2140 if (mpage_da_map_blocks(mpd) == 0) 2141 mpage_da_submit_io(mpd); 2142 - } 2143 2144 - wbc->nr_to_write = to_write - mpd->pages_written; 2145 return ret; 2146 } 2147 ··· 2369 static int ext4_da_writepages(struct address_space *mapping, 2370 struct writeback_control *wbc) 2371 { 2372 handle_t *handle = NULL; 2373 - loff_t range_start = 0; 2374 struct mpage_da_data mpd; 2375 struct inode *inode = mapping->host; 2376 int needed_blocks, ret = 0, nr_to_writebump = 0; 2377 - long to_write, pages_skipped = 0; 2378 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2379 2380 /* ··· 2396 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2397 wbc->nr_to_write = sbi->s_mb_stream_request; 2398 } 2399 2400 - if (!wbc->range_cyclic) 2401 - /* 2402 - * If range_cyclic is not set force range_cont 2403 - * and save the old writeback_index 2404 - */ 2405 - wbc->range_cont = 1; 2406 - 2407 - range_start = wbc->range_start; 2408 - pages_skipped = wbc->pages_skipped; 2409 2410 mpd.wbc = wbc; 2411 mpd.inode = mapping->host; 2412 2413 - restart_loop: 2414 - to_write = wbc->nr_to_write; 2415 - while (!ret && to_write > 0) { 2416 2417 /* 2418 * we insert one extent at a time. 
So we need ··· 2436 dump_stack(); 2437 goto out_writepages; 2438 } 2439 - to_write -= wbc->nr_to_write; 2440 - 2441 mpd.get_block = ext4_da_get_block_write; 2442 ret = mpage_da_writepages(mapping, wbc, &mpd); 2443 2444 ext4_journal_stop(handle); 2445 2446 - if (mpd.retval == -ENOSPC) 2447 jbd2_journal_force_commit_nested(sbi->s_journal); 2448 - 2449 - /* reset the retry count */ 2450 - if (ret == MPAGE_DA_EXTENT_TAIL) { 2451 /* 2452 * got one extent now try with 2453 * rest of the pages 2454 */ 2455 - to_write += wbc->nr_to_write; 2456 ret = 0; 2457 - } else if (wbc->nr_to_write) { 2458 /* 2459 * There is no more writeout needed 2460 * or we requested for a noblocking writeout 2461 * and we found the device congested 2462 */ 2463 - to_write += wbc->nr_to_write; 2464 break; 2465 - } 2466 - wbc->nr_to_write = to_write; 2467 } 2468 2469 - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { 2470 - /* We skipped pages in this loop */ 2471 - wbc->range_start = range_start; 2472 - wbc->nr_to_write = to_write + 2473 - wbc->pages_skipped - pages_skipped; 2474 - wbc->pages_skipped = pages_skipped; 2475 - goto restart_loop; 2476 - } 2477 2478 out_writepages: 2479 - wbc->nr_to_write = to_write - nr_to_writebump; 2480 - wbc->range_start = range_start; 2481 return ret; 2482 } 2483 ··· 4194 struct inode *inode = &(ei->vfs_inode); 4195 u64 i_blocks = inode->i_blocks; 4196 struct super_block *sb = inode->i_sb; 4197 - int err = 0; 4198 4199 if (i_blocks <= ~0U) { 4200 /* ··· 4203 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4204 raw_inode->i_blocks_high = 0; 4205 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4206 - } else if (i_blocks <= 0xffffffffffffULL) { 4207 /* 4208 * i_blocks can be represented in a 48 bit variable 4209 * as multiple of 512 bytes 4210 */ 4211 - err = ext4_update_rocompat_feature(handle, sb, 4212 - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); 4213 - if (err) 4214 - goto err_out; 4215 - /* i_block is stored in the split 48 bit fields */ 4216 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4217 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4218 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4219 } else { 4220 - /* 4221 - * i_blocks should be represented in a 48 bit variable 4222 - * as multiple of file system block size 4223 - */ 4224 - err = ext4_update_rocompat_feature(handle, sb, 4225 - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); 4226 - if (err) 4227 - goto err_out; 4228 ei->i_flags |= EXT4_HUGE_FILE_FL; 4229 /* i_block is stored in file system block size */ 4230 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4231 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4232 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4233 } 4234 - err_out: 4235 - return err; 4236 } 4237 4238 /*
··· 1648 int ret = 0, err, nr_pages, i; 1649 unsigned long index, end; 1650 struct pagevec pvec; 1651 + long pages_skipped; 1652 1653 BUG_ON(mpd->next_page <= mpd->first_page); 1654 pagevec_init(&pvec, 0); ··· 1655 end = mpd->next_page - 1; 1656 1657 while (index <= end) { 1658 + /* 1659 + * We can use PAGECACHE_TAG_DIRTY lookup here because 1660 + * even though we have cleared the dirty flag on the page 1661 + * We still keep the page in the radix tree with tag 1662 + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. 1663 + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback 1664 + * which is called via the below writepage callback. 1665 + */ 1666 + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1667 + PAGECACHE_TAG_DIRTY, 1668 + min(end - index, 1669 + (pgoff_t)PAGEVEC_SIZE-1) + 1); 1670 if (nr_pages == 0) 1671 break; 1672 for (i = 0; i < nr_pages; i++) { 1673 struct page *page = pvec.pages[i]; 1674 1675 + pages_skipped = mpd->wbc->pages_skipped; 1676 err = mapping->a_ops->writepage(page, mpd->wbc); 1677 + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1678 + /* 1679 + * have successfully written the page 1680 + * without skipping the same 1681 + */ 1682 mpd->pages_written++; 1683 /* 1684 * In error case, we have to continue because ··· 2104 struct writeback_control *wbc, 2105 struct mpage_da_data *mpd) 2106 { 2107 int ret; 2108 2109 if (!mpd->get_block) ··· 2119 mpd->pages_written = 0; 2120 mpd->retval = 0; 2121 2122 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2123 /* 2124 * Handle last extent of pages 2125 */ 2126 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2127 if (mpage_da_map_blocks(mpd) == 0) 2128 mpage_da_submit_io(mpd); 2129 2130 + mpd->io_done = 1; 2131 + ret = MPAGE_DA_EXTENT_TAIL; 2132 + } 2133 + wbc->nr_to_write -= mpd->pages_written; 2134 return ret; 2135 } 2136 ··· 2360 static int ext4_da_writepages(struct address_space *mapping, 2361 struct writeback_control *wbc) 2362 { 2363 + pgoff_t index; 2364 + int range_whole = 0; 2365 handle_t *handle = NULL; 2366 struct mpage_da_data mpd; 2367 struct inode *inode = mapping->host; 2368 + int no_nrwrite_index_update; 2369 + long pages_written = 0, pages_skipped; 2370 int needed_blocks, ret = 0, nr_to_writebump = 0; 2371 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2372 2373 /* ··· 2385 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2386 wbc->nr_to_write = sbi->s_mb_stream_request; 2387 } 2388 + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2389 + range_whole = 1; 2390 2391 + if (wbc->range_cyclic) 2392 + index = mapping->writeback_index; 2393 + else 2394 + index = wbc->range_start >> PAGE_CACHE_SHIFT; 2395 2396 mpd.wbc = wbc; 2397 mpd.inode = mapping->host; 2398 2399 + /* 2400 + * we don't want write_cache_pages to update 2401 + * nr_to_write and writeback_index 2402 + */ 2403 + no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2404 + wbc->no_nrwrite_index_update = 1; 2405 + pages_skipped = wbc->pages_skipped; 2406 + 2407 + while (!ret && wbc->nr_to_write > 0) { 2408 2409 /* 2410 * we insert one extent at a time. 
So we need ··· 2422 dump_stack(); 2423 goto out_writepages; 2424 } 2425 mpd.get_block = ext4_da_get_block_write; 2426 ret = mpage_da_writepages(mapping, wbc, &mpd); 2427 2428 ext4_journal_stop(handle); 2429 2430 + if (mpd.retval == -ENOSPC) { 2431 + /* commit the transaction which would 2432 + * free blocks released in the transaction 2433 + * and try again 2434 + */ 2435 jbd2_journal_force_commit_nested(sbi->s_journal); 2436 + wbc->pages_skipped = pages_skipped; 2437 + ret = 0; 2438 + } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2439 /* 2440 * got one extent now try with 2441 * rest of the pages 2442 */ 2443 + pages_written += mpd.pages_written; 2444 + wbc->pages_skipped = pages_skipped; 2445 ret = 0; 2446 + } else if (wbc->nr_to_write) 2447 /* 2448 * There is no more writeout needed 2449 * or we requested for a noblocking writeout 2450 * and we found the device congested 2451 */ 2452 break; 2453 } 2454 + if (pages_skipped != wbc->pages_skipped) 2455 + printk(KERN_EMERG "This should not happen leaving %s " 2456 + "with nr_to_write = %ld ret = %d\n", 2457 + __func__, wbc->nr_to_write, ret); 2458 2459 + /* Update index */ 2460 + index += pages_written; 2461 + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2462 + /* 2463 + * set the writeback_index so that range_cyclic 2464 + * mode will write it back later 2465 + */ 2466 + mapping->writeback_index = index; 2467 2468 out_writepages: 2469 + if (!no_nrwrite_index_update) 2470 + wbc->no_nrwrite_index_update = 0; 2471 + wbc->nr_to_write -= nr_to_writebump; 2472 return ret; 2473 } 2474 ··· 4175 struct inode *inode = &(ei->vfs_inode); 4176 u64 i_blocks = inode->i_blocks; 4177 struct super_block *sb = inode->i_sb; 4178 4179 if (i_blocks <= ~0U) { 4180 /* ··· 4185 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4186 raw_inode->i_blocks_high = 0; 4187 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4188 + return 0; 4189 + } 4190 + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4191 + return -EFBIG; 4192 + 4193 + if (i_blocks <= 0xffffffffffffULL) { 4194 /* 4195 * i_blocks can be represented in a 48 bit variable 4196 * as multiple of 512 bytes 4197 */ 4198 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4199 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4200 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4201 } else { 4202 ei->i_flags |= EXT4_HUGE_FILE_FL; 4203 /* i_block is stored in file system block size */ 4204 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4205 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4206 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4207 } 4208 + return 0; 4209 } 4210 4211 /*
+138 -131
fs/ext4/mballoc.c
··· 2300 } 2301 2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2303 2304 #ifdef DOUBLE_CHECK 2305 { ··· 2523 } 2524 2525 spin_lock_init(&sbi->s_md_lock); 2526 - INIT_LIST_HEAD(&sbi->s_active_transaction); 2527 - INIT_LIST_HEAD(&sbi->s_closed_transaction); 2528 - INIT_LIST_HEAD(&sbi->s_committed_transaction); 2529 spin_lock_init(&sbi->s_bal_lock); 2530 2531 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; ··· 2551 ext4_mb_init_per_dev_proc(sb); 2552 ext4_mb_history_init(sb); 2553 2554 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2555 return 0; 2556 } ··· 2568 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2569 list_del(&pa->pa_group_list); 2570 count++; 2571 - kfree(pa); 2572 } 2573 if (count) 2574 mb_debug("mballoc: %u PAs left\n", count); ··· 2581 int num_meta_group_infos; 2582 struct ext4_group_info *grinfo; 2583 struct ext4_sb_info *sbi = EXT4_SB(sb); 2584 - 2585 - /* release freed, non-committed blocks */ 2586 - spin_lock(&sbi->s_md_lock); 2587 - list_splice_init(&sbi->s_closed_transaction, 2588 - &sbi->s_committed_transaction); 2589 - list_splice_init(&sbi->s_active_transaction, 2590 - &sbi->s_committed_transaction); 2591 - spin_unlock(&sbi->s_md_lock); 2592 - ext4_mb_free_committed_blocks(sb); 2593 2594 if (sbi->s_group_info) { 2595 for (i = 0; i < sbi->s_groups_count; i++) { ··· 2635 return 0; 2636 } 2637 2638 - static noinline_for_stack void 2639 - ext4_mb_free_committed_blocks(struct super_block *sb) 2640 { 2641 - struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 - int err; 2643 - int i; 2644 - int count = 0; 2645 - int count2 = 0; 2646 - struct ext4_free_metadata *md; 2647 struct ext4_buddy e4b; 2648 2649 - if (list_empty(&sbi->s_committed_transaction)) 2650 - return; 2651 - 2652 - /* there is committed blocks to be freed yet */ 2653 - do { 2654 - /* get next array of blocks */ 2655 - md = NULL; 2656 - spin_lock(&sbi->s_md_lock); 2657 - if (!list_empty(&sbi->s_committed_transaction)) { 2658 - md = list_entry(sbi->s_committed_transaction.next, 2659 - struct ext4_free_metadata, list); 2660 - list_del(&md->list); 2661 - } 2662 - spin_unlock(&sbi->s_md_lock); 2663 - 2664 - if (md == NULL) 2665 - break; 2666 2667 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2668 - md->num, md->group, md); 2669 2670 - err = ext4_mb_load_buddy(sb, md->group, &e4b); 2671 /* we expect to find existing buddy because it's pinned */ 2672 BUG_ON(err != 0); 2673 2674 /* there are blocks to put in buddy to make them really free */ 2675 - count += md->num; 2676 count2++; 2677 - ext4_lock_group(sb, md->group); 2678 - for (i = 0; i < md->num; i++) { 2679 - mb_debug(" %u", md->blocks[i]); 2680 - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2681 } 2682 - mb_debug("\n"); 2683 - ext4_unlock_group(sb, md->group); 2684 2685 - /* balance refcounts from ext4_mb_free_metadata() */ 2686 - page_cache_release(e4b.bd_buddy_page); 2687 - page_cache_release(e4b.bd_bitmap_page); 2688 - 2689 - kfree(md); 2690 ext4_mb_release_desc(&e4b); 2691 - 2692 - } while (md); 2693 2694 mb_debug("freed %u blocks in %u structures\n", count, count2); 2695 } ··· 2699 2700 static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2701 { 2702 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2703 struct ext4_sb_info *sbi = EXT4_SB(sb); 2704 struct proc_dir_entry *proc; ··· 2723 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2724 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2725 return -ENOMEM; 2726 } 2727 2728 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2729 { 2730 struct 
ext4_sb_info *sbi = EXT4_SB(sb); 2731 2732 if (sbi->s_proc == NULL) ··· 2742 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); 2743 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2744 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2745 - 2746 return 0; 2747 } 2748 ··· 2763 kmem_cache_destroy(ext4_pspace_cachep); 2764 return -ENOMEM; 2765 } 2766 return 0; 2767 } 2768 ··· 2781 /* XXX: synchronize_rcu(); */ 2782 kmem_cache_destroy(ext4_pspace_cachep); 2783 kmem_cache_destroy(ext4_ac_cachep); 2784 } 2785 2786 ··· 4327 goto out1; 4328 } 4329 4330 - ext4_mb_poll_new_transaction(sb, handle); 4331 - 4332 *errp = ext4_mb_initialize_context(ac, ar); 4333 if (*errp) { 4334 ar->len = 0; ··· 4385 4386 return block; 4387 } 4388 - static void ext4_mb_poll_new_transaction(struct super_block *sb, 4389 - handle_t *handle) 4390 { 4391 - struct ext4_sb_info *sbi = EXT4_SB(sb); 4392 - 4393 - if (sbi->s_last_transaction == handle->h_transaction->t_tid) 4394 - return; 4395 - 4396 - /* new transaction! time to close last one and free blocks for 4397 - * committed transaction. we know that only transaction can be 4398 - * active, so previos transaction can be being logged and we 4399 - * know that transaction before previous is known to be already 4400 - * logged. this means that now we may free blocks freed in all 4401 - * transactions before previous one. hope I'm clear enough ... */ 4402 - 4403 - spin_lock(&sbi->s_md_lock); 4404 - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { 4405 - mb_debug("new transaction %lu, old %lu\n", 4406 - (unsigned long) handle->h_transaction->t_tid, 4407 - (unsigned long) sbi->s_last_transaction); 4408 - list_splice_init(&sbi->s_closed_transaction, 4409 - &sbi->s_committed_transaction); 4410 - list_splice_init(&sbi->s_active_transaction, 4411 - &sbi->s_closed_transaction); 4412 - sbi->s_last_transaction = handle->h_transaction->t_tid; 4413 - } 4414 - spin_unlock(&sbi->s_md_lock); 4415 - 4416 - ext4_mb_free_committed_blocks(sb); 4417 } 4418 4419 static noinline_for_stack int ··· 4408 struct ext4_group_info *db = e4b->bd_info; 4409 struct super_block *sb = e4b->bd_sb; 4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4411 - struct ext4_free_metadata *md; 4412 - int i; 4413 4414 BUG_ON(e4b->bd_bitmap_page == NULL); 4415 BUG_ON(e4b->bd_buddy_page == NULL); 4416 4417 ext4_lock_group(sb, group); 4418 - for (i = 0; i < count; i++) { 4419 - md = db->bb_md_cur; 4420 - if (md && db->bb_tid != handle->h_transaction->t_tid) { 4421 - db->bb_md_cur = NULL; 4422 - md = NULL; 4423 - } 4424 - 4425 - if (md == NULL) { 4426 - ext4_unlock_group(sb, group); 4427 - md = kmalloc(sizeof(*md), GFP_NOFS); 4428 - if (md == NULL) 4429 - return -ENOMEM; 4430 - md->num = 0; 4431 - md->group = group; 4432 - 4433 - ext4_lock_group(sb, group); 4434 - if (db->bb_md_cur == NULL) { 4435 - spin_lock(&sbi->s_md_lock); 4436 - list_add(&md->list, &sbi->s_active_transaction); 4437 - spin_unlock(&sbi->s_md_lock); 4438 - /* protect buddy cache from being freed, 4439 - * otherwise we'll refresh it from 4440 - * on-disk bitmap and lose not-yet-available 4441 - * blocks */ 4442 - page_cache_get(e4b->bd_buddy_page); 4443 - page_cache_get(e4b->bd_bitmap_page); 4444 - db->bb_md_cur = md; 4445 - db->bb_tid = handle->h_transaction->t_tid; 4446 - mb_debug("new md 0x%p for group %lu\n", 4447 - md, md->group); 4448 - } else { 4449 - kfree(md); 4450 - md = db->bb_md_cur; 4451 - } 4452 - } 4453 - 4454 - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); 4455 - md->blocks[md->num] = block + i; 4456 - md->num++; 4457 
- if (md->num == EXT4_BB_MAX_BLOCKS) { 4458 - /* no more space, put full container on a sb's list */ 4459 - db->bb_md_cur = NULL; 4460 } 4461 } 4462 ext4_unlock_group(sb, group); 4463 return 0; 4464 } ··· 4508 int ret; 4509 4510 *freed = 0; 4511 - 4512 - ext4_mb_poll_new_transaction(sb, handle); 4513 4514 sbi = EXT4_SB(sb); 4515 es = EXT4_SB(sb)->s_es;
··· 2300 } 2301 2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2303 + meta_group_info[i]->bb_free_root.rb_node = NULL; 2304 2305 #ifdef DOUBLE_CHECK 2306 { ··· 2522 } 2523 2524 spin_lock_init(&sbi->s_md_lock); 2525 spin_lock_init(&sbi->s_bal_lock); 2526 2527 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; ··· 2553 ext4_mb_init_per_dev_proc(sb); 2554 ext4_mb_history_init(sb); 2555 2556 + sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2557 + 2558 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2559 return 0; 2560 } ··· 2568 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2569 list_del(&pa->pa_group_list); 2570 count++; 2571 + kmem_cache_free(ext4_pspace_cachep, pa); 2572 } 2573 if (count) 2574 mb_debug("mballoc: %u PAs left\n", count); ··· 2581 int num_meta_group_infos; 2582 struct ext4_group_info *grinfo; 2583 struct ext4_sb_info *sbi = EXT4_SB(sb); 2584 2585 if (sbi->s_group_info) { 2586 for (i = 0; i < sbi->s_groups_count; i++) { ··· 2644 return 0; 2645 } 2646 2647 + /* 2648 + * This function is called by the jbd2 layer once the commit has finished, 2649 + * so we know we can free the blocks that were released with that commit. 2650 + */ 2651 + static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2652 { 2653 + struct super_block *sb = journal->j_private; 2654 struct ext4_buddy e4b; 2655 + struct ext4_group_info *db; 2656 + int err, count = 0, count2 = 0; 2657 + struct ext4_free_data *entry; 2658 + ext4_fsblk_t discard_block; 2659 + struct list_head *l, *ltmp; 2660 2661 + list_for_each_safe(l, ltmp, &txn->t_private_list) { 2662 + entry = list_entry(l, struct ext4_free_data, list); 2663 2664 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2665 + entry->count, entry->group, entry); 2666 2667 + err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2668 /* we expect to find existing buddy because it's pinned */ 2669 BUG_ON(err != 0); 2670 2671 + db = e4b.bd_info; 2672 /* there are blocks to put in buddy to make them really free */ 2673 + count += entry->count; 2674 count2++; 2675 + ext4_lock_group(sb, entry->group); 2676 + /* Take it out of per group rb tree */ 2677 + rb_erase(&entry->node, &(db->bb_free_root)); 2678 + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2679 + 2680 + if (!db->bb_free_root.rb_node) { 2681 + /* No more items in the per group rb tree 2682 + * balance refcounts from ext4_mb_free_metadata() 2683 + */ 2684 + page_cache_release(e4b.bd_buddy_page); 2685 + page_cache_release(e4b.bd_bitmap_page); 2686 } 2687 + ext4_unlock_group(sb, entry->group); 2688 + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2689 + + entry->start_blk 2690 + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2691 + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2692 + (unsigned long long) discard_block, entry->count); 2693 + sb_issue_discard(sb, discard_block, entry->count); 2694 2695 + kmem_cache_free(ext4_free_ext_cachep, entry); 2696 ext4_mb_release_desc(&e4b); 2697 + } 2698 2699 mb_debug("freed %u blocks in %u structures\n", count, count2); 2700 } ··· 2712 2713 static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2714 { 2715 + #ifdef CONFIG_PROC_FS 2716 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2717 struct ext4_sb_info *sbi = EXT4_SB(sb); 2718 struct proc_dir_entry *proc; ··· 2735 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2736 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2737 return -ENOMEM; 2738 + #else 2739 + return 0; 2740 + #endif
2741 } 2742 2743 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2744 { 2745 + #ifdef CONFIG_PROC_FS 2746 struct ext4_sb_info *sbi = EXT4_SB(sb); 2747 2748 if (sbi->s_proc == NULL) ··· 2750 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); 2751 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2752 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2753 + #endif 2754 return 0; 2755 } 2756 ··· 2771 kmem_cache_destroy(ext4_pspace_cachep); 2772 return -ENOMEM; 2773 } 2774 + 2775 + ext4_free_ext_cachep = 2776 + kmem_cache_create("ext4_free_block_extents", 2777 + sizeof(struct ext4_free_data), 2778 + 0, SLAB_RECLAIM_ACCOUNT, NULL); 2779 + if (ext4_free_ext_cachep == NULL) { 2780 + kmem_cache_destroy(ext4_pspace_cachep); 2781 + kmem_cache_destroy(ext4_ac_cachep); 2782 + return -ENOMEM; 2783 + } 2784 return 0; 2785 } 2786 ··· 2779 /* XXX: synchronize_rcu(); */ 2780 kmem_cache_destroy(ext4_pspace_cachep); 2781 kmem_cache_destroy(ext4_ac_cachep); 2782 + kmem_cache_destroy(ext4_free_ext_cachep); 2783 } 2784 2785 ··· 4324 goto out1; 4325 } 4326 4327 *errp = ext4_mb_initialize_context(ac, ar); 4328 if (*errp) { 4329 ar->len = 0; ··· 4384 4385 return block; 4386 } 4387 + 4388 + /* 4389 + * We can merge two free data extents only if the physical blocks 4390 + * are contiguous, AND the extents were freed by the same transaction, 4391 + * AND the blocks are associated with the same group. 4392 + */ 4393 + static int can_merge(struct ext4_free_data *entry1, 4394 + struct ext4_free_data *entry2) 4395 { 4396 + if ((entry1->t_tid == entry2->t_tid) && 4397 + (entry1->group == entry2->group) && 4398 + ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4399 + return 1; 4400 + return 0; 4401 } 4402 4403 static noinline_for_stack int ··· 4422 struct ext4_group_info *db = e4b->bd_info; 4423 struct super_block *sb = e4b->bd_sb; 4424 struct ext4_sb_info *sbi = EXT4_SB(sb); 4425 + struct ext4_free_data *entry, *new_entry; 4426 + struct rb_node **n = &db->bb_free_root.rb_node, *node; 4427 + struct rb_node *parent = NULL, *new_node; 4428 + 4429 4430 BUG_ON(e4b->bd_bitmap_page == NULL); 4431 BUG_ON(e4b->bd_buddy_page == NULL); 4432 4433 + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4434 + new_entry->start_blk = block; 4435 + new_entry->group = group; 4436 + new_entry->count = count; 4437 + new_entry->t_tid = handle->h_transaction->t_tid; 4438 + new_node = &new_entry->node; 4439 + 4440 ext4_lock_group(sb, group); 4441 + if (!*n) { 4442 + /* first free block extent. We need to
4443 + * protect buddy cache from being freed, 4444 + * otherwise we'll refresh it from 4445 + * on-disk bitmap and lose not-yet-available 4446 + * blocks */ 4447 + page_cache_get(e4b->bd_buddy_page); 4448 + page_cache_get(e4b->bd_bitmap_page); 4449 + } 4450 + while (*n) { 4451 + parent = *n; 4452 + entry = rb_entry(parent, struct ext4_free_data, node); 4453 + if (block < entry->start_blk) 4454 + n = &(*n)->rb_left; 4455 + else if (block >= (entry->start_blk + entry->count)) 4456 + n = &(*n)->rb_right; 4457 + else { 4458 + ext4_error(sb, __func__, 4459 + "Double free of blocks %d (%d %d)\n", 4460 + block, entry->start_blk, entry->count); 4461 + return 0; 4462 } 4463 } 4464 + 4465 + rb_link_node(new_node, parent, n); 4466 + rb_insert_color(new_node, &db->bb_free_root); 4467 + 4468 + /* Now try to see if the extent can be merged to the left and right */ 4469 + node = rb_prev(new_node); 4470 + if (node) { 4471 + entry = rb_entry(node, struct ext4_free_data, node); 4472 + if (can_merge(entry, new_entry)) { 4473 + new_entry->start_blk = entry->start_blk; 4474 + new_entry->count += entry->count; 4475 + rb_erase(node, &(db->bb_free_root)); 4476 + spin_lock(&sbi->s_md_lock); 4477 + list_del(&entry->list); 4478 + spin_unlock(&sbi->s_md_lock); 4479 + kmem_cache_free(ext4_free_ext_cachep, entry); 4480 + } 4481 + } 4482 + 4483 + node = rb_next(new_node); 4484 + if (node) { 4485 + entry = rb_entry(node, struct ext4_free_data, node); 4486 + if (can_merge(new_entry, entry)) { 4487 + new_entry->count += entry->count; 4488 + rb_erase(node, &(db->bb_free_root)); 4489 + spin_lock(&sbi->s_md_lock); 4490 + list_del(&entry->list); 4491 + spin_unlock(&sbi->s_md_lock); 4492 + kmem_cache_free(ext4_free_ext_cachep, entry); 4493 + } 4494 + } 4495 + /* Add the extent to the transaction's private list */ 4496 + spin_lock(&sbi->s_md_lock); 4497 + list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4498 + spin_unlock(&sbi->s_md_lock); 4499 ext4_unlock_group(sb, group); 4500 return 0; 4501 } ··· 4499 int ret; 4500 4501 *freed = 0; 4502 4503 sbi = EXT4_SB(sb); 4504 es = EXT4_SB(sb)->s_es;
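The per-group bb_free_root above is a standard Linux rbtree keyed by starting block. A minimal sketch of the ordered-insert pattern it relies on, using a hypothetical struct extent / insert_extent rather than the actual ext4_free_data code (illustration only, not part of the patch):

    #include <linux/rbtree.h>

    struct extent {
            struct rb_node node;
            unsigned long start;    /* key: first block of the extent */
            unsigned long count;
    };

    static int insert_extent(struct rb_root *root, struct extent *new)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct extent *e = rb_entry(*n, struct extent, node);

                    parent = *n;
                    if (new->start < e->start)
                            n = &(*n)->rb_left;
                    else if (new->start >= e->start + e->count)
                            n = &(*n)->rb_right;
                    else
                            return -EINVAL;     /* overlap: a double free */
            }
            /* splice in the new node and rebalance */
            rb_link_node(&new->node, parent, n);
            rb_insert_color(&new->node, root);
            return 0;
    }

After the insert, the patch additionally checks the rb_prev()/rb_next() neighbours with can_merge(), so physically contiguous extents freed in the same transaction collapse into a single node.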
+19 -12
fs/ext4/mballoc.h
··· 18 #include <linux/pagemap.h> 19 #include <linux/seq_file.h> 20 #include <linux/version.h> 21 #include "ext4_jbd2.h" 22 #include "ext4.h" 23 #include "group.h" ··· 100 101 static struct kmem_cache *ext4_pspace_cachep; 102 static struct kmem_cache *ext4_ac_cachep; 103 104 - #ifdef EXT4_BB_MAX_BLOCKS 105 - #undef EXT4_BB_MAX_BLOCKS 106 - #endif 107 - #define EXT4_BB_MAX_BLOCKS 30 108 109 - struct ext4_free_metadata { 110 - ext4_group_t group; 111 - unsigned short num; 112 - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; 113 struct list_head list; 114 }; 115 116 struct ext4_group_info { 117 unsigned long bb_state; 118 - unsigned long bb_tid; 119 - struct ext4_free_metadata *bb_md_cur; 120 unsigned short bb_first_free; 121 unsigned short bb_free; 122 unsigned short bb_fragments; ··· 269 270 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 271 ext4_group_t group); 272 - static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); 273 - static void ext4_mb_free_committed_blocks(struct super_block *); 274 static void ext4_mb_return_to_preallocation(struct inode *inode, 275 struct ext4_buddy *e4b, sector_t block, 276 int count); ··· 276 struct super_block *, struct ext4_prealloc_space *pa); 277 static int ext4_mb_init_per_dev_proc(struct super_block *sb); 278 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); 279 280 281 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
··· 18 #include <linux/pagemap.h> 19 #include <linux/seq_file.h> 20 #include <linux/version.h> 21 + #include <linux/blkdev.h> 22 + #include <linux/marker.h> 23 #include "ext4_jbd2.h" 24 #include "ext4.h" 25 #include "group.h" ··· 98 99 static struct kmem_cache *ext4_pspace_cachep; 100 static struct kmem_cache *ext4_ac_cachep; 101 + static struct kmem_cache *ext4_free_ext_cachep; 102 103 + struct ext4_free_data { 104 + /* this links the free block information from group_info */ 105 + struct rb_node node; 106 107 + /* this links the free block information from ext4_sb_info */ 108 struct list_head list; 109 + 110 + /* group to which the free block extent belongs */ 111 + ext4_group_t group; 112 + 113 + /* free block extent */ 114 + ext4_grpblk_t start_blk; 115 + ext4_grpblk_t count; 116 + 117 + /* transaction which freed this extent */ 118 + tid_t t_tid; 119 }; 120 121 struct ext4_group_info { 122 unsigned long bb_state; 123 + struct rb_root bb_free_root; 124 unsigned short bb_first_free; 125 unsigned short bb_free; 126 unsigned short bb_fragments; ··· 261 262 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 263 ext4_group_t group); 264 static void ext4_mb_return_to_preallocation(struct inode *inode, 265 struct ext4_buddy *e4b, sector_t block, 266 int count); ··· 270 struct super_block *, struct ext4_prealloc_space *pa); 271 static int ext4_mb_init_per_dev_proc(struct super_block *sb); 272 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); 273 + static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 274 275 276 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+34 -98
fs/ext4/super.c
··· 374 */ 375 } 376 377 - int ext4_update_compat_feature(handle_t *handle, 378 - struct super_block *sb, __u32 compat) 379 - { 380 - int err = 0; 381 - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { 382 - err = ext4_journal_get_write_access(handle, 383 - EXT4_SB(sb)->s_sbh); 384 - if (err) 385 - return err; 386 - EXT4_SET_COMPAT_FEATURE(sb, compat); 387 - sb->s_dirt = 1; 388 - handle->h_sync = 1; 389 - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, 390 - "call ext4_journal_dirty_metadata"); 391 - err = ext4_journal_dirty_metadata(handle, 392 - EXT4_SB(sb)->s_sbh); 393 - } 394 - return err; 395 - } 396 - 397 - int ext4_update_rocompat_feature(handle_t *handle, 398 - struct super_block *sb, __u32 rocompat) 399 - { 400 - int err = 0; 401 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { 402 - err = ext4_journal_get_write_access(handle, 403 - EXT4_SB(sb)->s_sbh); 404 - if (err) 405 - return err; 406 - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); 407 - sb->s_dirt = 1; 408 - handle->h_sync = 1; 409 - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, 410 - "call ext4_journal_dirty_metadata"); 411 - err = ext4_journal_dirty_metadata(handle, 412 - EXT4_SB(sb)->s_sbh); 413 - } 414 - return err; 415 - } 416 - 417 - int ext4_update_incompat_feature(handle_t *handle, 418 - struct super_block *sb, __u32 incompat) 419 - { 420 - int err = 0; 421 - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { 422 - err = ext4_journal_get_write_access(handle, 423 - EXT4_SB(sb)->s_sbh); 424 - if (err) 425 - return err; 426 - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); 427 - sb->s_dirt = 1; 428 - handle->h_sync = 1; 429 - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, 430 - "call ext4_journal_dirty_metadata"); 431 - err = ext4_journal_dirty_metadata(handle, 432 - EXT4_SB(sb)->s_sbh); 433 - } 434 - return err; 435 - } 436 - 437 /* 438 * Open the external journal device 439 */ ··· 844 enum { 845 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 846 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 847 - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, 848 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 849 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 850 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, ··· 855 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 856 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 857 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 858 - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, 859 Opt_inode_readahead_blks 860 }; 861 ··· 873 {Opt_err_panic, "errors=panic"}, 874 {Opt_err_ro, "errors=remount-ro"}, 875 {Opt_nouid32, "nouid32"}, 876 - {Opt_nocheck, "nocheck"}, 877 - {Opt_nocheck, "check=none"}, 878 {Opt_debug, "debug"}, 879 {Opt_oldalloc, "oldalloc"}, 880 {Opt_orlov, "orlov"}, ··· 911 {Opt_extents, "extents"}, 912 {Opt_noextents, "noextents"}, 913 {Opt_i_version, "i_version"}, 914 - {Opt_mballoc, "mballoc"}, 915 - {Opt_nomballoc, "nomballoc"}, 916 {Opt_stripe, "stripe=%u"}, 917 {Opt_resize, "resize"}, 918 {Opt_delalloc, "delalloc"}, ··· 1008 break; 1009 case Opt_nouid32: 1010 set_opt(sbi->s_mount_opt, NO_UID32); 1011 - break; 1012 - case Opt_nocheck: 1013 - clear_opt(sbi->s_mount_opt, CHECK); 1014 break; 1015 case Opt_debug: 1016 set_opt(sbi->s_mount_opt, DEBUG); ··· 1551 if (block_bitmap < first_block || block_bitmap > last_block) { 1552 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1553 "Block bitmap for group %lu not in group " 1554 - "(block %llu)!", i, block_bitmap); 1555 return 0; 1556 } 1557
inode_bitmap = ext4_inode_bitmap(sb, gdp); 1558 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1559 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1560 "Inode bitmap for group %lu not in group " 1561 - "(block %llu)!", i, inode_bitmap); 1562 return 0; 1563 } 1564 inode_table = ext4_inode_table(sb, gdp); ··· 1566 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1567 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1568 "Inode table for group %lu not in group " 1569 - "(block %llu)!", i, inode_table); 1570 return 0; 1571 } 1572 spin_lock(sb_bgl_lock(sbi, i)); ··· 1711 * 1712 * Note, this does *not* consider any metadata overhead for vfs i_blocks. 1713 */ 1714 - static loff_t ext4_max_size(int blkbits) 1715 { 1716 loff_t res; 1717 loff_t upper_limit = MAX_LFS_FILESIZE; 1718 1719 /* small i_blocks in vfs inode? */ 1720 - if (sizeof(blkcnt_t) < sizeof(u64)) { 1721 /* 1722 * CONFIG_LSF is not enabled implies the inode 1723 * i_block represent total blocks in 512 bytes ··· 1747 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. 1748 * We need to be 1 filesystem block less than the 2^48 sector limit. 1749 */ 1750 - static loff_t ext4_max_bitmap_size(int bits) 1751 { 1752 loff_t res = EXT4_NDIR_BLOCKS; 1753 int meta_blocks; ··· 1760 * total number of 512 bytes blocks of the file 1761 */ 1762 1763 - if (sizeof(blkcnt_t) < sizeof(u64)) { 1764 /* 1765 - * CONFIG_LSF is not enabled implies the inode 1766 - * i_block represent total blocks in 512 bytes 1767 - * 32 == size of vfs inode i_blocks * 8 1768 */ 1769 upper_limit = (1LL << 32) - 1; 1770 ··· 1873 int blocksize; 1874 int db_count; 1875 int i; 1876 - int needs_recovery; 1877 __le32 features; 1878 __u64 blocks_count; 1879 int err; ··· 2014 sb->s_id, le32_to_cpu(features)); 2015 goto failed_mount; 2016 } 2017 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 2018 /* 2019 * Large file size enabled file system can only be 2020 * mount if kernel is build with CONFIG_LSF ··· 2066 } 2067 } 2068 2069 - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); 2070 - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); 2071 2072 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 2073 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; ··· 2392 "available.\n"); 2393 } 2394 2395 /* 2396 * akpm: core read_super() calls in here with the superblock locked. 2397 * That deadlocks, because orphan cleanup needs to lock the superblock ··· 2425 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2426 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2427 "writeback"); 2428 - 2429 - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2430 - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " 2431 - "requested data journaling mode\n"); 2432 - clear_opt(sbi->s_mount_opt, DELALLOC); 2433 - } else if (test_opt(sb, DELALLOC)) 2434 - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2435 - 2436 - ext4_ext_init(sb); 2437 - err = ext4_mb_init(sb, needs_recovery); 2438 - if (err) { 2439 - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", 2440 - err); 2441 - goto failed_mount4; 2442 - } 2443 2444 lock_kernel(); 2445 return 0;
··· 374 */ 375 } 376 377 /* 378 * Open the external journal device 379 */ ··· 904 enum { 905 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 906 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 907 + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 908 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 909 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 910 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, ··· 915 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 916 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 917 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 918 + Opt_stripe, Opt_delalloc, Opt_nodelalloc, 919 Opt_inode_readahead_blks 920 }; 921 ··· 933 {Opt_err_panic, "errors=panic"}, 934 {Opt_err_ro, "errors=remount-ro"}, 935 {Opt_nouid32, "nouid32"}, 936 {Opt_debug, "debug"}, 937 {Opt_oldalloc, "oldalloc"}, 938 {Opt_orlov, "orlov"}, ··· 973 {Opt_extents, "extents"}, 974 {Opt_noextents, "noextents"}, 975 {Opt_i_version, "i_version"}, 976 {Opt_stripe, "stripe=%u"}, 977 {Opt_resize, "resize"}, 978 {Opt_delalloc, "delalloc"}, ··· 1072 break; 1073 case Opt_nouid32: 1074 set_opt(sbi->s_mount_opt, NO_UID32); 1075 break; 1076 case Opt_debug: 1077 set_opt(sbi->s_mount_opt, DEBUG); ··· 1618 if (block_bitmap < first_block || block_bitmap > last_block) { 1619 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1620 "Block bitmap for group %lu not in group " 1621 + "(block %llu)!\n", i, block_bitmap); 1622 return 0; 1623 } 1624 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1625 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1626 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1627 "Inode bitmap for group %lu not in group " 1628 + "(block %llu)!\n", i, inode_bitmap); 1629 return 0; 1630 } 1631 inode_table = ext4_inode_table(sb, gdp); ··· 1633 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1634 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1635 "Inode table for group %lu not in group " 1636 + "(block %llu)!\n", i, inode_table); 1637 return 0; 1638 } 1639 spin_lock(sb_bgl_lock(sbi, i)); ··· 1778 * 1779 * Note, this does *not* consider any metadata overhead for vfs i_blocks. 1780 */ 1781 + static loff_t ext4_max_size(int blkbits, int has_huge_files) 1782 { 1783 loff_t res; 1784 loff_t upper_limit = MAX_LFS_FILESIZE; 1785 1786 /* small i_blocks in vfs inode? */ 1787 + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1788 /* 1789 * CONFIG_LSF is not enabled implies the inode 1790 * i_block represent total blocks in 512 bytes ··· 1814 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. 1815 * We need to be 1 filesystem block less than the 2^48 sector limit. 
1816 */ 1817 + static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) 1818 { 1819 loff_t res = EXT4_NDIR_BLOCKS; 1820 int meta_blocks; ··· 1827 * total number of 512 bytes blocks of the file 1828 */ 1829 1830 + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1831 /* 1832 + * !has_huge_files or CONFIG_LSF is not enabled 1833 + * implies the inode i_block represent total blocks in 1834 + * 512 bytes 32 == size of vfs inode i_blocks * 8 1835 */ 1836 upper_limit = (1LL << 32) - 1; 1837 ··· 1940 int blocksize; 1941 int db_count; 1942 int i; 1943 + int needs_recovery, has_huge_files; 1944 __le32 features; 1945 __u64 blocks_count; 1946 int err; ··· 2081 sb->s_id, le32_to_cpu(features)); 2082 goto failed_mount; 2083 } 2084 + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2085 + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); 2086 + if (has_huge_files) { 2087 /* 2088 * Large file size enabled file system can only be 2089 * mount if kernel is build with CONFIG_LSF ··· 2131 } 2132 } 2133 2134 + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 2135 + has_huge_files); 2136 + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 2137 2138 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 2139 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; ··· 2456 "available.\n"); 2457 } 2458 2459 + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2460 + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " 2461 + "requested data journaling mode\n"); 2462 + clear_opt(sbi->s_mount_opt, DELALLOC); 2463 + } else if (test_opt(sb, DELALLOC)) 2464 + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2465 + 2466 + ext4_ext_init(sb); 2467 + err = ext4_mb_init(sb, needs_recovery); 2468 + if (err) { 2469 + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", 2470 + err); 2471 + goto failed_mount4; 2472 + } 2473 + 2474 /* 2475 * akpm: core read_super() calls in here with the superblock locked. 2476 * That deadlocks, because orphan cleanup needs to lock the superblock ··· 2474 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2475 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2476 "writeback"); 2477 2478 lock_kernel(); 2479 return 0;
+3
fs/jbd2/commit.c
··· 995 } 996 spin_unlock(&journal->j_list_lock); 997 998 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 999 journal->j_devname, commit_transaction->t_tid, 1000 journal->j_tail_sequence);
··· 995 } 996 spin_unlock(&journal->j_list_lock); 997 998 + if (journal->j_commit_callback) 999 + journal->j_commit_callback(journal, commit_transaction); 1000 + 1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1002 journal->j_devname, commit_transaction->t_tid, 1003 journal->j_tail_sequence);
+1
fs/jbd2/transaction.c
··· 52 transaction->t_expires = jiffies + journal->j_commit_interval; 53 spin_lock_init(&transaction->t_handle_lock); 54 INIT_LIST_HEAD(&transaction->t_inode_list); 55 56 /* Set up the commit timer for the new transaction. */ 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
··· 52 transaction->t_expires = jiffies + journal->j_commit_interval; 53 spin_lock_init(&transaction->t_handle_lock); 54 INIT_LIST_HEAD(&transaction->t_inode_list); 55 + INIT_LIST_HEAD(&transaction->t_private_list); 56 57 /* Set up the commit timer for the new transaction. */ 58 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+9
include/linux/jbd2.h
··· 641 */ 642 int t_handle_count; 643 644 }; 645 646 struct transaction_run_stats_s { ··· 939 int j_wbufsize; 940 941 pid_t j_last_sync_writer; 942 943 /* 944 * Journal statistics
··· 641 */ 642 int t_handle_count; 643 644 + /* 645 + * For use by the filesystem to store fs-specific data 646 + * structures associated with the transaction 647 + */ 648 + struct list_head t_private_list; 649 }; 650 651 struct transaction_run_stats_s { ··· 934 int j_wbufsize; 935 936 pid_t j_last_sync_writer; 937 + 938 + /* This function is called when a transaction is closed */ 939 + void (*j_commit_callback)(journal_t *, 940 + transaction_t *); 941 942 /* 943 * Journal statistics
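Taken together, the three jbd2 changes give a client filesystem a per-transaction private list plus a hook that runs late in the commit path. A sketch of the intended usage with hypothetical my_fs_* names (ext4's actual callback is release_blocks_on_commit in mballoc.c above):

    /* Invoked from the jbd2 commit code once the transaction has
     * committed; everything queued on t_private_list during the
     * transaction is now safe to act on. */
    static void my_fs_commit_callback(journal_t *journal, transaction_t *txn)
    {
            struct list_head *l, *ltmp;

            list_for_each_safe(l, ltmp, &txn->t_private_list) {
                    /* list_entry() back to the fs-private structure,
                     * process it (e.g. release freed blocks), free it */
            }
    }

    /* registered once at mount time, after the journal is loaded: */
    journal->j_commit_callback = my_fs_commit_callback;

    /* while a handle is open, work is queued for commit time via
     * list_add(&item->list, &handle->h_transaction->t_private_list); */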
+9 -1
include/linux/writeback.h
··· 63 unsigned for_writepages:1; /* This is a writepages() call */ 64 unsigned range_cyclic:1; /* range_start is cyclic */ 65 unsigned more_io:1; /* more io to be dispatched */ 66 - unsigned range_cont:1; 67 }; 68 69 /*
··· 63 unsigned for_writepages:1; /* This is a writepages() call */ 64 unsigned range_cyclic:1; /* range_start is cyclic */ 65 unsigned more_io:1; /* more io to be dispatched */ 66 + /* 67 + * write_cache_pages() won't update wbc->nr_to_write and 68 + * mapping->writeback_index if no_nrwrite_index_update 69 + * is set. write_cache_pages() may write more than we 70 + * requested and we want to make sure nr_to_write and 71 + * writeback_index are updated in a consistent manner 72 + * so we use a single control to update them 73 + */ 74 + unsigned no_nrwrite_index_update:1; 75 }; 76 77 /*
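The new flag is meant to be saved, set, and restored by a caller that drives write_cache_pages() (see mm/page-writeback.c below) in a loop and keeps its own accounting, as ext4_da_writepages does above. A condensed sketch of that caller pattern, where writepage_fn, data, and pages_written stand in for the caller's own bookkeeping:

    int saved = wbc->no_nrwrite_index_update;

    wbc->no_nrwrite_index_update = 1;       /* the caller owns the accounting */
    while (!ret && wbc->nr_to_write > 0) {
            ret = write_cache_pages(mapping, wbc, writepage_fn, data);
            wbc->nr_to_write -= pages_written;      /* per-pass count */
    }
    mapping->writeback_index = index;       /* maintained by the caller */
    if (!saved)
            wbc->no_nrwrite_index_update = 0;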
+7 -5
mm/page-writeback.c
··· 876 pgoff_t end; /* Inclusive */ 877 int scanned = 0; 878 int range_whole = 0; 879 880 if (wbc->nonblocking && bdi_write_congested(bdi)) { 881 wbc->encountered_congestion = 1; ··· 940 unlock_page(page); 941 ret = 0; 942 } 943 - if (ret || (--(wbc->nr_to_write) <= 0)) 944 done = 1; 945 if (wbc->nonblocking && bdi_write_congested(bdi)) { 946 wbc->encountered_congestion = 1; ··· 959 index = 0; 960 goto retry; 961 } 962 - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 963 - mapping->writeback_index = index; 964 965 - if (wbc->range_cont) 966 - wbc->range_start = index << PAGE_CACHE_SHIFT; 967 return ret; 968 } 969 EXPORT_SYMBOL(write_cache_pages);
··· 876 pgoff_t end; /* Inclusive */ 877 int scanned = 0; 878 int range_whole = 0; 879 + long nr_to_write = wbc->nr_to_write; 880 881 if (wbc->nonblocking && bdi_write_congested(bdi)) { 882 wbc->encountered_congestion = 1; ··· 939 unlock_page(page); 940 ret = 0; 941 } 942 + if (ret || (--nr_to_write <= 0)) 943 done = 1; 944 if (wbc->nonblocking && bdi_write_congested(bdi)) { 945 wbc->encountered_congestion = 1; ··· 958 index = 0; 959 goto retry; 960 } 961 + if (!wbc->no_nrwrite_index_update) { 962 + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 963 + mapping->writeback_index = index; 964 + wbc->nr_to_write = nr_to_write; 965 + } 966 967 return ret; 968 } 969 EXPORT_SYMBOL(write_cache_pages);