Merge tag 'ext4_for_linus-6.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:

- more folio conversion patches

- add support for FS_IOC_GETFSSYSFSPATH (see the usage sketch after this list)

- mballoc cleanups and more kunit tests

- sysfs cleanups and bug fixes

- miscellaneous bug fixes and cleanups
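
The FS_IOC_GETFSSYSFSPATH support ties in with the new per-superblock sysfs name (see the super_set_sysfs_name_bdev() call in the super.c hunk below). A minimal userspace sketch, assuming the struct fs_sysfs_path / FS_IOC_GETFSSYSFSPATH definitions exported by <linux/fs.h> on recent kernels; error handling is kept to the bare minimum:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fs_sysfs_path p;
	int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFSSYSFSPATH, &p) < 0) {
		perror("FS_IOC_GETFSSYSFSPATH");
		return 1;
	}
	/* p.name is e.g. "ext4/dm-0"; the full path lives under /sys/fs/ */
	printf("/sys/fs/%.*s\n", (int)p.len, (const char *)p.name);
	close(fd);
	return 0;
}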

* tag 'ext4_for_linus-6.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (40 commits)
ext4: fix error pointer dereference in ext4_mb_load_buddy_gfp()
jbd2: add prefix 'jbd2' for 'shrink_type'
jbd2: use shrink_type type instead of bool type for __jbd2_journal_clean_checkpoint_list()
ext4: fix uninitialized ratelimit_state->lock access in __ext4_fill_super()
ext4: remove calls to set/clear the folio error flag
ext4: propagate errors from ext4_sb_bread() in ext4_xattr_block_cache_find()
ext4: fix mb_cache_entry's e_refcnt leak in ext4_xattr_block_cache_find()
jbd2: remove redundant assignment to variable err
ext4: remove the redundant folio_wait_stable()
ext4: fix potential uninitialized variable
ext4: convert ac_buddy_page to ac_buddy_folio
ext4: convert ac_bitmap_page to ac_bitmap_folio
ext4: convert ext4_mb_init_cache() to take a folio
ext4: convert bd_buddy_page to bd_buddy_folio
ext4: convert bd_bitmap_page to bd_bitmap_folio
ext4: open coding repeated check in next_linear_group
ext4: use correct criteria name instead of stale integer number in comment
ext4: call ext4_mb_mark_free_simple to free continuous bits in found chunk
ext4: add test_mb_mark_used_cost to estimate cost of mb_mark_used
ext4: keep "prefetch_grp" and "nr" consistent
...

+485 -367
-5
fs/ext4/acl.h
··· 68 static inline int 69 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 70 { 71 - /* usually, the umask is applied by posix_acl_create(), but if 72 - ext4 ACL support is disabled at compile time, we need to do 73 - it here, because posix_acl_create() will never be called */ 74 - inode->i_mode &= ~current_umask(); 75 - 76 return 0; 77 } 78 #endif /* CONFIG_EXT4_FS_POSIX_ACL */
··· 68 static inline int 69 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 70 { 71 return 0; 72 } 73 #endif /* CONFIG_EXT4_FS_POSIX_ACL */
+6 -3
fs/ext4/ext4.h
··· 213 #define EXT4_MB_USE_RESERVED 0x2000 214 /* Do strict check for free blocks while retrying block allocation */ 215 #define EXT4_MB_STRICT_CHECK 0x4000 216 - /* Large fragment size list lookup succeeded at least once for cr = 0 */ 217 #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 218 - /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ 219 #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 220 - /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ 221 #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 222 223 struct ext4_allocation_request {
··· 213 #define EXT4_MB_USE_RESERVED 0x2000 214 /* Do strict check for free blocks while retrying block allocation */ 215 #define EXT4_MB_STRICT_CHECK 0x4000 216 + /* Large fragment size list lookup succeeded at least once for 217 + * CR_POWER2_ALIGNED */ 218 #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 219 + /* Avg fragment size rb tree lookup succeeded at least once for 220 + * CR_GOAL_LEN_FAST */ 221 #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 222 + /* Avg fragment size rb tree lookup succeeded at least once for 223 + * CR_BEST_AVAIL_LEN */ 224 #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 225 226 struct ext4_allocation_request {
+2 -1
fs/ext4/extents.c
··· 3402 struct ext4_extent *ex, *abut_ex; 3403 ext4_lblk_t ee_block, eof_block; 3404 unsigned int ee_len, depth, map_len = map->m_len; 3405 - int allocated = 0, max_zeroout = 0; 3406 int err = 0; 3407 int split_flag = EXT4_EXT_DATA_VALID2; 3408 3409 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3410 (unsigned long long)map->m_lblk, map_len);
··· 3402 struct ext4_extent *ex, *abut_ex; 3403 ext4_lblk_t ee_block, eof_block; 3404 unsigned int ee_len, depth, map_len = map->m_len; 3405 int err = 0; 3406 int split_flag = EXT4_EXT_DATA_VALID2; 3407 + int allocated = 0; 3408 + unsigned int max_zeroout = 0; 3409 3410 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3411 (unsigned long long)map->m_lblk, map_len);
+2 -3
fs/ext4/file.c
··· 844 if (err) 845 goto out_journal; 846 lock_buffer(sbi->s_sbh); 847 - strncpy(sbi->s_es->s_last_mounted, cp, 848 - sizeof(sbi->s_es->s_last_mounted)); 849 ext4_superblock_csum_set(sb); 850 unlock_buffer(sbi->s_sbh); 851 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); ··· 884 return ret; 885 } 886 887 - filp->f_mode |= FMODE_NOWAIT; 888 return dquot_file_open(inode, filp); 889 } 890
··· 844 if (err) 845 goto out_journal; 846 lock_buffer(sbi->s_sbh); 847 + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); 848 ext4_superblock_csum_set(sb); 849 unlock_buffer(sbi->s_sbh); 850 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); ··· 885 return ret; 886 } 887 888 + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; 889 return dquot_file_open(inode, filp); 890 } 891
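
The strncpy() -> strtomem_pad() switch above avoids the "destination may be unterminated" class of problems for fixed-size, non-NUL-terminated on-disk fields. A small illustrative sketch of the helper (struct on_disk_rec and fill_tag() are made-up names, not part of the patch):

#include <linux/string.h>

struct on_disk_rec {
	char tag[8];		/* fixed-size field, not NUL-terminated */
};

static void fill_tag(struct on_disk_rec *rec, const char *src)
{
	/*
	 * Copy as much of the NUL-terminated @src as fits into rec->tag
	 * (the destination size is taken from the array type at compile
	 * time) and pad any remaining bytes with 0, so no stale data is
	 * left behind in the on-disk field.
	 */
	strtomem_pad(rec->tag, src, 0);
}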
+2 -9
fs/ext4/inode.c
··· 1865 len = folio_size(folio); 1866 if (folio_pos(folio) + len > size && 1867 !ext4_verity_in_progress(mpd->inode)) 1868 - len = size & ~PAGE_MASK; 1869 err = ext4_bio_write_folio(&mpd->io_submit, folio, len); 1870 if (!err) 1871 mpd->wbc->nr_to_write--; ··· 2334 2335 if (folio_pos(folio) + len > size && 2336 !ext4_verity_in_progress(inode)) 2337 - len = size - folio_pos(folio); 2338 2339 return ext4_journal_folio_buffers(handle, folio, len); 2340 } ··· 2886 mapping_gfp_mask(mapping)); 2887 if (IS_ERR(folio)) 2888 return PTR_ERR(folio); 2889 - 2890 - /* In case writeback began while the folio was unlocked */ 2891 - folio_wait_stable(folio); 2892 2893 #ifdef CONFIG_FS_ENCRYPTION 2894 ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); ··· 3527 .bmap = ext4_bmap, 3528 .invalidate_folio = ext4_invalidate_folio, 3529 .release_folio = ext4_release_folio, 3530 - .direct_IO = noop_direct_IO, 3531 .migrate_folio = buffer_migrate_folio, 3532 .is_partially_uptodate = block_is_partially_uptodate, 3533 .error_remove_folio = generic_error_remove_folio, ··· 3543 .bmap = ext4_bmap, 3544 .invalidate_folio = ext4_journalled_invalidate_folio, 3545 .release_folio = ext4_release_folio, 3546 - .direct_IO = noop_direct_IO, 3547 .migrate_folio = buffer_migrate_folio_norefs, 3548 .is_partially_uptodate = block_is_partially_uptodate, 3549 .error_remove_folio = generic_error_remove_folio, ··· 3559 .bmap = ext4_bmap, 3560 .invalidate_folio = ext4_invalidate_folio, 3561 .release_folio = ext4_release_folio, 3562 - .direct_IO = noop_direct_IO, 3563 .migrate_folio = buffer_migrate_folio, 3564 .is_partially_uptodate = block_is_partially_uptodate, 3565 .error_remove_folio = generic_error_remove_folio, ··· 3567 3568 static const struct address_space_operations ext4_dax_aops = { 3569 .writepages = ext4_dax_writepages, 3570 - .direct_IO = noop_direct_IO, 3571 .dirty_folio = noop_dirty_folio, 3572 .bmap = ext4_bmap, 3573 .swap_activate = ext4_iomap_swap_activate,
··· 1865 len = folio_size(folio); 1866 if (folio_pos(folio) + len > size && 1867 !ext4_verity_in_progress(mpd->inode)) 1868 + len = size & (len - 1); 1869 err = ext4_bio_write_folio(&mpd->io_submit, folio, len); 1870 if (!err) 1871 mpd->wbc->nr_to_write--; ··· 2334 2335 if (folio_pos(folio) + len > size && 2336 !ext4_verity_in_progress(inode)) 2337 + len = size & (len - 1); 2338 2339 return ext4_journal_folio_buffers(handle, folio, len); 2340 } ··· 2886 mapping_gfp_mask(mapping)); 2887 if (IS_ERR(folio)) 2888 return PTR_ERR(folio); 2889 2890 #ifdef CONFIG_FS_ENCRYPTION 2891 ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); ··· 3530 .bmap = ext4_bmap, 3531 .invalidate_folio = ext4_invalidate_folio, 3532 .release_folio = ext4_release_folio, 3533 .migrate_folio = buffer_migrate_folio, 3534 .is_partially_uptodate = block_is_partially_uptodate, 3535 .error_remove_folio = generic_error_remove_folio, ··· 3547 .bmap = ext4_bmap, 3548 .invalidate_folio = ext4_journalled_invalidate_folio, 3549 .release_folio = ext4_release_folio, 3550 .migrate_folio = buffer_migrate_folio_norefs, 3551 .is_partially_uptodate = block_is_partially_uptodate, 3552 .error_remove_folio = generic_error_remove_folio, ··· 3564 .bmap = ext4_bmap, 3565 .invalidate_folio = ext4_invalidate_folio, 3566 .release_folio = ext4_release_folio, 3567 .migrate_folio = buffer_migrate_folio, 3568 .is_partially_uptodate = block_is_partially_uptodate, 3569 .error_remove_folio = generic_error_remove_folio, ··· 3573 3574 static const struct address_space_operations ext4_dax_aops = { 3575 .writepages = ext4_dax_writepages, 3576 .dirty_folio = noop_dirty_folio, 3577 .bmap = ext4_bmap, 3578 .swap_activate = ext4_iomap_swap_activate,
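
The `len = size & ~PAGE_MASK` -> `len = size & (len - 1)` change above is what makes the truncation work for large folios: folio_size() is always a power of two, so masking with (len - 1) is the remainder of size modulo the folio size, i.e. the number of valid bytes in the folio that straddles EOF. A worked sketch with made-up numbers:

/*
 * For a power-of-two folio size, size & (len - 1) == size % len.
 * Example (hypothetical values): i_size = 70000 bytes, an order-2
 * folio of 4K pages => len = 16384, and the folio covering EOF has
 * 70000 & 16383 == 70000 % 16384 == 4464 valid bytes to write.
 */
static inline size_t bytes_in_eof_folio(loff_t size, size_t folio_len)
{
	return size & (folio_len - 1);
}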
+1 -2
fs/ext4/ioctl.c
··· 1150 */ 1151 BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); 1152 1153 - memset(label, 0, sizeof(label)); 1154 lock_buffer(sbi->s_sbh); 1155 - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); 1156 unlock_buffer(sbi->s_sbh); 1157 1158 if (copy_to_user(user_label, label, sizeof(label)))
··· 1150 */ 1151 BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); 1152 1153 lock_buffer(sbi->s_sbh); 1154 + strscpy_pad(label, sbi->s_es->s_volume_name); 1155 unlock_buffer(sbi->s_sbh); 1156 1157 if (copy_to_user(user_label, label, sizeof(label)))
+76
fs/ext4/mballoc-test.c
··· 30 #define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) 31 #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) 32 33 static const struct super_operations mbt_sops = { 34 }; 35 36 static void mbt_kill_sb(struct super_block *sb) ··· 883 ext4_mb_unload_buddy(&e4b); 884 } 885 886 static const struct mbt_ext4_block_layout mbt_test_layouts[] = { 887 { 888 .blocksize_bits = 10, ··· 975 KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), 976 KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), 977 KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), 978 {} 979 }; 980
··· 30 #define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) 31 #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) 32 33 + static struct inode *mbt_alloc_inode(struct super_block *sb) 34 + { 35 + struct ext4_inode_info *ei; 36 + 37 + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); 38 + if (!ei) 39 + return NULL; 40 + 41 + INIT_LIST_HEAD(&ei->i_orphan); 42 + init_rwsem(&ei->xattr_sem); 43 + init_rwsem(&ei->i_data_sem); 44 + inode_init_once(&ei->vfs_inode); 45 + ext4_fc_init_inode(&ei->vfs_inode); 46 + 47 + return &ei->vfs_inode; 48 + } 49 + 50 + static void mbt_free_inode(struct inode *inode) 51 + { 52 + kfree(EXT4_I(inode)); 53 + } 54 + 55 static const struct super_operations mbt_sops = { 56 + .alloc_inode = mbt_alloc_inode, 57 + .free_inode = mbt_free_inode, 58 }; 59 60 static void mbt_kill_sb(struct super_block *sb) ··· 859 ext4_mb_unload_buddy(&e4b); 860 } 861 862 + #define COUNT_FOR_ESTIMATE 100000 863 + static void test_mb_mark_used_cost(struct kunit *test) 864 + { 865 + struct ext4_buddy e4b; 866 + struct super_block *sb = (struct super_block *)test->priv; 867 + struct ext4_free_extent ex; 868 + int ret; 869 + struct test_range ranges[TEST_RANGE_COUNT]; 870 + int i, j; 871 + unsigned long start, end, all = 0; 872 + 873 + /* buddy cache assumes that each page contains at least one block */ 874 + if (sb->s_blocksize > PAGE_SIZE) 875 + kunit_skip(test, "blocksize exceeds pagesize"); 876 + 877 + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); 878 + KUNIT_ASSERT_EQ(test, ret, 0); 879 + 880 + ex.fe_group = TEST_GOAL_GROUP; 881 + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { 882 + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); 883 + start = jiffies; 884 + for (i = 0; i < TEST_RANGE_COUNT; i++) { 885 + if (ranges[i].len == 0) 886 + continue; 887 + 888 + ex.fe_start = ranges[i].start; 889 + ex.fe_len = ranges[i].len; 890 + ext4_lock_group(sb, TEST_GOAL_GROUP); 891 + mb_mark_used(&e4b, &ex); 892 + ext4_unlock_group(sb, TEST_GOAL_GROUP); 893 + } 894 + end = jiffies; 895 + all += (end - start); 896 + 897 + for (i = 0; i < TEST_RANGE_COUNT; i++) { 898 + if (ranges[i].len == 0) 899 + continue; 900 + 901 + ext4_lock_group(sb, TEST_GOAL_GROUP); 902 + mb_free_blocks(NULL, &e4b, ranges[i].start, 903 + ranges[i].len); 904 + ext4_unlock_group(sb, TEST_GOAL_GROUP); 905 + } 906 + } 907 + 908 + kunit_info(test, "costed jiffies %lu\n", all); 909 + ext4_mb_unload_buddy(&e4b); 910 + } 911 + 912 static const struct mbt_ext4_block_layout mbt_test_layouts[] = { 913 { 914 .blocksize_bits = 10, ··· 901 KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), 902 KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), 903 KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), 904 + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, 905 + { .speed = KUNIT_SPEED_SLOW }), 906 {} 907 }; 908
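
The new test_mb_mark_used_cost() estimates cost with plain jiffies deltas and reports it via kunit_info(); the KUNIT_SPEED_SLOW attribute keeps it out of quick test runs. A stripped-down sketch of that timing pattern (do_workload() is a hypothetical stand-in for the mb_mark_used()/mb_free_blocks() loops of the real test):

#include <linux/jiffies.h>
#include <kunit/test.h>

static void time_workload_sketch(struct kunit *test)
{
	unsigned long start, total = 0;
	int i;

	for (i = 0; i < 100; i++) {
		start = jiffies;
		do_workload();			/* hypothetical helper */
		total += jiffies - start;
	}
	kunit_info(test, "costed jiffies %lu (~%u ms)\n",
		   total, jiffies_to_msecs(total));
}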
+168 -154
fs/ext4/mballoc.c
··· 831 return 0; 832 if (order == MB_NUM_ORDERS(sb)) 833 order--; 834 return order; 835 } 836 ··· 1010 * goal length. 1011 */ 1012 order = fls(ac->ac_g_ex.fe_len) - 1; 1013 min_order = order - sbi->s_mb_best_avail_max_trim_order; 1014 if (min_order < 0) 1015 min_order = 0; ··· 1080 } 1081 1082 /* 1083 - * Return next linear group for allocation. If linear traversal should not be 1084 - * performed, this function just returns the same group 1085 */ 1086 static ext4_group_t 1087 - next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, 1088 - ext4_group_t ngroups) 1089 { 1090 - if (!should_optimize_scan(ac)) 1091 - goto inc_and_return; 1092 - 1093 - if (ac->ac_groups_linear_remaining) { 1094 - ac->ac_groups_linear_remaining--; 1095 - goto inc_and_return; 1096 - } 1097 - 1098 - return group; 1099 - inc_and_return: 1100 /* 1101 * Artificially restricted ngroups for non-extent 1102 * files makes group > ngroups possible on first loop. ··· 1110 { 1111 *new_cr = ac->ac_criteria; 1112 1113 - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { 1114 - *group = next_linear_group(ac, *group, ngroups); 1115 return; 1116 } 1117 ··· 1134 ext4_mb_choose_next_group_best_avail(ac, new_cr, group); 1135 } else { 1136 /* 1137 - * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1138 - * bb_free. But until that happens, we should never come here. 1139 */ 1140 WARN_ON(1); 1141 } ··· 1274 * for this page; do not hold this lock when calling this routine! 1275 */ 1276 1277 - static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) 1278 { 1279 ext4_group_t ngroups; 1280 unsigned int blocksize; ··· 1292 char *bitmap; 1293 struct ext4_group_info *grinfo; 1294 1295 - inode = page->mapping->host; 1296 sb = inode->i_sb; 1297 ngroups = ext4_get_groups_count(sb); 1298 blocksize = i_blocksize(inode); 1299 blocks_per_page = PAGE_SIZE / blocksize; 1300 1301 - mb_debug(sb, "init page %lu\n", page->index); 1302 1303 groups_per_page = blocks_per_page >> 1; 1304 if (groups_per_page == 0) ··· 1313 } else 1314 bh = &bhs; 1315 1316 - first_group = page->index * blocks_per_page / 2; 1317 1318 - /* read all groups the page covers into the cache */ 1319 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1320 if (group >= ngroups) 1321 break; ··· 1326 /* 1327 * If page is uptodate then we came here after online resize 1328 * which added some new uninitialized group info structs, so 1329 - * we must skip all initialized uptodate buddies on the page, 1330 * which may be currently in use by an allocating task. 
1331 */ 1332 - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { 1333 bh[i] = NULL; 1334 continue; 1335 } ··· 1354 err = err2; 1355 } 1356 1357 - first_block = page->index * blocks_per_page; 1358 for (i = 0; i < blocks_per_page; i++) { 1359 group = (first_block + i) >> 1; 1360 if (group >= ngroups) ··· 1375 * above 1376 * 1377 */ 1378 - data = page_address(page) + (i * blocksize); 1379 bitmap = bh[group - first_group]->b_data; 1380 1381 /* ··· 1390 if ((first_block + i) & 1) { 1391 /* this is block of buddy */ 1392 BUG_ON(incore == NULL); 1393 - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", 1394 - group, page->index, i * blocksize); 1395 trace_ext4_mb_buddy_bitmap_load(sb, group); 1396 grinfo->bb_fragments = 0; 1397 memset(grinfo->bb_counters, 0, ··· 1409 } else { 1410 /* this is block of bitmap */ 1411 BUG_ON(incore != NULL); 1412 - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", 1413 - group, page->index, i * blocksize); 1414 trace_ext4_mb_bitmap_load(sb, group); 1415 1416 /* see comments in ext4_mb_put_pa() */ ··· 1428 incore = data; 1429 } 1430 } 1431 - SetPageUptodate(page); 1432 1433 out: 1434 if (bh) { ··· 1444 * Lock the buddy and bitmap pages. This make sure other parallel init_group 1445 * on the same buddy page doesn't happen whild holding the buddy page lock. 1446 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 1447 - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 1448 */ 1449 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1450 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) ··· 1452 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1453 int block, pnum, poff; 1454 int blocks_per_page; 1455 - struct page *page; 1456 1457 - e4b->bd_buddy_page = NULL; 1458 - e4b->bd_bitmap_page = NULL; 1459 1460 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1461 /* ··· 1466 block = group * 2; 1467 pnum = block / blocks_per_page; 1468 poff = block % blocks_per_page; 1469 - page = find_or_create_page(inode->i_mapping, pnum, gfp); 1470 - if (!page) 1471 - return -ENOMEM; 1472 - BUG_ON(page->mapping != inode->i_mapping); 1473 - e4b->bd_bitmap_page = page; 1474 - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1475 1476 if (blocks_per_page >= 2) { 1477 /* buddy and bitmap are on the same page */ ··· 1480 } 1481 1482 /* blocks_per_page == 1, hence we need another page for the buddy */ 1483 - page = find_or_create_page(inode->i_mapping, block + 1, gfp); 1484 - if (!page) 1485 - return -ENOMEM; 1486 - BUG_ON(page->mapping != inode->i_mapping); 1487 - e4b->bd_buddy_page = page; 1488 return 0; 1489 } 1490 1491 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1492 { 1493 - if (e4b->bd_bitmap_page) { 1494 - unlock_page(e4b->bd_bitmap_page); 1495 - put_page(e4b->bd_bitmap_page); 1496 } 1497 - if (e4b->bd_buddy_page) { 1498 - unlock_page(e4b->bd_buddy_page); 1499 - put_page(e4b->bd_buddy_page); 1500 } 1501 } 1502 ··· 1512 1513 struct ext4_group_info *this_grp; 1514 struct ext4_buddy e4b; 1515 - struct page *page; 1516 int ret = 0; 1517 1518 might_sleep(); ··· 1539 goto err; 1540 } 1541 1542 - page = e4b.bd_bitmap_page; 1543 - ret = ext4_mb_init_cache(page, NULL, gfp); 1544 if (ret) 1545 goto err; 1546 - if (!PageUptodate(page)) { 1547 ret = -EIO; 1548 goto err; 1549 } 1550 1551 - if (e4b.bd_buddy_page == NULL) { 1552 /* 1553 * If both the bitmap and buddy are in 1554 * the same page we don't need to force ··· 1558 goto err; 1559 } 1560 /* init buddy cache */ 1561 - page = 
e4b.bd_buddy_page; 1562 - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); 1563 if (ret) 1564 goto err; 1565 - if (!PageUptodate(page)) { 1566 ret = -EIO; 1567 goto err; 1568 } ··· 1584 int block; 1585 int pnum; 1586 int poff; 1587 - struct page *page; 1588 int ret; 1589 struct ext4_group_info *grp; 1590 struct ext4_sb_info *sbi = EXT4_SB(sb); ··· 1602 e4b->bd_info = grp; 1603 e4b->bd_sb = sb; 1604 e4b->bd_group = group; 1605 - e4b->bd_buddy_page = NULL; 1606 - e4b->bd_bitmap_page = NULL; 1607 1608 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1609 /* ··· 1624 pnum = block / blocks_per_page; 1625 poff = block % blocks_per_page; 1626 1627 - /* we could use find_or_create_page(), but it locks page 1628 - * what we'd like to avoid in fast path ... */ 1629 - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1630 - if (page == NULL || !PageUptodate(page)) { 1631 - if (page) 1632 /* 1633 - * drop the page reference and try 1634 - * to get the page with lock. If we 1635 * are not uptodate that implies 1636 - * somebody just created the page but 1637 - * is yet to initialize the same. So 1638 * wait for it to initialize. 1639 */ 1640 - put_page(page); 1641 - page = find_or_create_page(inode->i_mapping, pnum, gfp); 1642 - if (page) { 1643 - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1644 - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { 1645 /* should never happen */ 1646 - unlock_page(page); 1647 ret = -EINVAL; 1648 goto err; 1649 } 1650 - if (!PageUptodate(page)) { 1651 - ret = ext4_mb_init_cache(page, NULL, gfp); 1652 if (ret) { 1653 - unlock_page(page); 1654 goto err; 1655 } 1656 - mb_cmp_bitmaps(e4b, page_address(page) + 1657 (poff * sb->s_blocksize)); 1658 } 1659 - unlock_page(page); 1660 } 1661 } 1662 - if (page == NULL) { 1663 - ret = -ENOMEM; 1664 goto err; 1665 } 1666 - if (!PageUptodate(page)) { 1667 ret = -EIO; 1668 goto err; 1669 } 1670 1671 - /* Pages marked accessed already */ 1672 - e4b->bd_bitmap_page = page; 1673 - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1674 1675 block++; 1676 pnum = block / blocks_per_page; 1677 poff = block % blocks_per_page; 1678 1679 - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1680 - if (page == NULL || !PageUptodate(page)) { 1681 - if (page) 1682 - put_page(page); 1683 - page = find_or_create_page(inode->i_mapping, pnum, gfp); 1684 - if (page) { 1685 - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1686 - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { 1687 /* should never happen */ 1688 - unlock_page(page); 1689 ret = -EINVAL; 1690 goto err; 1691 } 1692 - if (!PageUptodate(page)) { 1693 - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, 1694 gfp); 1695 if (ret) { 1696 - unlock_page(page); 1697 goto err; 1698 } 1699 } 1700 - unlock_page(page); 1701 } 1702 } 1703 - if (page == NULL) { 1704 - ret = -ENOMEM; 1705 goto err; 1706 } 1707 - if (!PageUptodate(page)) { 1708 ret = -EIO; 1709 goto err; 1710 } 1711 1712 - /* Pages marked accessed already */ 1713 - e4b->bd_buddy_page = page; 1714 - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1715 1716 return 0; 1717 1718 err: 1719 - if (page) 1720 - put_page(page); 1721 - if (e4b->bd_bitmap_page) 1722 - put_page(e4b->bd_bitmap_page); 1723 1724 e4b->bd_buddy = NULL; 1725 e4b->bd_bitmap = NULL; ··· 1735 1736 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1737 { 1738 - if (e4b->bd_bitmap_page) 1739 - put_page(e4b->bd_bitmap_page); 1740 - if (e4b->bd_buddy_page) 1741 - 
put_page(e4b->bd_buddy_page); 1742 } 1743 1744 ··· 2048 int ord; 2049 int mlen = 0; 2050 int max = 0; 2051 - int cur; 2052 int start = ex->fe_start; 2053 int len = ex->fe_len; 2054 unsigned ret = 0; 2055 int len0 = len; 2056 void *buddy; 2057 - bool split = false; 2058 2059 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 2060 BUG_ON(e4b->bd_group != ex->fe_group); ··· 2078 2079 /* let's maintain buddy itself */ 2080 while (len) { 2081 - if (!split) 2082 - ord = mb_find_order_for_block(e4b, start); 2083 2084 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 2085 /* the whole chunk may be allocated at once! */ 2086 mlen = 1 << ord; 2087 - if (!split) 2088 - buddy = mb_find_buddy(e4b, ord, &max); 2089 - else 2090 - split = false; 2091 BUG_ON((start >> ord) >= max); 2092 mb_set_bit(start >> ord, buddy); 2093 e4b->bd_info->bb_counters[ord]--; ··· 2097 if (ret == 0) 2098 ret = len | (ord << 16); 2099 2100 - /* we have to split large buddy */ 2101 BUG_ON(ord <= 0); 2102 buddy = mb_find_buddy(e4b, ord, &max); 2103 mb_set_bit(start >> ord, buddy); 2104 e4b->bd_info->bb_counters[ord]--; 2105 2106 - ord--; 2107 - cur = (start >> ord) & ~1U; 2108 - buddy = mb_find_buddy(e4b, ord, &max); 2109 - mb_clear_bit(cur, buddy); 2110 - mb_clear_bit(cur + 1, buddy); 2111 - e4b->bd_info->bb_counters[ord]++; 2112 - e4b->bd_info->bb_counters[ord]++; 2113 - split = true; 2114 } 2115 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 2116 ··· 2161 * double allocate blocks. The reference is dropped 2162 * in ext4_mb_release_context 2163 */ 2164 - ac->ac_bitmap_page = e4b->bd_bitmap_page; 2165 - get_page(ac->ac_bitmap_page); 2166 - ac->ac_buddy_page = e4b->bd_buddy_page; 2167 - get_page(ac->ac_buddy_page); 2168 /* store last allocated for subsequent stream allocation */ 2169 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2170 spin_lock(&sbi->s_md_lock); ··· 2687 int ret; 2688 2689 /* 2690 - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic 2691 * search to find large good chunks almost for free. If buddy 2692 * data is not ready, then this optimization makes no sense. But 2693 * we never skip the first block group in a flex_bg, since this ··· 2868 group = ac->ac_g_ex.fe_group; 2869 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2870 prefetch_grp = group; 2871 2872 for (i = 0, new_cr = cr; i < ngroups; i++, 2873 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { ··· 3199 } 3200 3201 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 3202 - __acquires(&EXT4_SB(sb)->s_mb_rb_lock) 3203 { 3204 struct super_block *sb = pde_data(file_inode(seq->file)); 3205 unsigned long position; ··· 3452 } 3453 if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) 3454 sbi->s_mb_prefetch = ext4_get_groups_count(sb); 3455 - /* now many real IOs to prefetch within a single allocation at cr=0 3456 - * given cr=0 is an CPU-related optimization we shouldn't try to 3457 - * load too many groups, at some point we should start to use what 3458 - * we've got in memory. 
3459 * with an average random access time 5ms, it'd take a second to get 3460 * 200 groups (* N with flex_bg), so let's make this limit 4 3461 */ ··· 3897 /* No more items in the per group rb tree 3898 * balance refcounts from ext4_mb_free_metadata() 3899 */ 3900 - put_page(e4b.bd_buddy_page); 3901 - put_page(e4b.bd_bitmap_page); 3902 } 3903 ext4_unlock_group(sb, entry->efd_group); 3904 ext4_mb_unload_buddy(&e4b); ··· 6002 6003 ext4_mb_put_pa(ac, ac->ac_sb, pa); 6004 } 6005 - if (ac->ac_bitmap_page) 6006 - put_page(ac->ac_bitmap_page); 6007 - if (ac->ac_buddy_page) 6008 - put_page(ac->ac_buddy_page); 6009 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 6010 mutex_unlock(&ac->ac_lg->lg_mutex); 6011 ext4_mb_collect_stats(ac); ··· 6126 ext4_mb_mark_bb(sb, block, 1, true); 6127 ar->len = 1; 6128 6129 return block; 6130 } 6131 ··· 6321 struct rb_node *parent = NULL, *new_node; 6322 6323 BUG_ON(!ext4_handle_valid(handle)); 6324 - BUG_ON(e4b->bd_bitmap_page == NULL); 6325 - BUG_ON(e4b->bd_buddy_page == NULL); 6326 6327 new_node = &new_entry->efd_node; 6328 cluster = new_entry->efd_start_cluster; ··· 6333 * otherwise we'll refresh it from 6334 * on-disk bitmap and lose not-yet-available 6335 * blocks */ 6336 - get_page(e4b->bd_buddy_page); 6337 - get_page(e4b->bd_bitmap_page); 6338 } 6339 while (*n) { 6340 parent = *n;
··· 831 return 0; 832 if (order == MB_NUM_ORDERS(sb)) 833 order--; 834 + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) 835 + order = MB_NUM_ORDERS(sb) - 1; 836 return order; 837 } 838 ··· 1008 * goal length. 1009 */ 1010 order = fls(ac->ac_g_ex.fe_len) - 1; 1011 + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) 1012 + order = MB_NUM_ORDERS(ac->ac_sb); 1013 min_order = order - sbi->s_mb_best_avail_max_trim_order; 1014 if (min_order < 0) 1015 min_order = 0; ··· 1076 } 1077 1078 /* 1079 + * Return next linear group for allocation. 1080 */ 1081 static ext4_group_t 1082 + next_linear_group(ext4_group_t group, ext4_group_t ngroups) 1083 { 1084 /* 1085 * Artificially restricted ngroups for non-extent 1086 * files makes group > ngroups possible on first loop. ··· 1118 { 1119 *new_cr = ac->ac_criteria; 1120 1121 + if (!should_optimize_scan(ac)) { 1122 + *group = next_linear_group(*group, ngroups); 1123 + return; 1124 + } 1125 + 1126 + /* 1127 + * Optimized scanning can return non adjacent groups which can cause 1128 + * seek overhead for rotational disks. So try few linear groups before 1129 + * trying optimized scan. 1130 + */ 1131 + if (ac->ac_groups_linear_remaining) { 1132 + *group = next_linear_group(*group, ngroups); 1133 + ac->ac_groups_linear_remaining--; 1134 return; 1135 } 1136 ··· 1131 ext4_mb_choose_next_group_best_avail(ac, new_cr, group); 1132 } else { 1133 /* 1134 + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an 1135 + * rb tree sorted by bb_free. But until that happens, we should 1136 + * never come here. 1137 */ 1138 WARN_ON(1); 1139 } ··· 1270 * for this page; do not hold this lock when calling this routine! 1271 */ 1272 1273 + static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) 1274 { 1275 ext4_group_t ngroups; 1276 unsigned int blocksize; ··· 1288 char *bitmap; 1289 struct ext4_group_info *grinfo; 1290 1291 + inode = folio->mapping->host; 1292 sb = inode->i_sb; 1293 ngroups = ext4_get_groups_count(sb); 1294 blocksize = i_blocksize(inode); 1295 blocks_per_page = PAGE_SIZE / blocksize; 1296 1297 + mb_debug(sb, "init folio %lu\n", folio->index); 1298 1299 groups_per_page = blocks_per_page >> 1; 1300 if (groups_per_page == 0) ··· 1309 } else 1310 bh = &bhs; 1311 1312 + first_group = folio->index * blocks_per_page / 2; 1313 1314 + /* read all groups the folio covers into the cache */ 1315 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1316 if (group >= ngroups) 1317 break; ··· 1322 /* 1323 * If page is uptodate then we came here after online resize 1324 * which added some new uninitialized group info structs, so 1325 + * we must skip all initialized uptodate buddies on the folio, 1326 * which may be currently in use by an allocating task. 
1327 */ 1328 + if (folio_test_uptodate(folio) && 1329 + !EXT4_MB_GRP_NEED_INIT(grinfo)) { 1330 bh[i] = NULL; 1331 continue; 1332 } ··· 1349 err = err2; 1350 } 1351 1352 + first_block = folio->index * blocks_per_page; 1353 for (i = 0; i < blocks_per_page; i++) { 1354 group = (first_block + i) >> 1; 1355 if (group >= ngroups) ··· 1370 * above 1371 * 1372 */ 1373 + data = folio_address(folio) + (i * blocksize); 1374 bitmap = bh[group - first_group]->b_data; 1375 1376 /* ··· 1385 if ((first_block + i) & 1) { 1386 /* this is block of buddy */ 1387 BUG_ON(incore == NULL); 1388 + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", 1389 + group, folio->index, i * blocksize); 1390 trace_ext4_mb_buddy_bitmap_load(sb, group); 1391 grinfo->bb_fragments = 0; 1392 memset(grinfo->bb_counters, 0, ··· 1404 } else { 1405 /* this is block of bitmap */ 1406 BUG_ON(incore != NULL); 1407 + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", 1408 + group, folio->index, i * blocksize); 1409 trace_ext4_mb_bitmap_load(sb, group); 1410 1411 /* see comments in ext4_mb_put_pa() */ ··· 1423 incore = data; 1424 } 1425 } 1426 + folio_mark_uptodate(folio); 1427 1428 out: 1429 if (bh) { ··· 1439 * Lock the buddy and bitmap pages. This make sure other parallel init_group 1440 * on the same buddy page doesn't happen whild holding the buddy page lock. 1441 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 1442 + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. 1443 */ 1444 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1445 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) ··· 1447 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1448 int block, pnum, poff; 1449 int blocks_per_page; 1450 + struct folio *folio; 1451 1452 + e4b->bd_buddy_folio = NULL; 1453 + e4b->bd_bitmap_folio = NULL; 1454 1455 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1456 /* ··· 1461 block = group * 2; 1462 pnum = block / blocks_per_page; 1463 poff = block % blocks_per_page; 1464 + folio = __filemap_get_folio(inode->i_mapping, pnum, 1465 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1466 + if (IS_ERR(folio)) 1467 + return PTR_ERR(folio); 1468 + BUG_ON(folio->mapping != inode->i_mapping); 1469 + e4b->bd_bitmap_folio = folio; 1470 + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); 1471 1472 if (blocks_per_page >= 2) { 1473 /* buddy and bitmap are on the same page */ ··· 1474 } 1475 1476 /* blocks_per_page == 1, hence we need another page for the buddy */ 1477 + folio = __filemap_get_folio(inode->i_mapping, block + 1, 1478 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1479 + if (IS_ERR(folio)) 1480 + return PTR_ERR(folio); 1481 + BUG_ON(folio->mapping != inode->i_mapping); 1482 + e4b->bd_buddy_folio = folio; 1483 return 0; 1484 } 1485 1486 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1487 { 1488 + if (e4b->bd_bitmap_folio) { 1489 + folio_unlock(e4b->bd_bitmap_folio); 1490 + folio_put(e4b->bd_bitmap_folio); 1491 } 1492 + if (e4b->bd_buddy_folio) { 1493 + folio_unlock(e4b->bd_buddy_folio); 1494 + folio_put(e4b->bd_buddy_folio); 1495 } 1496 } 1497 ··· 1505 1506 struct ext4_group_info *this_grp; 1507 struct ext4_buddy e4b; 1508 + struct folio *folio; 1509 int ret = 0; 1510 1511 might_sleep(); ··· 1532 goto err; 1533 } 1534 1535 + folio = e4b.bd_bitmap_folio; 1536 + ret = ext4_mb_init_cache(folio, NULL, gfp); 1537 if (ret) 1538 goto err; 1539 + if (!folio_test_uptodate(folio)) { 1540 ret = -EIO; 1541 goto err; 1542 } 1543 1544 + if 
(e4b.bd_buddy_folio == NULL) { 1545 /* 1546 * If both the bitmap and buddy are in 1547 * the same page we don't need to force ··· 1551 goto err; 1552 } 1553 /* init buddy cache */ 1554 + folio = e4b.bd_buddy_folio; 1555 + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); 1556 if (ret) 1557 goto err; 1558 + if (!folio_test_uptodate(folio)) { 1559 ret = -EIO; 1560 goto err; 1561 } ··· 1577 int block; 1578 int pnum; 1579 int poff; 1580 + struct folio *folio; 1581 int ret; 1582 struct ext4_group_info *grp; 1583 struct ext4_sb_info *sbi = EXT4_SB(sb); ··· 1595 e4b->bd_info = grp; 1596 e4b->bd_sb = sb; 1597 e4b->bd_group = group; 1598 + e4b->bd_buddy_folio = NULL; 1599 + e4b->bd_bitmap_folio = NULL; 1600 1601 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1602 /* ··· 1617 pnum = block / blocks_per_page; 1618 poff = block % blocks_per_page; 1619 1620 + /* Avoid locking the folio in the fast path ... */ 1621 + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); 1622 + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { 1623 + if (!IS_ERR(folio)) 1624 /* 1625 + * drop the folio reference and try 1626 + * to get the folio with lock. If we 1627 * are not uptodate that implies 1628 + * somebody just created the folio but 1629 + * is yet to initialize it. So 1630 * wait for it to initialize. 1631 */ 1632 + folio_put(folio); 1633 + folio = __filemap_get_folio(inode->i_mapping, pnum, 1634 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1635 + if (!IS_ERR(folio)) { 1636 + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, 1637 + "ext4: bitmap's mapping != inode->i_mapping\n")) { 1638 /* should never happen */ 1639 + folio_unlock(folio); 1640 ret = -EINVAL; 1641 goto err; 1642 } 1643 + if (!folio_test_uptodate(folio)) { 1644 + ret = ext4_mb_init_cache(folio, NULL, gfp); 1645 if (ret) { 1646 + folio_unlock(folio); 1647 goto err; 1648 } 1649 + mb_cmp_bitmaps(e4b, folio_address(folio) + 1650 (poff * sb->s_blocksize)); 1651 } 1652 + folio_unlock(folio); 1653 } 1654 } 1655 + if (IS_ERR(folio)) { 1656 + ret = PTR_ERR(folio); 1657 goto err; 1658 } 1659 + if (!folio_test_uptodate(folio)) { 1660 ret = -EIO; 1661 goto err; 1662 } 1663 1664 + /* Folios marked accessed already */ 1665 + e4b->bd_bitmap_folio = folio; 1666 + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); 1667 1668 block++; 1669 pnum = block / blocks_per_page; 1670 poff = block % blocks_per_page; 1671 1672 + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); 1673 + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { 1674 + if (!IS_ERR(folio)) 1675 + folio_put(folio); 1676 + folio = __filemap_get_folio(inode->i_mapping, pnum, 1677 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1678 + if (!IS_ERR(folio)) { 1679 + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, 1680 + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { 1681 /* should never happen */ 1682 + folio_unlock(folio); 1683 ret = -EINVAL; 1684 goto err; 1685 } 1686 + if (!folio_test_uptodate(folio)) { 1687 + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, 1688 gfp); 1689 if (ret) { 1690 + folio_unlock(folio); 1691 goto err; 1692 } 1693 } 1694 + folio_unlock(folio); 1695 } 1696 } 1697 + if (IS_ERR(folio)) { 1698 + ret = PTR_ERR(folio); 1699 goto err; 1700 } 1701 + if (!folio_test_uptodate(folio)) { 1702 ret = -EIO; 1703 goto err; 1704 } 1705 1706 + /* Folios marked accessed already */ 1707 + e4b->bd_buddy_folio = folio; 1708 + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); 1709 1710 return 0; 1711 1712 err: 1713 + if 
(!IS_ERR_OR_NULL(folio)) 1714 + folio_put(folio); 1715 + if (e4b->bd_bitmap_folio) 1716 + folio_put(e4b->bd_bitmap_folio); 1717 1718 e4b->bd_buddy = NULL; 1719 e4b->bd_bitmap = NULL; ··· 1727 1728 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1729 { 1730 + if (e4b->bd_bitmap_folio) 1731 + folio_put(e4b->bd_bitmap_folio); 1732 + if (e4b->bd_buddy_folio) 1733 + folio_put(e4b->bd_buddy_folio); 1734 } 1735 1736 ··· 2040 int ord; 2041 int mlen = 0; 2042 int max = 0; 2043 int start = ex->fe_start; 2044 int len = ex->fe_len; 2045 unsigned ret = 0; 2046 int len0 = len; 2047 void *buddy; 2048 + int ord_start, ord_end; 2049 2050 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 2051 BUG_ON(e4b->bd_group != ex->fe_group); ··· 2071 2072 /* let's maintain buddy itself */ 2073 while (len) { 2074 + ord = mb_find_order_for_block(e4b, start); 2075 2076 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 2077 /* the whole chunk may be allocated at once! */ 2078 mlen = 1 << ord; 2079 + buddy = mb_find_buddy(e4b, ord, &max); 2080 BUG_ON((start >> ord) >= max); 2081 mb_set_bit(start >> ord, buddy); 2082 e4b->bd_info->bb_counters[ord]--; ··· 2094 if (ret == 0) 2095 ret = len | (ord << 16); 2096 2097 BUG_ON(ord <= 0); 2098 buddy = mb_find_buddy(e4b, ord, &max); 2099 mb_set_bit(start >> ord, buddy); 2100 e4b->bd_info->bb_counters[ord]--; 2101 2102 + ord_start = (start >> ord) << ord; 2103 + ord_end = ord_start + (1 << ord); 2104 + /* first chunk */ 2105 + if (start > ord_start) 2106 + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, 2107 + ord_start, start - ord_start, 2108 + e4b->bd_info); 2109 + 2110 + /* last chunk */ 2111 + if (start + len < ord_end) { 2112 + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, 2113 + start + len, 2114 + ord_end - (start + len), 2115 + e4b->bd_info); 2116 + break; 2117 + } 2118 + len = start + len - ord_end; 2119 + start = ord_end; 2120 } 2121 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 2122 ··· 2149 * double allocate blocks. The reference is dropped 2150 * in ext4_mb_release_context 2151 */ 2152 + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; 2153 + folio_get(ac->ac_bitmap_folio); 2154 + ac->ac_buddy_folio = e4b->bd_buddy_folio; 2155 + folio_get(ac->ac_buddy_folio); 2156 /* store last allocated for subsequent stream allocation */ 2157 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2158 spin_lock(&sbi->s_md_lock); ··· 2675 int ret; 2676 2677 /* 2678 + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic 2679 * search to find large good chunks almost for free. If buddy 2680 * data is not ready, then this optimization makes no sense. But 2681 * we never skip the first block group in a flex_bg, since this ··· 2856 group = ac->ac_g_ex.fe_group; 2857 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2858 prefetch_grp = group; 2859 + nr = 0; 2860 2861 for (i = 0, new_cr = cr; i < ngroups; i++, 2862 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { ··· 3186 } 3187 3188 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 3189 { 3190 struct super_block *sb = pde_data(file_inode(seq->file)); 3191 unsigned long position; ··· 3440 } 3441 if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) 3442 sbi->s_mb_prefetch = ext4_get_groups_count(sb); 3443 + /* 3444 + * now many real IOs to prefetch within a single allocation at 3445 + * CR_POWER2_ALIGNED. 
Given CR_POWER2_ALIGNED is an CPU-related 3446 + * optimization we shouldn't try to load too many groups, at some point 3447 + * we should start to use what we've got in memory. 3448 * with an average random access time 5ms, it'd take a second to get 3449 * 200 groups (* N with flex_bg), so let's make this limit 4 3450 */ ··· 3884 /* No more items in the per group rb tree 3885 * balance refcounts from ext4_mb_free_metadata() 3886 */ 3887 + folio_put(e4b.bd_buddy_folio); 3888 + folio_put(e4b.bd_bitmap_folio); 3889 } 3890 ext4_unlock_group(sb, entry->efd_group); 3891 ext4_mb_unload_buddy(&e4b); ··· 5989 5990 ext4_mb_put_pa(ac, ac->ac_sb, pa); 5991 } 5992 + if (ac->ac_bitmap_folio) 5993 + folio_put(ac->ac_bitmap_folio); 5994 + if (ac->ac_buddy_folio) 5995 + folio_put(ac->ac_buddy_folio); 5996 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 5997 mutex_unlock(&ac->ac_lg->lg_mutex); 5998 ext4_mb_collect_stats(ac); ··· 6113 ext4_mb_mark_bb(sb, block, 1, true); 6114 ar->len = 1; 6115 6116 + *errp = 0; 6117 return block; 6118 } 6119 ··· 6307 struct rb_node *parent = NULL, *new_node; 6308 6309 BUG_ON(!ext4_handle_valid(handle)); 6310 + BUG_ON(e4b->bd_bitmap_folio == NULL); 6311 + BUG_ON(e4b->bd_buddy_folio == NULL); 6312 6313 new_node = &new_entry->efd_node; 6314 cluster = new_entry->efd_start_cluster; ··· 6319 * otherwise we'll refresh it from 6320 * on-disk bitmap and lose not-yet-available 6321 * blocks */ 6322 + folio_get(e4b->bd_buddy_folio); 6323 + folio_get(e4b->bd_bitmap_folio); 6324 } 6325 while (*n) { 6326 parent = *n;
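
Most of the churn in this file is the mechanical find_or_create_page()/find_get_page_flags() -> __filemap_get_folio() conversion. A minimal sketch of the pattern, with a placeholder cache inode and consumer (touch_cached_block() and use_block() are not names from the patch):

#include <linux/pagemap.h>

/*
 * __filemap_get_folio() with FGP_LOCK | FGP_ACCESSED | FGP_CREAT replaces
 * find_or_create_page() and returns an ERR_PTR() instead of NULL on
 * failure, so the NULL checks become IS_ERR() checks and
 * unlock_page()/put_page() become folio_unlock()/folio_put().
 */
static int touch_cached_block(struct inode *cache_inode, pgoff_t index,
			      gfp_t gfp)
{
	struct folio *folio;

	folio = __filemap_get_folio(cache_inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* was: if (!page) return -ENOMEM; */

	use_block(folio_address(folio));	/* hypothetical consumer */

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}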
+7 -7
fs/ext4/mballoc.h
··· 187 struct ext4_free_extent ac_f_ex; 188 189 /* 190 - * goal len can change in CR1.5, so save the original len. This is 191 - * used while adjusting the PA window and for accounting. 192 */ 193 ext4_grpblk_t ac_orig_goal_len; 194 195 __u32 ac_flags; /* allocation hints */ 196 __u16 ac_groups_scanned; 197 - __u16 ac_groups_linear_remaining; 198 __u16 ac_found; 199 __u16 ac_cX_found[EXT4_MB_NUM_CRS]; 200 __u16 ac_tail; ··· 204 __u8 ac_2order; /* if request is to allocate 2^N blocks and 205 * N > 0, the field stores N, otherwise 0 */ 206 __u8 ac_op; /* operation, for history only */ 207 - struct page *ac_bitmap_page; 208 - struct page *ac_buddy_page; 209 struct ext4_prealloc_space *ac_pa; 210 struct ext4_locality_group *ac_lg; 211 }; ··· 215 #define AC_STATUS_BREAK 3 216 217 struct ext4_buddy { 218 - struct page *bd_buddy_page; 219 void *bd_buddy; 220 - struct page *bd_bitmap_page; 221 void *bd_bitmap; 222 struct ext4_group_info *bd_info; 223 struct super_block *bd_sb;
··· 187 struct ext4_free_extent ac_f_ex; 188 189 /* 190 + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. 191 + * This is used while adjusting the PA window and for accounting. 192 */ 193 ext4_grpblk_t ac_orig_goal_len; 194 195 __u32 ac_flags; /* allocation hints */ 196 + __u32 ac_groups_linear_remaining; 197 __u16 ac_groups_scanned; 198 __u16 ac_found; 199 __u16 ac_cX_found[EXT4_MB_NUM_CRS]; 200 __u16 ac_tail; ··· 204 __u8 ac_2order; /* if request is to allocate 2^N blocks and 205 * N > 0, the field stores N, otherwise 0 */ 206 __u8 ac_op; /* operation, for history only */ 207 + struct folio *ac_bitmap_folio; 208 + struct folio *ac_buddy_folio; 209 struct ext4_prealloc_space *ac_pa; 210 struct ext4_locality_group *ac_lg; 211 }; ··· 215 #define AC_STATUS_BREAK 3 216 217 struct ext4_buddy { 218 + struct folio *bd_buddy_folio; 219 void *bd_buddy; 220 + struct folio *bd_bitmap_folio; 221 void *bd_bitmap; 222 struct ext4_group_info *bd_info; 223 struct super_block *bd_sb;
+1 -3
fs/ext4/move_extent.c
··· 199 continue; 200 if (!buffer_mapped(bh)) { 201 err = ext4_get_block(inode, block, bh, 0); 202 - if (err) { 203 - folio_set_error(folio); 204 return err; 205 - } 206 if (!buffer_mapped(bh)) { 207 folio_zero_range(folio, block_start, blocksize); 208 set_buffer_uptodate(bh);
··· 199 continue; 200 if (!buffer_mapped(bh)) { 201 err = ext4_get_block(inode, block, bh, 0); 202 + if (err) 203 return err; 204 if (!buffer_mapped(bh)) { 205 folio_zero_range(folio, block_start, blocksize); 206 set_buffer_uptodate(bh);
+1 -1
fs/ext4/namei.c
··· 2897 inode = ext4_new_inode_start_handle(idmap, dir, mode, 2898 NULL, 0, NULL, 2899 EXT4_HT_DIR, 2900 - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + 2901 4 + EXT4_XATTR_TRANS_BLOCKS); 2902 handle = ext4_journal_current_handle(); 2903 err = PTR_ERR(inode);
··· 2897 inode = ext4_new_inode_start_handle(idmap, dir, mode, 2898 NULL, 0, NULL, 2899 EXT4_HT_DIR, 2900 + EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 2901 4 + EXT4_XATTR_TRANS_BLOCKS); 2902 handle = ext4_journal_current_handle(); 2903 err = PTR_ERR(inode);
-3
fs/ext4/page-io.c
··· 117 118 if (bio->bi_status) { 119 int err = blk_status_to_errno(bio->bi_status); 120 - folio_set_error(folio); 121 mapping_set_error(folio->mapping, err); 122 } 123 bh = head = folio_buffers(folio); ··· 439 440 BUG_ON(!folio_test_locked(folio)); 441 BUG_ON(folio_test_writeback(folio)); 442 - 443 - folio_clear_error(folio); 444 445 /* 446 * Comments copied from block_write_full_folio:
··· 117 118 if (bio->bi_status) { 119 int err = blk_status_to_errno(bio->bi_status); 120 mapping_set_error(folio->mapping, err); 121 } 122 bh = head = folio_buffers(folio); ··· 440 441 BUG_ON(!folio_test_locked(folio)); 442 BUG_ON(folio_test_writeback(folio)); 443 444 /* 445 * Comments copied from block_write_full_folio:
-1
fs/ext4/readpage.c
··· 289 290 if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { 291 set_error_page: 292 - folio_set_error(folio); 293 folio_zero_segment(folio, 0, 294 folio_size(folio)); 295 folio_unlock(folio);
··· 289 290 if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { 291 set_error_page: 292 folio_zero_segment(folio, 0, 293 folio_size(folio)); 294 folio_unlock(folio);
+16 -20
fs/ext4/super.c
··· 2074 { 2075 struct ext4_fs_context *ctx = fc->fs_private; 2076 2077 - if (ctx->s_qf_names[qtype]) 2078 - kfree(ctx->s_qf_names[qtype]); 2079 2080 ctx->s_qf_names[qtype] = NULL; 2081 ctx->qname_spec |= 1 << qtype; ··· 2479 param.size = v_len; 2480 2481 ret = ext4_parse_param(fc, &param); 2482 - if (param.string) 2483 - kfree(param.string); 2484 if (ret < 0) 2485 return ret; 2486 } ··· 5336 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; 5337 #endif 5338 super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); 5339 5340 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 5341 mutex_init(&sbi->s_orphan_lock); ··· 5546 if (err) 5547 goto failed_mount6; 5548 5549 - err = ext4_register_sysfs(sb); 5550 - if (err) 5551 - goto failed_mount7; 5552 - 5553 err = ext4_init_orphan_info(sb); 5554 if (err) 5555 - goto failed_mount8; 5556 #ifdef CONFIG_QUOTA 5557 /* Enable quota usage during mount. */ 5558 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { 5559 err = ext4_enable_quotas(sb); 5560 if (err) 5561 - goto failed_mount9; 5562 } 5563 #endif /* CONFIG_QUOTA */ 5564 ··· 5580 ext4_msg(sb, KERN_INFO, "recovery complete"); 5581 err = ext4_mark_recovery_complete(sb, es); 5582 if (err) 5583 - goto failed_mount10; 5584 } 5585 5586 if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ··· 5597 atomic_set(&sbi->s_warning_count, 0); 5598 atomic_set(&sbi->s_msg_count, 0); 5599 5600 return 0; 5601 5602 - failed_mount10: 5603 ext4_quotas_off(sb, EXT4_MAXQUOTAS); 5604 - failed_mount9: __maybe_unused 5605 ext4_release_orphan_info(sb); 5606 - failed_mount8: 5607 - ext4_unregister_sysfs(sb); 5608 - kobject_put(&sbi->s_kobj); 5609 failed_mount7: 5610 ext4_unregister_li_request(sb); 5611 failed_mount6: ··· 6123 __ext4_update_tstamp(&es->s_first_error_time, 6124 &es->s_first_error_time_hi, 6125 sbi->s_first_error_time); 6126 - strncpy(es->s_first_error_func, sbi->s_first_error_func, 6127 - sizeof(es->s_first_error_func)); 6128 es->s_first_error_line = 6129 cpu_to_le32(sbi->s_first_error_line); 6130 es->s_first_error_ino = ··· 6137 __ext4_update_tstamp(&es->s_last_error_time, 6138 &es->s_last_error_time_hi, 6139 sbi->s_last_error_time); 6140 - strncpy(es->s_last_error_func, sbi->s_last_error_func, 6141 - sizeof(es->s_last_error_func)); 6142 es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); 6143 es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); 6144 es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
··· 2074 { 2075 struct ext4_fs_context *ctx = fc->fs_private; 2076 2077 + kfree(ctx->s_qf_names[qtype]); 2078 2079 ctx->s_qf_names[qtype] = NULL; 2080 ctx->qname_spec |= 1 << qtype; ··· 2480 param.size = v_len; 2481 2482 ret = ext4_parse_param(fc, &param); 2483 + kfree(param.string); 2484 if (ret < 0) 2485 return ret; 2486 } ··· 5338 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; 5339 #endif 5340 super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); 5341 + super_set_sysfs_name_bdev(sb); 5342 5343 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 5344 mutex_init(&sbi->s_orphan_lock); ··· 5547 if (err) 5548 goto failed_mount6; 5549 5550 err = ext4_init_orphan_info(sb); 5551 if (err) 5552 + goto failed_mount7; 5553 #ifdef CONFIG_QUOTA 5554 /* Enable quota usage during mount. */ 5555 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { 5556 err = ext4_enable_quotas(sb); 5557 if (err) 5558 + goto failed_mount8; 5559 } 5560 #endif /* CONFIG_QUOTA */ 5561 ··· 5585 ext4_msg(sb, KERN_INFO, "recovery complete"); 5586 err = ext4_mark_recovery_complete(sb, es); 5587 if (err) 5588 + goto failed_mount9; 5589 } 5590 5591 if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ··· 5602 atomic_set(&sbi->s_warning_count, 0); 5603 atomic_set(&sbi->s_msg_count, 0); 5604 5605 + /* Register sysfs after all initializations are complete. */ 5606 + err = ext4_register_sysfs(sb); 5607 + if (err) 5608 + goto failed_mount9; 5609 + 5610 return 0; 5611 5612 + failed_mount9: 5613 ext4_quotas_off(sb, EXT4_MAXQUOTAS); 5614 + failed_mount8: __maybe_unused 5615 ext4_release_orphan_info(sb); 5616 failed_mount7: 5617 ext4_unregister_li_request(sb); 5618 failed_mount6: ··· 6126 __ext4_update_tstamp(&es->s_first_error_time, 6127 &es->s_first_error_time_hi, 6128 sbi->s_first_error_time); 6129 + strtomem_pad(es->s_first_error_func, 6130 + sbi->s_first_error_func, 0); 6131 es->s_first_error_line = 6132 cpu_to_le32(sbi->s_first_error_line); 6133 es->s_first_error_ino = ··· 6140 __ext4_update_tstamp(&es->s_last_error_time, 6141 &es->s_last_error_time_hi, 6142 sbi->s_last_error_time); 6143 + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); 6144 es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); 6145 es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); 6146 es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
+108 -72
fs/ext4/sysfs.c
··· 29 attr_trigger_test_error, 30 attr_first_error_time, 31 attr_last_error_time, 32 attr_feature, 33 attr_pointer_ui, 34 attr_pointer_ul, 35 attr_pointer_u64, ··· 107 int ret; 108 109 ret = kstrtoull(skip_spaces(buf), 0, &val); 110 - if (ret || val >= clusters) 111 return -EINVAL; 112 113 atomic64_set(&sbi->s_resv_clusters, val); ··· 181 #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ 182 EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) 183 184 #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ 185 EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) 186 ··· 213 214 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, 215 ext4_sb_info, s_inode_readahead_blks); 216 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 217 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); 218 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); 219 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); 220 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 221 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 222 - EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 223 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); 224 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 225 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); 226 - EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); 227 - EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); 228 - EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); 229 - EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); 230 - EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 231 - EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 232 - EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); 233 #ifdef CONFIG_EXT4_DEBUG 234 EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); 235 #endif ··· 374 #define print_tstamp(buf, es, tstamp) \ 375 __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) 376 377 static ssize_t ext4_attr_show(struct kobject *kobj, 378 struct attribute *attr, char *buf) 379 { 380 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, 381 s_kobj); 382 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); 383 - void *ptr = calc_ptr(a, sbi); 384 385 switch (a->attr_id) { 386 case attr_delayed_allocation_blocks: ··· 431 return sysfs_emit(buf, "%llu\n", 432 (unsigned long long) 433 percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); 434 - case attr_inode_readahead: 435 - case attr_pointer_ui: 436 - if (!ptr) 437 - return 0; 438 - if (a->attr_ptr == ptr_ext4_super_block_offset) 439 - return sysfs_emit(buf, "%u\n", 440 - le32_to_cpup(ptr)); 441 - else 442 - return sysfs_emit(buf, "%u\n", 443 - *((unsigned int *) ptr)); 444 - case attr_pointer_ul: 445 - if (!ptr) 446 - return 0; 447 - return sysfs_emit(buf, "%lu\n", 448 - *((unsigned long *) ptr)); 449 - case attr_pointer_u8: 450 - if (!ptr) 451 - return 0; 452 - return sysfs_emit(buf, "%u\n", 453 - *((unsigned char *) ptr)); 454 - case attr_pointer_u64: 455 - if (!ptr) 456 - return 0; 457 - if (a->attr_ptr == ptr_ext4_super_block_offset) 458 - return sysfs_emit(buf, "%llu\n", 459 - le64_to_cpup(ptr)); 460 - else 461 - return sysfs_emit(buf, "%llu\n", 462 - *((unsigned long long *) ptr)); 463 - case attr_pointer_string: 464 - if (!ptr) 465 - return 0; 466 - return sysfs_emit(buf, "%.*s\n", a->attr_size, 467 - (char *) 
ptr); 468 - case attr_pointer_atomic: 469 - if (!ptr) 470 - return 0; 471 - return sysfs_emit(buf, "%d\n", 472 - atomic_read((atomic_t *) ptr)); 473 case attr_feature: 474 return sysfs_emit(buf, "supported\n"); 475 case attr_first_error_time: ··· 439 return print_tstamp(buf, sbi->s_es, s_last_error_time); 440 case attr_journal_task: 441 return journal_task_show(sbi, buf); 442 } 443 444 return 0; 445 } 446 ··· 507 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, 508 s_kobj); 509 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); 510 - void *ptr = calc_ptr(a, sbi); 511 - unsigned long t; 512 - int ret; 513 514 switch (a->attr_id) { 515 case attr_reserved_clusters: 516 return reserved_clusters_store(sbi, buf, len); 517 - case attr_pointer_ui: 518 - if (!ptr) 519 - return 0; 520 - ret = kstrtoul(skip_spaces(buf), 0, &t); 521 - if (ret) 522 - return ret; 523 - if (a->attr_ptr == ptr_ext4_super_block_offset) 524 - *((__le32 *) ptr) = cpu_to_le32(t); 525 - else 526 - *((unsigned int *) ptr) = t; 527 - return len; 528 - case attr_pointer_ul: 529 - if (!ptr) 530 - return 0; 531 - ret = kstrtoul(skip_spaces(buf), 0, &t); 532 - if (ret) 533 - return ret; 534 - *((unsigned long *) ptr) = t; 535 - return len; 536 case attr_inode_readahead: 537 return inode_readahead_blks_store(sbi, buf, len); 538 case attr_trigger_test_error: 539 return trigger_test_error(sbi, buf, len); 540 } 541 - return 0; 542 } 543 544 static void ext4_sb_release(struct kobject *kobj)
··· 29 attr_trigger_test_error, 30 attr_first_error_time, 31 attr_last_error_time, 32 + attr_clusters_in_group, 33 + attr_mb_order, 34 attr_feature, 35 + attr_pointer_pi, 36 attr_pointer_ui, 37 attr_pointer_ul, 38 attr_pointer_u64, ··· 104 int ret; 105 106 ret = kstrtoull(skip_spaces(buf), 0, &val); 107 + if (ret || val >= clusters || (s64)val < 0) 108 return -EINVAL; 109 110 atomic64_set(&sbi->s_resv_clusters, val); ··· 178 #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ 179 EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) 180 181 + #define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ 182 + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) 183 + 184 #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ 185 EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) 186 ··· 207 208 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, 209 ext4_sb_info, s_inode_readahead_blks); 210 + EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, 211 + ext4_sb_info, s_mb_group_prealloc); 212 + EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, 213 + ext4_sb_info, s_mb_best_avail_max_trim_order); 214 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 215 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); 216 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); 217 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); 218 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 219 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 220 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); 221 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 222 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); 223 + EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); 224 + EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); 225 + EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); 226 + EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); 227 + EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 228 + EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 229 #ifdef CONFIG_EXT4_DEBUG 230 EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); 231 #endif ··· 366 #define print_tstamp(buf, es, tstamp) \ 367 __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) 368 369 + static ssize_t ext4_generic_attr_show(struct ext4_attr *a, 370 + struct ext4_sb_info *sbi, char *buf) 371 + { 372 + void *ptr = calc_ptr(a, sbi); 373 + 374 + if (!ptr) 375 + return 0; 376 + 377 + switch (a->attr_id) { 378 + case attr_inode_readahead: 379 + case attr_clusters_in_group: 380 + case attr_mb_order: 381 + case attr_pointer_pi: 382 + case attr_pointer_ui: 383 + if (a->attr_ptr == ptr_ext4_super_block_offset) 384 + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); 385 + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); 386 + case attr_pointer_ul: 387 + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); 388 + case attr_pointer_u8: 389 + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); 390 + case attr_pointer_u64: 391 + if (a->attr_ptr == ptr_ext4_super_block_offset) 392 + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); 393 + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); 394 + case attr_pointer_string: 395 + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); 396 + case attr_pointer_atomic: 397 + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); 398 
+ } 399 + return 0; 400 + } 401 + 402 static ssize_t ext4_attr_show(struct kobject *kobj, 403 struct attribute *attr, char *buf) 404 { 405 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, 406 s_kobj); 407 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); 408 409 switch (a->attr_id) { 410 case attr_delayed_allocation_blocks: ··· 391 return sysfs_emit(buf, "%llu\n", 392 (unsigned long long) 393 percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); 394 case attr_feature: 395 return sysfs_emit(buf, "supported\n"); 396 case attr_first_error_time: ··· 438 return print_tstamp(buf, sbi->s_es, s_last_error_time); 439 case attr_journal_task: 440 return journal_task_show(sbi, buf); 441 + default: 442 + return ext4_generic_attr_show(a, sbi, buf); 443 } 444 + } 445 446 + static ssize_t ext4_generic_attr_store(struct ext4_attr *a, 447 + struct ext4_sb_info *sbi, 448 + const char *buf, size_t len) 449 + { 450 + int ret; 451 + unsigned int t; 452 + unsigned long lt; 453 + void *ptr = calc_ptr(a, sbi); 454 + 455 + if (!ptr) 456 + return 0; 457 + 458 + switch (a->attr_id) { 459 + case attr_pointer_pi: 460 + ret = kstrtouint(skip_spaces(buf), 0, &t); 461 + if (ret) 462 + return ret; 463 + if ((int)t < 0) 464 + return -EINVAL; 465 + *((unsigned int *) ptr) = t; 466 + return len; 467 + case attr_pointer_ui: 468 + ret = kstrtouint(skip_spaces(buf), 0, &t); 469 + if (ret) 470 + return ret; 471 + if (a->attr_ptr == ptr_ext4_super_block_offset) 472 + *((__le32 *) ptr) = cpu_to_le32(t); 473 + else 474 + *((unsigned int *) ptr) = t; 475 + return len; 476 + case attr_mb_order: 477 + ret = kstrtouint(skip_spaces(buf), 0, &t); 478 + if (ret) 479 + return ret; 480 + if (t > 64) 481 + return -EINVAL; 482 + *((unsigned int *) ptr) = t; 483 + return len; 484 + case attr_clusters_in_group: 485 + ret = kstrtouint(skip_spaces(buf), 0, &t); 486 + if (ret) 487 + return ret; 488 + if (t > sbi->s_clusters_per_group) 489 + return -EINVAL; 490 + *((unsigned int *) ptr) = t; 491 + return len; 492 + case attr_pointer_ul: 493 + ret = kstrtoul(skip_spaces(buf), 0, &lt); 494 + if (ret) 495 + return ret; 496 + *((unsigned long *) ptr) = lt; 497 + return len; 498 + } 499 return 0; 500 } 501 ··· 450 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, 451 s_kobj); 452 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); 453 454 switch (a->attr_id) { 455 case attr_reserved_clusters: 456 return reserved_clusters_store(sbi, buf, len); 457 case attr_inode_readahead: 458 return inode_readahead_blks_store(sbi, buf, len); 459 case attr_trigger_test_error: 460 return trigger_test_error(sbi, buf, len); 461 + default: 462 + return ext4_generic_attr_store(a, sbi, buf, len); 463 } 464 } 465 466 static void ext4_sb_release(struct kobject *kobj)
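The sysfs changes above fold the per-attribute bounds checks into one generic store helper: attr_pointer_pi rejects values that would go negative when read back as an int, attr_mb_order caps input at 64, and attr_clusters_in_group refuses anything above s_clusters_per_group. Below is a minimal userspace sketch of that parse-then-range-check pattern; the DEMO_* bounds and demo_store_uint() are illustrative stand-ins for the real ext4_sb_info fields, not kernel code.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative bounds; the real limits live in struct ext4_sb_info. */
#define DEMO_MAX_TRIM_ORDER     64
#define DEMO_CLUSTERS_PER_GROUP 32768

/* Mimics the "parse, then range-check per attribute kind" flow of
 * ext4_generic_attr_store(); kstrtouint() is approximated with strtoul(). */
static int demo_store_uint(const char *buf, unsigned int max, unsigned int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(buf, &end, 0);
        if (errno || end == buf || val > UINT_MAX)
                return -EINVAL;         /* parse failure, like kstrtouint() */
        if (val > max)
                return -EINVAL;         /* out of range for this attribute */
        *out = (unsigned int)val;
        return 0;
}

int main(void)
{
        unsigned int order, group;

        /* Accepted: within the [0, 64] cap used for attr_mb_order. */
        printf("order=64    -> %d\n", demo_store_uint("64", DEMO_MAX_TRIM_ORDER, &order));
        /* Rejected with -EINVAL, matching the attr_mb_order case. */
        printf("order=65    -> %d\n", demo_store_uint("65", DEMO_MAX_TRIM_ORDER, &order));
        /* Rejected: attr_clusters_in_group may not exceed clusters per group. */
        printf("group=40000 -> %d\n",
               demo_store_uint("40000", DEMO_CLUSTERS_PER_GROUP, &group));
        return 0;
}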
+78 -69
fs/ext4/xattr.c
··· 1619 static int ext4_xattr_set_entry(struct ext4_xattr_info *i, 1620 struct ext4_xattr_search *s, 1621 handle_t *handle, struct inode *inode, 1622 bool is_block) 1623 { 1624 struct ext4_xattr_entry *last, *next; ··· 1627 size_t min_offs = s->end - s->base, name_len = strlen(i->name); 1628 int in_inode = i->in_inode; 1629 struct inode *old_ea_inode = NULL; 1630 - struct inode *new_ea_inode = NULL; 1631 size_t old_size, new_size; 1632 int ret; 1633 ··· 1711 old_ea_inode = NULL; 1712 goto out; 1713 } 1714 - } 1715 - if (i->value && in_inode) { 1716 - WARN_ON_ONCE(!i->value_len); 1717 1718 - new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, 1719 - i->value, i->value_len); 1720 - if (IS_ERR(new_ea_inode)) { 1721 - ret = PTR_ERR(new_ea_inode); 1722 - new_ea_inode = NULL; 1723 - goto out; 1724 - } 1725 - } 1726 - 1727 - if (old_ea_inode) { 1728 /* We are ready to release ref count on the old_ea_inode. */ 1729 ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); 1730 - if (ret) { 1731 - /* Release newly required ref count on new_ea_inode. */ 1732 - if (new_ea_inode) { 1733 - int err; 1734 - 1735 - err = ext4_xattr_inode_dec_ref(handle, 1736 - new_ea_inode); 1737 - if (err) 1738 - ext4_warning_inode(new_ea_inode, 1739 - "dec ref new_ea_inode err=%d", 1740 - err); 1741 - ext4_xattr_inode_free_quota(inode, new_ea_inode, 1742 - i->value_len); 1743 - } 1744 goto out; 1745 - } 1746 1747 ext4_xattr_inode_free_quota(inode, old_ea_inode, 1748 le32_to_cpu(here->e_value_size)); ··· 1839 ret = 0; 1840 out: 1841 iput(old_ea_inode); 1842 - iput(new_ea_inode); 1843 return ret; 1844 } 1845 ··· 1901 size_t old_ea_inode_quota = 0; 1902 unsigned int ea_ino; 1903 1904 - 1905 #define header(x) ((struct ext4_xattr_header *)(x)) 1906 1907 if (s->base) { 1908 int offset = (char *)s->here - bs->bh->b_data; ··· 1924 EXT4_JTR_NONE); 1925 if (error) 1926 goto cleanup; 1927 lock_buffer(bs->bh); 1928 1929 if (header(s->base)->h_refcount == cpu_to_le32(1)) { ··· 1951 } 1952 ea_bdebug(bs->bh, "modifying in-place"); 1953 error = ext4_xattr_set_entry(i, s, handle, inode, 1954 - true /* is_block */); 1955 ext4_xattr_block_csum_set(inode, bs->bh); 1956 unlock_buffer(bs->bh); 1957 if (error == -EFSCORRUPTED) ··· 2019 s->end = s->base + sb->s_blocksize; 2020 } 2021 2022 - error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); 2023 if (error == -EFSCORRUPTED) 2024 goto bad_block; 2025 if (error) 2026 goto cleanup; 2027 2028 - if (i->value && s->here->e_value_inum) { 2029 - /* 2030 - * A ref count on ea_inode has been taken as part of the call to 2031 - * ext4_xattr_set_entry() above. We would like to drop this 2032 - * extra ref but we have to wait until the xattr block is 2033 - * initialized and has its own ref count on the ea_inode. 2034 - */ 2035 - ea_ino = le32_to_cpu(s->here->e_value_inum); 2036 - error = ext4_xattr_inode_iget(inode, ea_ino, 2037 - le32_to_cpu(s->here->e_hash), 2038 - &ea_inode); 2039 - if (error) { 2040 - ea_inode = NULL; 2041 - goto cleanup; 2042 - } 2043 - } 2044 - 2045 inserted: 2046 if (!IS_LAST_ENTRY(s->first)) { 2047 - new_bh = ext4_xattr_block_cache_find(inode, header(s->base), 2048 - &ce); 2049 if (new_bh) { 2050 /* We found an identical block in the cache. 
*/ 2051 if (new_bh == bs->bh) ··· 2132 ENTRY(header(s->base)+1)); 2133 if (error) 2134 goto getblk_failed; 2135 2136 lock_buffer(new_bh); 2137 error = ext4_journal_get_create_access(handle, sb, ··· 2183 2184 cleanup: 2185 if (ea_inode) { 2186 - int error2; 2187 2188 - error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); 2189 - if (error2) 2190 - ext4_warning_inode(ea_inode, "dec ref error=%d", 2191 - error2); 2192 - 2193 - /* If there was an error, revert the quota charge. */ 2194 - if (error) 2195 ext4_xattr_inode_free_quota(inode, ea_inode, 2196 i_size_read(ea_inode)); 2197 iput(ea_inode); 2198 } 2199 if (ce) ··· 2250 { 2251 struct ext4_xattr_ibody_header *header; 2252 struct ext4_xattr_search *s = &is->s; 2253 int error; 2254 2255 if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) 2256 return -ENOSPC; 2257 2258 - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); 2259 - if (error) 2260 return error; 2261 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 2262 if (!IS_LAST_ENTRY(s->first)) { 2263 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ··· 2290 header->h_magic = cpu_to_le32(0); 2291 ext4_clear_inode_state(inode, EXT4_STATE_XATTR); 2292 } 2293 return 0; 2294 } 2295 ··· 3099 * 3100 * Find an identical extended attribute block. 3101 * 3102 - * Returns a pointer to the block found, or NULL if such a block was 3103 - * not found or an error occurred. 3104 */ 3105 static struct buffer_head * 3106 ext4_xattr_block_cache_find(struct inode *inode, ··· 3122 3123 bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); 3124 if (IS_ERR(bh)) { 3125 - if (PTR_ERR(bh) == -ENOMEM) 3126 - return NULL; 3127 - bh = NULL; 3128 - EXT4_ERROR_INODE(inode, "block %lu read error", 3129 - (unsigned long)ce->e_value); 3130 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { 3131 *pce = ce; 3132 return bh;
··· 1619 static int ext4_xattr_set_entry(struct ext4_xattr_info *i, 1620 struct ext4_xattr_search *s, 1621 handle_t *handle, struct inode *inode, 1622 + struct inode *new_ea_inode, 1623 bool is_block) 1624 { 1625 struct ext4_xattr_entry *last, *next; ··· 1626 size_t min_offs = s->end - s->base, name_len = strlen(i->name); 1627 int in_inode = i->in_inode; 1628 struct inode *old_ea_inode = NULL; 1629 size_t old_size, new_size; 1630 int ret; 1631 ··· 1711 old_ea_inode = NULL; 1712 goto out; 1713 } 1714 1715 /* We are ready to release ref count on the old_ea_inode. */ 1716 ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); 1717 + if (ret) 1718 goto out; 1719 1720 ext4_xattr_inode_free_quota(inode, old_ea_inode, 1721 le32_to_cpu(here->e_value_size)); ··· 1866 ret = 0; 1867 out: 1868 iput(old_ea_inode); 1869 return ret; 1870 } 1871 ··· 1929 size_t old_ea_inode_quota = 0; 1930 unsigned int ea_ino; 1931 1932 #define header(x) ((struct ext4_xattr_header *)(x)) 1933 + 1934 + /* If we need EA inode, prepare it before locking the buffer */ 1935 + if (i->value && i->in_inode) { 1936 + WARN_ON_ONCE(!i->value_len); 1937 + 1938 + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, 1939 + i->value, i->value_len); 1940 + if (IS_ERR(ea_inode)) { 1941 + error = PTR_ERR(ea_inode); 1942 + ea_inode = NULL; 1943 + goto cleanup; 1944 + } 1945 + } 1946 1947 if (s->base) { 1948 int offset = (char *)s->here - bs->bh->b_data; ··· 1940 EXT4_JTR_NONE); 1941 if (error) 1942 goto cleanup; 1943 + 1944 lock_buffer(bs->bh); 1945 1946 if (header(s->base)->h_refcount == cpu_to_le32(1)) { ··· 1966 } 1967 ea_bdebug(bs->bh, "modifying in-place"); 1968 error = ext4_xattr_set_entry(i, s, handle, inode, 1969 + ea_inode, true /* is_block */); 1970 ext4_xattr_block_csum_set(inode, bs->bh); 1971 unlock_buffer(bs->bh); 1972 if (error == -EFSCORRUPTED) ··· 2034 s->end = s->base + sb->s_blocksize; 2035 } 2036 2037 + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, 2038 + true /* is_block */); 2039 if (error == -EFSCORRUPTED) 2040 goto bad_block; 2041 if (error) 2042 goto cleanup; 2043 2044 inserted: 2045 if (!IS_LAST_ENTRY(s->first)) { 2046 + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); 2047 + if (IS_ERR(new_bh)) { 2048 + error = PTR_ERR(new_bh); 2049 + new_bh = NULL; 2050 + goto cleanup; 2051 + } 2052 + 2053 if (new_bh) { 2054 /* We found an identical block in the cache. */ 2055 if (new_bh == bs->bh) ··· 2158 ENTRY(header(s->base)+1)); 2159 if (error) 2160 goto getblk_failed; 2161 + if (ea_inode) { 2162 + /* Drop the extra ref on ea_inode. 
*/ 2163 + error = ext4_xattr_inode_dec_ref(handle, 2164 + ea_inode); 2165 + if (error) 2166 + ext4_warning_inode(ea_inode, 2167 + "dec ref error=%d", 2168 + error); 2169 + iput(ea_inode); 2170 + ea_inode = NULL; 2171 + } 2172 2173 lock_buffer(new_bh); 2174 error = ext4_journal_get_create_access(handle, sb, ··· 2198 2199 cleanup: 2200 if (ea_inode) { 2201 + if (error) { 2202 + int error2; 2203 2204 + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); 2205 + if (error2) 2206 + ext4_warning_inode(ea_inode, "dec ref error=%d", 2207 + error2); 2208 ext4_xattr_inode_free_quota(inode, ea_inode, 2209 i_size_read(ea_inode)); 2210 + } 2211 iput(ea_inode); 2212 } 2213 if (ce) ··· 2266 { 2267 struct ext4_xattr_ibody_header *header; 2268 struct ext4_xattr_search *s = &is->s; 2269 + struct inode *ea_inode = NULL; 2270 int error; 2271 2272 if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) 2273 return -ENOSPC; 2274 2275 + /* If we need EA inode, prepare it before locking the buffer */ 2276 + if (i->value && i->in_inode) { 2277 + WARN_ON_ONCE(!i->value_len); 2278 + 2279 + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, 2280 + i->value, i->value_len); 2281 + if (IS_ERR(ea_inode)) 2282 + return PTR_ERR(ea_inode); 2283 + } 2284 + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, 2285 + false /* is_block */); 2286 + if (error) { 2287 + if (ea_inode) { 2288 + int error2; 2289 + 2290 + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); 2291 + if (error2) 2292 + ext4_warning_inode(ea_inode, "dec ref error=%d", 2293 + error2); 2294 + 2295 + ext4_xattr_inode_free_quota(inode, ea_inode, 2296 + i_size_read(ea_inode)); 2297 + iput(ea_inode); 2298 + } 2299 return error; 2300 + } 2301 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 2302 if (!IS_LAST_ENTRY(s->first)) { 2303 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ··· 2282 header->h_magic = cpu_to_le32(0); 2283 ext4_clear_inode_state(inode, EXT4_STATE_XATTR); 2284 } 2285 + iput(ea_inode); 2286 return 0; 2287 } 2288 ··· 3090 * 3091 * Find an identical extended attribute block. 3092 * 3093 + * Returns a pointer to the block found, or NULL if such a block was not 3094 + * found, or an error pointer if an error occurred while reading ea block. 3095 */ 3096 static struct buffer_head * 3097 ext4_xattr_block_cache_find(struct inode *inode, ··· 3113 3114 bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); 3115 if (IS_ERR(bh)) { 3116 + if (PTR_ERR(bh) != -ENOMEM) 3117 + EXT4_ERROR_INODE(inode, "block %lu read error", 3118 + (unsigned long)ce->e_value); 3119 + mb_cache_entry_put(ea_block_cache, ce); 3120 + return bh; 3121 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { 3122 *pce = ce; 3123 return bh;
+13 -11
fs/jbd2/checkpoint.c
··· 337 338 /* Checkpoint list management */ 339 340 - enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; 341 - 342 /* 343 * journal_shrink_one_cp_list 344 * ··· 348 * Called with j_list_lock held. 349 */ 350 static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, 351 - enum shrink_type type, 352 bool *released) 353 { 354 struct journal_head *last_jh; ··· 365 jh = next_jh; 366 next_jh = jh->b_cpnext; 367 368 - if (type == SHRINK_DESTROY) { 369 ret = __jbd2_journal_remove_checkpoint(jh); 370 } else { 371 ret = jbd2_journal_try_remove_checkpoint(jh); 372 if (ret < 0) { 373 - if (type == SHRINK_BUSY_SKIP) 374 continue; 375 break; 376 } ··· 437 tid = transaction->t_tid; 438 439 freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, 440 - SHRINK_BUSY_SKIP, &released); 441 nr_freed += freed; 442 (*nr_to_scan) -= min(*nr_to_scan, freed); 443 if (*nr_to_scan == 0) ··· 470 * journal_clean_checkpoint_list 471 * 472 * Find all the written-back checkpoint buffers in the journal and release them. 473 - * If 'destroy' is set, release all buffers unconditionally. 474 * 475 * Called with j_list_lock held. 476 */ 477 - void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) 478 { 479 transaction_t *transaction, *last_transaction, *next_transaction; 480 - enum shrink_type type; 481 bool released; 482 483 transaction = journal->j_checkpoint_transactions; 484 if (!transaction) 485 return; 486 487 - type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; 488 last_transaction = transaction->t_cpprev; 489 next_transaction = transaction; 490 do { ··· 529 spin_unlock(&journal->j_list_lock); 530 break; 531 } 532 - __jbd2_journal_clean_checkpoint_list(journal, true); 533 spin_unlock(&journal->j_list_lock); 534 cond_resched(); 535 }
··· 337 338 /* Checkpoint list management */ 339 340 /* 341 * journal_shrink_one_cp_list 342 * ··· 350 * Called with j_list_lock held. 351 */ 352 static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, 353 + enum jbd2_shrink_type type, 354 bool *released) 355 { 356 struct journal_head *last_jh; ··· 367 jh = next_jh; 368 next_jh = jh->b_cpnext; 369 370 + if (type == JBD2_SHRINK_DESTROY) { 371 ret = __jbd2_journal_remove_checkpoint(jh); 372 } else { 373 ret = jbd2_journal_try_remove_checkpoint(jh); 374 if (ret < 0) { 375 + if (type == JBD2_SHRINK_BUSY_SKIP) 376 continue; 377 break; 378 } ··· 439 tid = transaction->t_tid; 440 441 freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, 442 + JBD2_SHRINK_BUSY_SKIP, &released); 443 nr_freed += freed; 444 (*nr_to_scan) -= min(*nr_to_scan, freed); 445 if (*nr_to_scan == 0) ··· 472 * journal_clean_checkpoint_list 473 * 474 * Find all the written-back checkpoint buffers in the journal and release them. 475 + * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If 476 + * 'type' is JBD2_SHRINK_BUSY_STOP, stop releasing buffers as soon as a busy 477 + * buffer is encountered. Do not pass JBD2_SHRINK_BUSY_SKIP to this function; 478 + * skipping busy buffers here would only waste CPU cycles scanning the list. 479 * 480 * Called with j_list_lock held. 481 */ 482 + void __jbd2_journal_clean_checkpoint_list(journal_t *journal, 483 + enum jbd2_shrink_type type) 484 { 485 transaction_t *transaction, *last_transaction, *next_transaction; 486 bool released; 487 + 488 + WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); 489 490 transaction = journal->j_checkpoint_transactions; 491 if (!transaction) 492 return; 493 494 last_transaction = transaction->t_cpprev; 495 next_transaction = transaction; 496 do { ··· 527 spin_unlock(&journal->j_list_lock); 528 break; 529 } 530 + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); 531 spin_unlock(&journal->j_list_lock); 532 cond_resched(); 533 }
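The checkpoint changes above replace the file-local shrink_type enum (and the old bool 'destroy' flag) with the shared jbd2_shrink_type, and warn if JBD2_SHRINK_BUSY_SKIP ever reaches __jbd2_journal_clean_checkpoint_list(). A small sketch of why the enum reads better than a bare bool at the call sites; the demo_* names are illustrative, and assert() stands in for the kernel's WARN_ON_ONCE().

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the jbd2_shrink_type values added in include/linux/jbd2.h. */
enum demo_shrink_type { DEMO_SHRINK_DESTROY, DEMO_SHRINK_BUSY_STOP, DEMO_SHRINK_BUSY_SKIP };

/* Before: a bare bool at the call site ("clean_list(journal, true)") says
 * nothing about what 'true' means. */
static void clean_list_bool(bool destroy)
{
        printf("bool : %s\n", destroy ? "destroy all" : "stop at busy buffer");
}

/* After: the enum names the policy, and a mode that makes no sense for this
 * helper can be rejected explicitly. */
static void clean_list_enum(enum demo_shrink_type type)
{
        assert(type != DEMO_SHRINK_BUSY_SKIP);  /* skipping busy buffers only wastes scans here */
        printf("enum : %s\n",
               type == DEMO_SHRINK_DESTROY ? "destroy all" : "stop at busy buffer");
}

int main(void)
{
        clean_list_bool(true);                  /* unclear at the call site */
        clean_list_enum(DEMO_SHRINK_DESTROY);   /* self-documenting */
        clean_list_enum(DEMO_SHRINK_BUSY_STOP);
        return 0;
}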
+1 -2
fs/jbd2/commit.c
··· 501 * frees some memory 502 */ 503 spin_lock(&journal->j_list_lock); 504 - __jbd2_journal_clean_checkpoint_list(journal, false); 505 spin_unlock(&journal->j_list_lock); 506 507 jbd2_debug(3, "JBD2: commit phase 1\n"); ··· 571 J_ASSERT(commit_transaction->t_nr_buffers <= 572 atomic_read(&commit_transaction->t_outstanding_credits)); 573 574 - err = 0; 575 bufs = 0; 576 descriptor = NULL; 577 while (commit_transaction->t_buffers) {
··· 501 * frees some memory 502 */ 503 spin_lock(&journal->j_list_lock); 504 + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); 505 spin_unlock(&journal->j_list_lock); 506 507 jbd2_debug(3, "JBD2: commit phase 1\n"); ··· 571 J_ASSERT(commit_transaction->t_nr_buffers <= 572 atomic_read(&commit_transaction->t_outstanding_credits)); 573 574 bufs = 0; 575 descriptor = NULL; 576 while (commit_transaction->t_buffers) {
+3 -1
include/linux/jbd2.h
··· 1434 extern void jbd2_journal_commit_transaction(journal_t *); 1435 1436 /* Checkpoint list management */ 1437 - void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); 1438 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); 1439 int __jbd2_journal_remove_checkpoint(struct journal_head *); 1440 int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
··· 1434 extern void jbd2_journal_commit_transaction(journal_t *); 1435 1436 /* Checkpoint list management */ 1437 + enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; 1438 + 1439 + void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); 1440 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); 1441 int __jbd2_journal_remove_checkpoint(struct journal_head *); 1442 int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);