Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"In addition to bug fixes and cleanups, there are two new features for
ext4 in 5.14:

- Allow applications to poll on changes to
/sys/fs/ext4/*/errors_count

- Add the ioctl EXT4_IOC_CHECKPOINT which allows the journal to be
checkpointed, truncated and discarded or zero'ed"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits)
jbd2: export jbd2_journal_[un]register_shrinker()
ext4: notify sysfs on errors_count value change
fs: remove bdev_try_to_free_page callback
ext4: remove bdev_try_to_free_page() callback
jbd2: simplify journal_clean_one_cp_list()
jbd2,ext4: add a shrinker to release checkpointed buffers
jbd2: remove redundant buffer io error checks
jbd2: don't abort the journal when freeing buffers
jbd2: ensure abort the journal if detect IO error when writing original buffer back
jbd2: remove the out label in __jbd2_journal_remove_checkpoint()
ext4: no need to verify new add extent block
jbd2: clean up misleading comments for jbd2_fc_release_bufs
ext4: add check to prevent attempting to resize an fs with sparse_super2
ext4: consolidate checks for resize of bigalloc into ext4_resize_begin
ext4: remove duplicate definition of ext4_xattr_ibody_inline_set()
ext4: fsmap: fix the block/inode bitmap comment
ext4: fix comment for s_hash_unsigned
ext4: use local variable ei instead of EXT4_I() macro
ext4: fix avefreec in find_group_orlov
ext4: correct the cache_nr in tracepoint ext4_es_shrink_exit
...

+720 -215
+31 -8
Documentation/filesystems/ext4/journal.rst
··· 4 4 -------------- 5 5 6 6 Introduced in ext3, the ext4 filesystem employs a journal to protect the 7 - filesystem against corruption in the case of a system crash. A small 8 - continuous region of disk (default 128MiB) is reserved inside the 9 - filesystem as a place to land “important” data writes on-disk as quickly 10 - as possible. Once the important data transaction is fully written to the 11 - disk and flushed from the disk write cache, a record of the data being 12 - committed is also written to the journal. At some later point in time, 13 - the journal code writes the transactions to their final locations on 14 - disk (this could involve a lot of seeking or a lot of small 7 + filesystem against metadata inconsistencies in the case of a system crash. Up 8 + to 10,240,000 file system blocks (see man mke2fs(8) for more details on journal 9 + size limits) can be reserved inside the filesystem as a place to land 10 + “important” data writes on-disk as quickly as possible. Once the important 11 + data transaction is fully written to the disk and flushed from the disk write 12 + cache, a record of the data being committed is also written to the journal. At 13 + some later point in time, the journal code writes the transactions to their 14 + final locations on disk (this could involve a lot of seeking or a lot of small 15 15 read-write-erases) before erasing the commit record. Should the system 16 16 crash during the second slow write, the journal can be replayed all the 17 17 way to the latest commit record, guaranteeing the atomicity of whatever ··· 731 731 replay of last inode 11 tag. Thus, by converting a non-idempotent procedure 732 732 into a series of idempotent outcomes, fast commits ensured idempotence during 733 733 the replay. 734 + 735 + Journal Checkpoint 736 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 737 + 738 + Checkpointing the journal ensures all transactions and their associated buffers 739 + are submitted to the disk. 
In-progress transactions are waited upon and included 740 + in the checkpoint. Checkpointing is used internally during critical updates to 741 + the filesystem including journal recovery, filesystem resizing, and freeing of 742 + the journal_t structure. 743 + 744 + A journal checkpoint can be triggered from userspace via the ioctl 745 + EXT4_IOC_CHECKPOINT. This ioctl takes a single, u64 argument for flags. 746 + Currently, three flags are supported. First, EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 747 + can be used to verify input to the ioctl. It returns error if there is any 748 + invalid input, otherwise it returns success without performing 749 + any checkpointing. This can be used to check whether the ioctl exists on a 750 + system and to verify there are no issues with arguments or flags. The 751 + other two flags are EXT4_IOC_CHECKPOINT_FLAG_DISCARD and 752 + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT. These flags cause the journal blocks to be 753 + discarded or zero-filled, respectively, after the journal checkpoint is 754 + complete. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 755 + cannot both be set. The ioctl may be useful when snapshotting a system or for 756 + complying with content deletion SLOs.
-15
fs/block_dev.c
··· 1673 1673 } 1674 1674 EXPORT_SYMBOL_GPL(blkdev_read_iter); 1675 1675 1676 - /* 1677 - * Try to release a page associated with block device when the system 1678 - * is under memory pressure. 1679 - */ 1680 - static int blkdev_releasepage(struct page *page, gfp_t wait) 1681 - { 1682 - struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1683 - 1684 - if (super && super->s_op->bdev_try_to_free_page) 1685 - return super->s_op->bdev_try_to_free_page(super, page, wait); 1686 - 1687 - return try_to_free_buffers(page); 1688 - } 1689 - 1690 1676 static int blkdev_writepages(struct address_space *mapping, 1691 1677 struct writeback_control *wbc) 1692 1678 { ··· 1687 1701 .write_begin = blkdev_write_begin, 1688 1702 .write_end = blkdev_write_end, 1689 1703 .writepages = blkdev_writepages, 1690 - .releasepage = blkdev_releasepage, 1691 1704 .direct_IO = blkdev_direct_IO, 1692 1705 .migratepage = buffer_migrate_page_norefs, 1693 1706 .is_dirty_writeback = buffer_check_dirty_writeback,
+16 -2
fs/ext4/ext4.h
··· 720 720 #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) 721 721 #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) 722 722 #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) 723 + #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) 723 724 724 725 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) 725 726 ··· 741 740 #define EXT4_STATE_FLAG_NEW 0x00000002 742 741 #define EXT4_STATE_FLAG_NEWENTRY 0x00000004 743 742 #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 743 + 744 + /* flags for ioctl EXT4_IOC_CHECKPOINT */ 745 + #define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 746 + #define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 747 + #define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 748 + #define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ 749 + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ 750 + EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) 744 751 745 752 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 746 753 /* ··· 1486 1477 unsigned int s_inode_goal; 1487 1478 u32 s_hash_seed[4]; 1488 1479 int s_def_hash_version; 1489 - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ 1480 + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ 1490 1481 struct percpu_counter s_freeclusters_counter; 1491 1482 struct percpu_counter s_freeinodes_counter; 1492 1483 struct percpu_counter s_dirs_counter; ··· 1497 1488 struct kobject s_kobj; 1498 1489 struct completion s_kobj_unregister; 1499 1490 struct super_block *s_sb; 1491 + struct buffer_head *s_mmp_bh; 1500 1492 1501 1493 /* Journaling */ 1502 1494 struct journal_s *s_journal; ··· 3624 3614 extern const struct inode_operations ext4_fast_symlink_inode_operations; 3625 3615 3626 3616 /* sysfs.c */ 3617 + extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); 3627 3618 extern int ext4_register_sysfs(struct super_block *sb); 3628 3619 extern void ext4_unregister_sysfs(struct super_block *sb); 3629 3620 extern int __init ext4_init_sysfs(void); ··· 3731 3720 /* mmp.c */ 3732 3721 extern int 
ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 3733 3722 3723 + /* mmp.c */ 3724 + extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); 3725 + 3734 3726 /* verity.c */ 3735 3727 extern const struct fsverity_operations ext4_verityops; 3736 3728 ··· 3798 3784 * have to read the block because we may read the old data 3799 3785 * successfully. 3800 3786 */ 3801 - if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) 3787 + if (buffer_write_io_error(bh)) 3802 3788 set_buffer_uptodate(bh); 3803 3789 return buffer_uptodate(bh); 3804 3790 }
+4
fs/ext4/extents.c
··· 825 825 eh->eh_entries = 0; 826 826 eh->eh_magic = EXT4_EXT_MAGIC; 827 827 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 828 + eh->eh_generation = 0; 828 829 ext4_mark_inode_dirty(handle, inode); 829 830 } 830 831 ··· 1091 1090 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1092 1091 neh->eh_magic = EXT4_EXT_MAGIC; 1093 1092 neh->eh_depth = 0; 1093 + neh->eh_generation = 0; 1094 1094 1095 1095 /* move remainder of path[depth] to the new leaf */ 1096 1096 if (unlikely(path[depth].p_hdr->eh_entries != ··· 1169 1167 neh->eh_magic = EXT4_EXT_MAGIC; 1170 1168 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1171 1169 neh->eh_depth = cpu_to_le16(depth - i); 1170 + neh->eh_generation = 0; 1172 1171 fidx = EXT_FIRST_INDEX(neh); 1173 1172 fidx->ei_block = border; 1174 1173 ext4_idx_store_pblock(fidx, oldblock); ··· 1309 1306 neh->eh_magic = EXT4_EXT_MAGIC; 1310 1307 ext4_extent_block_csum_set(inode, neh); 1311 1308 set_buffer_uptodate(bh); 1309 + set_buffer_verified(bh); 1312 1310 unlock_buffer(bh); 1313 1311 1314 1312 err = ext4_handle_dirty_metadata(handle, inode, bh);
+1 -3
fs/ext4/extents_status.c
··· 1574 1574 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); 1575 1575 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1576 1576 1577 - if (!nr_to_scan) 1578 - return ret; 1579 - 1580 1577 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); 1581 1578 1579 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); 1582 1580 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1583 1581 return nr_shrunk; 1584 1582 }
+2 -2
fs/ext4/fsmap.h
··· 50 50 #define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ 51 51 #define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ 52 52 #define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ 53 - #define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ 54 - #define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ 53 + #define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */ 54 + #define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */ 55 55 56 56 #endif /* __EXT4_FSMAP_H__ */
+5 -6
fs/ext4/ialloc.c
··· 402 402 * 403 403 * We always try to spread first-level directories. 404 404 * 405 - * If there are blockgroups with both free inodes and free blocks counts 405 + * If there are blockgroups with both free inodes and free clusters counts 406 406 * not worse than average we return one with smallest directory count. 407 407 * Otherwise we simply return a random group. 408 408 * ··· 411 411 * It's OK to put directory into a group unless 412 412 * it has too many directories already (max_dirs) or 413 413 * it has too few free inodes left (min_inodes) or 414 - * it has too few free blocks left (min_blocks) or 414 + * it has too few free clusters left (min_clusters) or 415 415 * Parent's group is preferred, if it doesn't satisfy these 416 416 * conditions we search cyclically through the rest. If none 417 417 * of the groups look good we just look for a group with more ··· 427 427 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 428 428 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 429 429 unsigned int freei, avefreei, grp_free; 430 - ext4_fsblk_t freeb, avefreec; 430 + ext4_fsblk_t freec, avefreec; 431 431 unsigned int ndirs; 432 432 int max_dirs, min_inodes; 433 433 ext4_grpblk_t min_clusters; ··· 446 446 447 447 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 448 448 avefreei = freei / ngroups; 449 - freeb = EXT4_C2B(sbi, 450 - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 451 - avefreec = freeb; 449 + freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); 450 + avefreec = freec; 452 451 do_div(avefreec, ngroups); 453 452 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 454 453
+5 -6
fs/ext4/inline.c
··· 204 204 /* 205 205 * write the buffer to the inline inode. 206 206 * If 'create' is set, we don't need to do the extra copy in the xattr 207 - * value since it is already handled by ext4_xattr_ibody_inline_set. 207 + * value since it is already handled by ext4_xattr_ibody_set. 208 208 * That saves us one memcpy. 209 209 */ 210 210 static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, ··· 286 286 287 287 BUG_ON(!is.s.not_found); 288 288 289 - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 289 + error = ext4_xattr_ibody_set(handle, inode, &i, &is); 290 290 if (error) { 291 291 if (error == -ENOSPC) 292 292 ext4_clear_inode_state(inode, ··· 358 358 i.value = value; 359 359 i.value_len = len; 360 360 361 - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 361 + error = ext4_xattr_ibody_set(handle, inode, &i, &is); 362 362 if (error) 363 363 goto out; 364 364 ··· 431 431 if (error) 432 432 goto out; 433 433 434 - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 434 + error = ext4_xattr_ibody_set(handle, inode, &i, &is); 435 435 if (error) 436 436 goto out; 437 437 ··· 1925 1925 i.value = value; 1926 1926 i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? 1927 1927 i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; 1928 - err = ext4_xattr_ibody_inline_set(handle, inode, 1929 - &i, &is); 1928 + err = ext4_xattr_ibody_set(handle, inode, &i, &is); 1930 1929 if (err) 1931 1930 goto out_error; 1932 1931 }
+4 -4
fs/ext4/inode.c
··· 374 374 ei->i_reserved_data_blocks -= used; 375 375 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); 376 376 377 - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 377 + spin_unlock(&ei->i_block_reservation_lock); 378 378 379 379 /* Update quota subsystem for data blocks */ 380 380 if (quota_claim) ··· 3223 3223 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3224 3224 journal = EXT4_JOURNAL(inode); 3225 3225 jbd2_journal_lock_updates(journal); 3226 - err = jbd2_journal_flush(journal); 3226 + err = jbd2_journal_flush(journal, 0); 3227 3227 jbd2_journal_unlock_updates(journal); 3228 3228 3229 3229 if (err) ··· 3418 3418 * i_disksize out to i_size. This could be beyond where direct I/O is 3419 3419 * happening and thus expose allocated blocks to direct I/O reads. 3420 3420 */ 3421 - else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) 3421 + else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) 3422 3422 m_flags = EXT4_GET_BLOCKS_CREATE; 3423 3423 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3424 3424 m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ··· 6005 6005 if (val) 6006 6006 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6007 6007 else { 6008 - err = jbd2_journal_flush(journal); 6008 + err = jbd2_journal_flush(journal, 0); 6009 6009 if (err < 0) { 6010 6010 jbd2_journal_unlock_updates(journal); 6011 6011 percpu_up_write(&sbi->s_writepages_rwsem);
+60 -20
fs/ext4/ioctl.c
··· 659 659 info.gi_sb = sb; 660 660 info.gi_data = arg; 661 661 error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); 662 - if (error == EXT4_QUERY_RANGE_ABORT) { 663 - error = 0; 662 + if (error == EXT4_QUERY_RANGE_ABORT) 664 663 aborted = true; 665 - } else if (error) 664 + else if (error) 666 665 return error; 667 666 668 667 /* If we didn't abort, set the "last" flag in the last fmx */ ··· 692 693 if (err) 693 694 return err; 694 695 695 - if (ext4_has_feature_bigalloc(sb)) { 696 - ext4_msg(sb, KERN_ERR, 697 - "Online resizing not supported with bigalloc"); 698 - err = -EOPNOTSUPP; 699 - goto group_add_out; 700 - } 701 - 702 696 err = mnt_want_write_file(file); 703 697 if (err) 704 698 goto group_add_out; ··· 699 707 err = ext4_group_add(sb, input); 700 708 if (EXT4_SB(sb)->s_journal) { 701 709 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 702 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 710 + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 703 711 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 704 712 } 705 713 if (err == 0) ··· 792 800 return error; 793 801 } 794 802 803 + static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) 804 + { 805 + int err = 0; 806 + __u32 flags = 0; 807 + unsigned int flush_flags = 0; 808 + struct super_block *sb = file_inode(filp)->i_sb; 809 + struct request_queue *q; 810 + 811 + if (copy_from_user(&flags, (__u32 __user *)arg, 812 + sizeof(__u32))) 813 + return -EFAULT; 814 + 815 + if (!capable(CAP_SYS_ADMIN)) 816 + return -EPERM; 817 + 818 + /* check for invalid bits set */ 819 + if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || 820 + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && 821 + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) 822 + return -EINVAL; 823 + 824 + if (!EXT4_SB(sb)->s_journal) 825 + return -ENODEV; 826 + 827 + if (flags & ~JBD2_JOURNAL_FLUSH_VALID) 828 + return -EINVAL; 829 + 830 + q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); 831 + if (!q) 832 + return -ENXIO; 833 + if 
((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) 834 + return -EOPNOTSUPP; 835 + 836 + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) 837 + return 0; 838 + 839 + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) 840 + flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; 841 + 842 + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { 843 + flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; 844 + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); 845 + } 846 + 847 + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 848 + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); 849 + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 850 + 851 + return err; 852 + } 853 + 795 854 static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 796 855 { 797 856 struct inode *inode = file_inode(filp); ··· 914 871 goto group_extend_out; 915 872 } 916 873 917 - if (ext4_has_feature_bigalloc(sb)) { 918 - ext4_msg(sb, KERN_ERR, 919 - "Online resizing not supported with bigalloc"); 920 - err = -EOPNOTSUPP; 921 - goto group_extend_out; 922 - } 923 - 924 874 err = mnt_want_write_file(filp); 925 875 if (err) 926 876 goto group_extend_out; ··· 921 885 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 922 886 if (EXT4_SB(sb)->s_journal) { 923 887 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 924 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 888 + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 925 889 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 926 890 } 927 891 if (err == 0) ··· 1064 1028 if (EXT4_SB(sb)->s_journal) { 1065 1029 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); 1066 1030 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 1067 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 1031 + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 1068 1032 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 1069 1033 } 1070 1034 if (err == 0) ··· 1247 1211 return 
fsverity_ioctl_read_metadata(filp, 1248 1212 (const void __user *)arg); 1249 1213 1214 + case EXT4_IOC_CHECKPOINT: 1215 + return ext4_ioctl_checkpoint(filp, arg); 1216 + 1250 1217 default: 1251 1218 return -ENOTTY; 1252 1219 } ··· 1330 1291 case EXT4_IOC_CLEAR_ES_CACHE: 1331 1292 case EXT4_IOC_GETSTATE: 1332 1293 case EXT4_IOC_GET_ES_CACHE: 1294 + case EXT4_IOC_CHECKPOINT: 1333 1295 break; 1334 1296 default: 1335 1297 return -ENOIOCTLCMD;
+13 -15
fs/ext4/mmp.c
··· 127 127 */ 128 128 static int kmmpd(void *data) 129 129 { 130 - struct super_block *sb = ((struct mmpd_data *) data)->sb; 131 - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; 130 + struct super_block *sb = (struct super_block *) data; 132 131 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 132 + struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; 133 133 struct mmp_struct *mmp; 134 134 ext4_fsblk_t mmp_block; 135 135 u32 seq = 0; ··· 245 245 retval = write_mmp_block(sb, bh); 246 246 247 247 exit_thread: 248 - EXT4_SB(sb)->s_mmp_tsk = NULL; 249 - kfree(data); 250 - brelse(bh); 251 248 return retval; 249 + } 250 + 251 + void ext4_stop_mmpd(struct ext4_sb_info *sbi) 252 + { 253 + if (sbi->s_mmp_tsk) { 254 + kthread_stop(sbi->s_mmp_tsk); 255 + brelse(sbi->s_mmp_bh); 256 + sbi->s_mmp_tsk = NULL; 257 + } 252 258 } 253 259 254 260 /* ··· 281 275 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 282 276 struct buffer_head *bh = NULL; 283 277 struct mmp_struct *mmp = NULL; 284 - struct mmpd_data *mmpd_data; 285 278 u32 seq; 286 279 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); 287 280 unsigned int wait_time = 0; ··· 369 364 goto failed; 370 365 } 371 366 372 - mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); 373 - if (!mmpd_data) { 374 - ext4_warning(sb, "not enough memory for mmpd_data"); 375 - goto failed; 376 - } 377 - mmpd_data->sb = sb; 378 - mmpd_data->bh = bh; 367 + EXT4_SB(sb)->s_mmp_bh = bh; 379 368 380 369 /* 381 370 * Start a kernel thread to update the MMP block periodically. 
382 371 */ 383 - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s", 372 + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", 384 373 (int)sizeof(mmp->mmp_bdevname), 385 374 bdevname(bh->b_bdev, 386 375 mmp->mmp_bdevname)); 387 376 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { 388 377 EXT4_SB(sb)->s_mmp_tsk = NULL; 389 - kfree(mmpd_data); 390 378 ext4_warning(sb, "Unable to create kmmpd thread for %s.", 391 379 sb->s_id); 392 380 goto failed;
+1 -1
fs/ext4/namei.c
··· 2499 2499 2500 2500 /* Which index block gets the new entry? */ 2501 2501 if (at - entries >= icount1) { 2502 - frame->at = at = at - entries - icount1 + entries2; 2502 + frame->at = at - entries - icount1 + entries2; 2503 2503 frame->entries = entries = entries2; 2504 2504 swap(frame->bh, bh2); 2505 2505 }
+9
fs/ext4/resize.c
··· 74 74 return -EPERM; 75 75 } 76 76 77 + if (ext4_has_feature_bigalloc(sb)) { 78 + ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); 79 + return -EOPNOTSUPP; 80 + } 81 + if (ext4_has_feature_sparse_super2(sb)) { 82 + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); 83 + return -EOPNOTSUPP; 84 + } 85 + 77 86 if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, 78 87 &EXT4_SB(sb)->s_ext4_flags)) 79 88 ret = -EBUSY;
+26 -31
fs/ext4/super.c
··· 718 718 goto write_directly; 719 719 } 720 720 jbd2_journal_stop(handle); 721 + ext4_notify_error_sysfs(sbi); 721 722 return; 722 723 } 723 724 write_directly: ··· 727 726 * out and hope for the best. 728 727 */ 729 728 ext4_commit_super(sbi->s_sb); 729 + ext4_notify_error_sysfs(sbi); 730 730 } 731 731 732 732 #define ext4_error_ratelimit(sb) \ ··· 1176 1174 ext4_unregister_sysfs(sb); 1177 1175 1178 1176 if (sbi->s_journal) { 1177 + jbd2_journal_unregister_shrinker(sbi->s_journal); 1179 1178 aborted = is_journal_aborted(sbi->s_journal); 1180 1179 err = jbd2_journal_destroy(sbi->s_journal); 1181 1180 sbi->s_journal = NULL; ··· 1248 1245 ext4_xattr_destroy_cache(sbi->s_ea_block_cache); 1249 1246 sbi->s_ea_block_cache = NULL; 1250 1247 1251 - if (sbi->s_mmp_tsk) 1252 - kthread_stop(sbi->s_mmp_tsk); 1248 + ext4_stop_mmpd(sbi); 1249 + 1253 1250 brelse(sbi->s_sbh); 1254 1251 sb->s_fs_info = NULL; 1255 1252 /* ··· 1442 1439 1443 1440 trace_ext4_nfs_commit_metadata(inode); 1444 1441 return ext4_write_inode(inode, &wbc); 1445 - } 1446 - 1447 - /* 1448 - * Try to release metadata pages (indirect blocks, directories) which are 1449 - * mapped via the block device. Since these pages could have journal heads 1450 - * which would prevent try_to_free_buffers() from freeing them, we must use 1451 - * jbd2 layer's try_to_free_buffers() function to release them. 
1452 - */ 1453 - static int bdev_try_to_free_page(struct super_block *sb, struct page *page, 1454 - gfp_t wait) 1455 - { 1456 - journal_t *journal = EXT4_SB(sb)->s_journal; 1457 - 1458 - WARN_ON(PageChecked(page)); 1459 - if (!page_has_buffers(page)) 1460 - return 0; 1461 - if (journal) 1462 - return jbd2_journal_try_to_free_buffers(journal, page); 1463 - 1464 - return try_to_free_buffers(page); 1465 1442 } 1466 1443 1467 1444 #ifdef CONFIG_FS_ENCRYPTION ··· 1638 1655 .quota_write = ext4_quota_write, 1639 1656 .get_dquots = ext4_get_dquots, 1640 1657 #endif 1641 - .bdev_try_to_free_page = bdev_try_to_free_page, 1642 1658 }; 1643 1659 1644 1660 static const struct export_operations ext4_export_ops = { ··· 3083 3101 inode_lock(inode); 3084 3102 truncate_inode_pages(inode->i_mapping, inode->i_size); 3085 3103 ret = ext4_truncate(inode); 3086 - if (ret) 3104 + if (ret) { 3105 + /* 3106 + * We need to clean up the in-core orphan list 3107 + * manually if ext4_truncate() failed to get a 3108 + * transaction handle. 
3109 + */ 3110 + ext4_orphan_del(NULL, inode); 3087 3111 ext4_std_error(inode->i_sb, ret); 3112 + } 3088 3113 inode_unlock(inode); 3089 3114 nr_truncates++; 3090 3115 } else { ··· 5047 5058 ext4_msg(sb, KERN_ERR, 5048 5059 "unable to initialize " 5049 5060 "flex_bg meta info!"); 5061 + ret = -ENOMEM; 5050 5062 goto failed_mount6; 5051 5063 } 5052 5064 ··· 5168 5178 sbi->s_ea_block_cache = NULL; 5169 5179 5170 5180 if (sbi->s_journal) { 5181 + jbd2_journal_unregister_shrinker(sbi->s_journal); 5171 5182 jbd2_journal_destroy(sbi->s_journal); 5172 5183 sbi->s_journal = NULL; 5173 5184 } ··· 5177 5186 failed_mount3: 5178 5187 flush_work(&sbi->s_error_work); 5179 5188 del_timer_sync(&sbi->s_err_report); 5180 - if (sbi->s_mmp_tsk) 5181 - kthread_stop(sbi->s_mmp_tsk); 5189 + ext4_stop_mmpd(sbi); 5182 5190 failed_mount2: 5183 5191 rcu_read_lock(); 5184 5192 group_desc = rcu_dereference(sbi->s_group_desc); ··· 5494 5504 ext4_commit_super(sb); 5495 5505 } 5496 5506 5507 + err = jbd2_journal_register_shrinker(journal); 5508 + if (err) { 5509 + EXT4_SB(sb)->s_journal = NULL; 5510 + goto err_out; 5511 + } 5512 + 5497 5513 return 0; 5498 5514 5499 5515 err_out: ··· 5642 5646 return 0; 5643 5647 } 5644 5648 jbd2_journal_lock_updates(journal); 5645 - err = jbd2_journal_flush(journal); 5649 + err = jbd2_journal_flush(journal, 0); 5646 5650 if (err < 0) 5647 5651 goto out; 5648 5652 ··· 5784 5788 * Don't clear the needs_recovery flag if we failed to 5785 5789 * flush the journal. 5786 5790 */ 5787 - error = jbd2_journal_flush(journal); 5791 + error = jbd2_journal_flush(journal, 0); 5788 5792 if (error < 0) 5789 5793 goto out; 5790 5794 ··· 5985 5989 */ 5986 5990 ext4_mark_recovery_complete(sb, es); 5987 5991 } 5988 - if (sbi->s_mmp_tsk) 5989 - kthread_stop(sbi->s_mmp_tsk); 5992 + ext4_stop_mmpd(sbi); 5990 5993 } else { 5991 5994 /* Make sure we can mount this feature set readwrite */ 5992 5995 if (ext4_has_feature_readonly(sb) || ··· 6378 6383 * otherwise be livelocked... 
6379 6384 */ 6380 6385 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 6381 - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 6386 + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 6382 6387 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 6383 6388 if (err) 6384 6389 return err;
+5
fs/ext4/sysfs.c
··· 506 506 .release = (void (*)(struct kobject *))kfree, 507 507 }; 508 508 509 + void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) 510 + { 511 + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); 512 + } 513 + 509 514 static struct kobject *ext4_root; 510 515 511 516 static struct kobject *ext4_feat;
+1 -25
fs/ext4/xattr.c
··· 2190 2190 return 0; 2191 2191 } 2192 2192 2193 - int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, 2194 - struct ext4_xattr_info *i, 2195 - struct ext4_xattr_ibody_find *is) 2196 - { 2197 - struct ext4_xattr_ibody_header *header; 2198 - struct ext4_xattr_search *s = &is->s; 2199 - int error; 2200 - 2201 - if (EXT4_I(inode)->i_extra_isize == 0) 2202 - return -ENOSPC; 2203 - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); 2204 - if (error) 2205 - return error; 2206 - header = IHDR(inode, ext4_raw_inode(&is->iloc)); 2207 - if (!IS_LAST_ENTRY(s->first)) { 2208 - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 2209 - ext4_set_inode_state(inode, EXT4_STATE_XATTR); 2210 - } else { 2211 - header->h_magic = cpu_to_le32(0); 2212 - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); 2213 - } 2214 - return 0; 2215 - } 2216 - 2217 - static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 2193 + int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 2218 2194 struct ext4_xattr_info *i, 2219 2195 struct ext4_xattr_ibody_find *is) 2220 2196 {
+3 -3
fs/ext4/xattr.h
··· 186 186 extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, 187 187 const char *name, 188 188 void *buffer, size_t buffer_size); 189 - extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, 190 - struct ext4_xattr_info *i, 191 - struct ext4_xattr_ibody_find *is); 189 + extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 190 + struct ext4_xattr_info *i, 191 + struct ext4_xattr_ibody_find *is); 192 192 193 193 extern struct mb_cache *ext4_xattr_create_cache(void); 194 194 extern void ext4_xattr_destroy_cache(struct mb_cache *);
+167 -39
fs/jbd2/checkpoint.c
··· 80 80 } 81 81 82 82 /* 83 - * Try to release a checkpointed buffer from its transaction. 84 - * Returns 1 if we released it and 2 if we also released the 85 - * whole transaction. 83 + * Check a checkpoint buffer could be release or not. 86 84 * 87 85 * Requires j_list_lock 88 86 */ 89 - static int __try_to_free_cp_buf(struct journal_head *jh) 87 + static inline bool __cp_buffer_busy(struct journal_head *jh) 90 88 { 91 - int ret = 0; 92 89 struct buffer_head *bh = jh2bh(jh); 93 90 94 - if (jh->b_transaction == NULL && !buffer_locked(bh) && 95 - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 96 - JBUFFER_TRACE(jh, "remove from checkpoint list"); 97 - ret = __jbd2_journal_remove_checkpoint(jh) + 1; 98 - } 99 - return ret; 91 + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); 100 92 } 101 93 102 94 /* ··· 220 228 * OK, we need to start writing disk blocks. Take one transaction 221 229 * and write it. 222 230 */ 223 - result = 0; 224 231 spin_lock(&journal->j_list_lock); 225 232 if (!journal->j_checkpoint_transactions) 226 233 goto out; ··· 286 295 goto restart; 287 296 } 288 297 if (!buffer_dirty(bh)) { 289 - if (unlikely(buffer_write_io_error(bh)) && !result) 290 - result = -EIO; 291 298 BUFFER_TRACE(bh, "remove from checkpoint"); 292 299 if (__jbd2_journal_remove_checkpoint(jh)) 293 300 /* The transaction was released; we're done */ ··· 345 356 spin_lock(&journal->j_list_lock); 346 357 goto restart2; 347 358 } 348 - if (unlikely(buffer_write_io_error(bh)) && !result) 349 - result = -EIO; 350 359 351 360 /* 352 361 * Now in whatever state the buffer currently is, we ··· 356 369 } 357 370 out: 358 371 spin_unlock(&journal->j_list_lock); 359 - if (result < 0) 360 - jbd2_journal_abort(journal, result); 361 - else 362 - result = jbd2_cleanup_journal_tail(journal); 372 + result = jbd2_cleanup_journal_tail(journal); 363 373 364 374 return (result < 0) ? 
result : 0; 365 375 } ··· 421 437 { 422 438 struct journal_head *last_jh; 423 439 struct journal_head *next_jh = jh; 424 - int ret; 425 440 426 441 if (!jh) 427 442 return 0; ··· 429 446 do { 430 447 jh = next_jh; 431 448 next_jh = jh->b_cpnext; 432 - if (!destroy) 433 - ret = __try_to_free_cp_buf(jh); 434 - else 435 - ret = __jbd2_journal_remove_checkpoint(jh) + 1; 436 - if (!ret) 449 + 450 + if (!destroy && __cp_buffer_busy(jh)) 437 451 return 0; 438 - if (ret == 2) 452 + 453 + if (__jbd2_journal_remove_checkpoint(jh)) 439 454 return 1; 440 455 /* 441 456 * This function only frees up some memory ··· 446 465 } while (jh != last_jh); 447 466 448 467 return 0; 468 + } 469 + 470 + /* 471 + * journal_shrink_one_cp_list 472 + * 473 + * Find 'nr_to_scan' written-back checkpoint buffers in the given list 474 + * and try to release them. If the whole transaction is released, set 475 + * the 'released' parameter. Return the number of released checkpointed 476 + * buffers. 477 + * 478 + * Called with j_list_lock held. 479 + */ 480 + static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, 481 + unsigned long *nr_to_scan, 482 + bool *released) 483 + { 484 + struct journal_head *last_jh; 485 + struct journal_head *next_jh = jh; 486 + unsigned long nr_freed = 0; 487 + int ret; 488 + 489 + if (!jh || *nr_to_scan == 0) 490 + return 0; 491 + 492 + last_jh = jh->b_cpprev; 493 + do { 494 + jh = next_jh; 495 + next_jh = jh->b_cpnext; 496 + 497 + (*nr_to_scan)--; 498 + if (__cp_buffer_busy(jh)) 499 + continue; 500 + 501 + nr_freed++; 502 + ret = __jbd2_journal_remove_checkpoint(jh); 503 + if (ret) { 504 + *released = true; 505 + break; 506 + } 507 + 508 + if (need_resched()) 509 + break; 510 + } while (jh != last_jh && *nr_to_scan); 511 + 512 + return nr_freed; 513 + } 514 + 515 + /* 516 + * jbd2_journal_shrink_checkpoint_list 517 + * 518 + * Find 'nr_to_scan' written-back checkpoint buffers in the journal 519 + * and try to release them. 
Return the number of released checkpointed 520 + * buffers. 521 + * 522 + * Called with j_list_lock held. 523 + */ 524 + unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, 525 + unsigned long *nr_to_scan) 526 + { 527 + transaction_t *transaction, *last_transaction, *next_transaction; 528 + bool released; 529 + tid_t first_tid = 0, last_tid = 0, next_tid = 0; 530 + tid_t tid = 0; 531 + unsigned long nr_freed = 0; 532 + unsigned long nr_scanned = *nr_to_scan; 533 + 534 + again: 535 + spin_lock(&journal->j_list_lock); 536 + if (!journal->j_checkpoint_transactions) { 537 + spin_unlock(&journal->j_list_lock); 538 + goto out; 539 + } 540 + 541 + /* 542 + * Get next shrink transaction, resume previous scan or start 543 + * over again. If some others do checkpoint and drop transaction 544 + * from the checkpoint list, we ignore saved j_shrink_transaction 545 + * and start over unconditionally. 546 + */ 547 + if (journal->j_shrink_transaction) 548 + transaction = journal->j_shrink_transaction; 549 + else 550 + transaction = journal->j_checkpoint_transactions; 551 + 552 + if (!first_tid) 553 + first_tid = transaction->t_tid; 554 + last_transaction = journal->j_checkpoint_transactions->t_cpprev; 555 + next_transaction = transaction; 556 + last_tid = last_transaction->t_tid; 557 + do { 558 + transaction = next_transaction; 559 + next_transaction = transaction->t_cpnext; 560 + tid = transaction->t_tid; 561 + released = false; 562 + 563 + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, 564 + nr_to_scan, &released); 565 + if (*nr_to_scan == 0) 566 + break; 567 + if (need_resched() || spin_needbreak(&journal->j_list_lock)) 568 + break; 569 + if (released) 570 + continue; 571 + 572 + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, 573 + nr_to_scan, &released); 574 + if (*nr_to_scan == 0) 575 + break; 576 + if (need_resched() || spin_needbreak(&journal->j_list_lock)) 577 + break; 578 + } while (transaction != 
last_transaction); 579 + 580 + if (transaction != last_transaction) { 581 + journal->j_shrink_transaction = next_transaction; 582 + next_tid = next_transaction->t_tid; 583 + } else { 584 + journal->j_shrink_transaction = NULL; 585 + next_tid = 0; 586 + } 587 + 588 + spin_unlock(&journal->j_list_lock); 589 + cond_resched(); 590 + 591 + if (*nr_to_scan && next_tid) 592 + goto again; 593 + out: 594 + nr_scanned -= *nr_to_scan; 595 + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, 596 + nr_freed, nr_scanned, next_tid); 597 + 598 + return nr_freed; 449 599 } 450 600 451 601 /* ··· 676 564 struct transaction_chp_stats_s *stats; 677 565 transaction_t *transaction; 678 566 journal_t *journal; 679 - int ret = 0; 567 + struct buffer_head *bh = jh2bh(jh); 680 568 681 569 JBUFFER_TRACE(jh, "entry"); 682 570 683 - if ((transaction = jh->b_cp_transaction) == NULL) { 571 + transaction = jh->b_cp_transaction; 572 + if (!transaction) { 684 573 JBUFFER_TRACE(jh, "not on transaction"); 685 - goto out; 574 + return 0; 686 575 } 687 576 journal = transaction->t_journal; 688 577 689 578 JBUFFER_TRACE(jh, "removing from transaction"); 579 + 580 + /* 581 + * If we have failed to write the buffer out to disk, the filesystem 582 + * may become inconsistent. We cannot abort the journal here since 583 + * we hold j_list_lock and we have to be careful about races with 584 + * jbd2_journal_destroy(). So mark the writeback IO error in the 585 + * journal here and we abort the journal later from a better context. 586 + */ 587 + if (buffer_write_io_error(bh)) 588 + set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); 589 + 690 590 __buffer_unlink(jh); 691 591 jh->b_cp_transaction = NULL; 592 + percpu_counter_dec(&journal->j_jh_shrink_count); 692 593 jbd2_journal_put_journal_head(jh); 693 594 694 - if (transaction->t_checkpoint_list != NULL || 695 - transaction->t_checkpoint_io_list != NULL) 696 - goto out; 595 + /* Is this transaction empty? 
*/ 596 + if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) 597 + return 0; 697 598 698 599 /* 699 600 * There is one special case to worry about: if we have just pulled the ··· 718 593 * See the comment at the end of jbd2_journal_commit_transaction(). 719 594 */ 720 595 if (transaction->t_state != T_FINISHED) 721 - goto out; 596 + return 0; 722 597 723 - /* OK, that was the last buffer for the transaction: we can now 724 - safely remove this transaction from the log */ 598 + /* 599 + * OK, that was the last buffer for the transaction, we can now 600 + * safely remove this transaction from the log. 601 + */ 725 602 stats = &transaction->t_chp_stats; 726 603 if (stats->cs_chp_time) 727 604 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, ··· 733 606 734 607 __jbd2_journal_drop_transaction(journal, transaction); 735 608 jbd2_journal_free_transaction(transaction); 736 - ret = 1; 737 - out: 738 - return ret; 609 + return 1; 739 610 } 740 611 741 612 /* ··· 764 639 jh->b_cpnext->b_cpprev = jh; 765 640 } 766 641 transaction->t_checkpoint_list = jh; 642 + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); 767 643 } 768 644 769 645 /* ··· 780 654 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 781 655 { 782 656 assert_spin_locked(&journal->j_list_lock); 657 + 658 + journal->j_shrink_transaction = NULL; 783 659 if (transaction->t_cpnext) { 784 660 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 785 661 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
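The checkpoint.c changes above replace the one-buffer `__try_to_free_cp_buf()` helper with a budgeted scan, `journal_shrink_one_cp_list()`, over the circular checkpoint list. The shape of that scan — remember the predecessor as the stopping point, grab the next pointer before the current entry can be freed, skip busy entries, and stop when the budget runs out — can be sketched in userspace C. Everything below (`struct node`, `shrink_one_list`, `make_list`) is an illustrative stand-in, not kernel API:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/* Toy stand-in for a journal_head on a circular checkpoint list. */
struct node {
    struct node *next, *prev;
    bool busy;                       /* models __cp_buffer_busy() */
};

/*
 * Sketch of the journal_shrink_one_cp_list() walk: consume one unit of
 * budget per node visited, skip busy nodes, unlink and free clean ones.
 * Returns the number freed; *head becomes NULL if the list empties.
 */
static unsigned long shrink_one_list(struct node **head,
                                     unsigned long *nr_to_scan)
{
    struct node *jh = *head;
    struct node *last;
    unsigned long nr_freed = 0;

    if (!jh || *nr_to_scan == 0)
        return 0;

    last = jh->prev;                 /* stopping point, like last_jh */
    do {
        struct node *cur = jh;
        int was_last = (cur == last);

        jh = cur->next;              /* fetch before cur may be freed */
        (*nr_to_scan)--;

        if (!cur->busy) {
            /* unlink cur, as __jbd2_journal_remove_checkpoint() would */
            if (cur->next == cur) {
                *head = NULL;        /* list is now empty */
            } else {
                cur->prev->next = cur->next;
                cur->next->prev = cur->prev;
                if (*head == cur)
                    *head = cur->next;
            }
            free(cur);
            nr_freed++;
            if (!*head)
                break;               /* whole list released */
        }
        if (was_last)
            break;
    } while (*nr_to_scan);

    return nr_freed;
}

/*
 * Build a circular list of n nodes; node i is marked busy when bit i
 * of busy_mask is set.  Test scaffolding only.
 */
static struct node *make_list(int n, unsigned busy_mask)
{
    struct node *head = NULL;

    for (int i = n - 1; i >= 0; i--) {
        struct node *jh = malloc(sizeof(*jh));

        jh->busy = (busy_mask >> i) & 1;
        if (!head) {
            jh->next = jh->prev = jh;
        } else {
            jh->next = head;
            jh->prev = head->prev;
            head->prev->next = jh;
            head->prev = jh;
        }
        head = jh;
    }
    return head;
}
```

Capturing the next pointer before touching the current entry is what makes freeing mid-scan safe; the kernel version additionally breaks out early on `need_resched()` so the scan never monopolizes a CPU under `j_list_lock`.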
+219 -11
fs/jbd2/journal.c
··· 934 934 } 935 935 EXPORT_SYMBOL(jbd2_fc_wait_bufs); 936 936 937 - /* 938 - * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf 939 - * for completion. 940 - */ 941 937 int jbd2_fc_release_bufs(journal_t *journal) 942 938 { 943 939 struct buffer_head *bh; ··· 941 945 942 946 j_fc_off = journal->j_fc_off; 943 947 944 - /* 945 - * Wait in reverse order to minimize chances of us being woken up before 946 - * all IOs have completed 947 - */ 948 948 for (i = j_fc_off - 1; i >= 0; i--) { 949 949 bh = journal->j_fc_wbuf[i]; 950 950 if (!bh) ··· 1610 1618 1611 1619 if (is_journal_aborted(journal)) 1612 1620 return -EIO; 1621 + if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { 1622 + jbd2_journal_abort(journal, -EIO); 1623 + return -EIO; 1624 + } 1613 1625 1614 1626 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1615 1627 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", ··· 1682 1686 write_unlock(&journal->j_state_lock); 1683 1687 } 1684 1688 1689 + /** 1690 + * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) 1691 + * @journal: The journal to erase. 1692 + * @flags: A discard/zeroout request is sent for each physically contigous 1693 + * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or 1694 + * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation 1695 + * to perform. 1696 + * 1697 + * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes 1698 + * will be explicitly written if no hardware offload is available, see 1699 + * blkdev_issue_zeroout for more details. 
1700 + */ 1701 + static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) 1702 + { 1703 + int err = 0; 1704 + unsigned long block, log_offset; /* logical */ 1705 + unsigned long long phys_block, block_start, block_stop; /* physical */ 1706 + loff_t byte_start, byte_stop, byte_count; 1707 + struct request_queue *q = bdev_get_queue(journal->j_dev); 1708 + 1709 + /* flags must be set to either discard or zeroout */ 1710 + if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || 1711 + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && 1712 + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) 1713 + return -EINVAL; 1714 + 1715 + if (!q) 1716 + return -ENXIO; 1717 + 1718 + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) 1719 + return -EOPNOTSUPP; 1720 + 1721 + /* 1722 + * lookup block mapping and issue discard/zeroout for each 1723 + * contiguous region 1724 + */ 1725 + log_offset = be32_to_cpu(journal->j_superblock->s_first); 1726 + block_start = ~0ULL; 1727 + for (block = log_offset; block < journal->j_total_len; block++) { 1728 + err = jbd2_journal_bmap(journal, block, &phys_block); 1729 + if (err) { 1730 + pr_err("JBD2: bad block at offset %lu", block); 1731 + return err; 1732 + } 1733 + 1734 + if (block_start == ~0ULL) { 1735 + block_start = phys_block; 1736 + block_stop = block_start - 1; 1737 + } 1738 + 1739 + /* 1740 + * last block not contiguous with current block, 1741 + * process last contiguous region and return to this block on 1742 + * next loop 1743 + */ 1744 + if (phys_block != block_stop + 1) { 1745 + block--; 1746 + } else { 1747 + block_stop++; 1748 + /* 1749 + * if this isn't the last block of journal, 1750 + * no need to process now because next block may also 1751 + * be part of this contiguous region 1752 + */ 1753 + if (block != journal->j_total_len - 1) 1754 + continue; 1755 + } 1756 + 1757 + /* 1758 + * end of contiguous region or this is last block of journal, 1759 + * take care of the region 1760 + */ 1761 + byte_start = block_start * 
journal->j_blocksize; 1762 + byte_stop = block_stop * journal->j_blocksize; 1763 + byte_count = (block_stop - block_start + 1) * 1764 + journal->j_blocksize; 1765 + 1766 + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, 1767 + byte_start, byte_stop); 1768 + 1769 + if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { 1770 + err = blkdev_issue_discard(journal->j_dev, 1771 + byte_start >> SECTOR_SHIFT, 1772 + byte_count >> SECTOR_SHIFT, 1773 + GFP_NOFS, 0); 1774 + } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { 1775 + err = blkdev_issue_zeroout(journal->j_dev, 1776 + byte_start >> SECTOR_SHIFT, 1777 + byte_count >> SECTOR_SHIFT, 1778 + GFP_NOFS, 0); 1779 + } 1780 + 1781 + if (unlikely(err != 0)) { 1782 + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", 1783 + err, block_start, block_stop); 1784 + return err; 1785 + } 1786 + 1787 + /* reset start and stop after processing a region */ 1788 + block_start = ~0ULL; 1789 + } 1790 + 1791 + return blkdev_issue_flush(journal->j_dev); 1792 + } 1685 1793 1686 1794 /** 1687 1795 * jbd2_journal_update_sb_errno() - Update error in the journal. ··· 2051 1951 } 2052 1952 2053 1953 /** 1954 + * jbd2_journal_shrink_scan() 1955 + * 1956 + * Scan the checkpointed buffer on the checkpoint list and release the 1957 + * journal_head. 
1958 + */ 1959 + static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, 1960 + struct shrink_control *sc) 1961 + { 1962 + journal_t *journal = container_of(shrink, journal_t, j_shrinker); 1963 + unsigned long nr_to_scan = sc->nr_to_scan; 1964 + unsigned long nr_shrunk; 1965 + unsigned long count; 1966 + 1967 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 1968 + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); 1969 + 1970 + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); 1971 + 1972 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 1973 + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); 1974 + 1975 + return nr_shrunk; 1976 + } 1977 + 1978 + /** 1979 + * jbd2_journal_shrink_count() 1980 + * 1981 + * Count the number of checkpoint buffers on the checkpoint list. 1982 + */ 1983 + static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, 1984 + struct shrink_control *sc) 1985 + { 1986 + journal_t *journal = container_of(shrink, journal_t, j_shrinker); 1987 + unsigned long count; 1988 + 1989 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 1990 + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); 1991 + 1992 + return count; 1993 + } 1994 + 1995 + /** 1996 + * jbd2_journal_register_shrinker() 1997 + * @journal: Journal to act on. 1998 + * 1999 + * Init a percpu counter to record the checkpointed buffers on the checkpoint 2000 + * list and register a shrinker to release their journal_head. 
2001 + */ 2002 + int jbd2_journal_register_shrinker(journal_t *journal) 2003 + { 2004 + int err; 2005 + 2006 + journal->j_shrink_transaction = NULL; 2007 + 2008 + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); 2009 + if (err) 2010 + return err; 2011 + 2012 + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; 2013 + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; 2014 + journal->j_shrinker.seeks = DEFAULT_SEEKS; 2015 + journal->j_shrinker.batch = journal->j_max_transaction_buffers; 2016 + 2017 + err = register_shrinker(&journal->j_shrinker); 2018 + if (err) { 2019 + percpu_counter_destroy(&journal->j_jh_shrink_count); 2020 + return err; 2021 + } 2022 + 2023 + return 0; 2024 + } 2025 + EXPORT_SYMBOL(jbd2_journal_register_shrinker); 2026 + 2027 + /** 2028 + * jbd2_journal_unregister_shrinker() 2029 + * @journal: Journal to act on. 2030 + * 2031 + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. 2032 + */ 2033 + void jbd2_journal_unregister_shrinker(journal_t *journal) 2034 + { 2035 + percpu_counter_destroy(&journal->j_jh_shrink_count); 2036 + unregister_shrinker(&journal->j_shrinker); 2037 + } 2038 + EXPORT_SYMBOL(jbd2_journal_unregister_shrinker); 2039 + 2040 + /** 2054 2041 * jbd2_journal_destroy() - Release a journal_t structure. 2055 2042 * @journal: Journal to act on. 2056 2043 * ··· 2182 1995 J_ASSERT(journal->j_checkpoint_transactions == NULL); 2183 1996 spin_unlock(&journal->j_list_lock); 2184 1997 1998 + /* 1999 + * OK, all checkpoint transactions have been checked, now check the 2000 + * write out io error flag and abort the journal if some buffer failed 2001 + * to write back to the original location, otherwise the filesystem 2002 + * may become inconsistent. 
2003 + */ 2004 + if (!is_journal_aborted(journal) && 2005 + test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) 2006 + jbd2_journal_abort(journal, -EIO); 2007 + 2185 2008 if (journal->j_sb_buffer) { 2186 2009 if (!is_journal_aborted(journal)) { 2187 2010 mutex_lock_io(&journal->j_checkpoint_mutex); ··· 2208 2011 err = -EIO; 2209 2012 brelse(journal->j_sb_buffer); 2210 2013 } 2014 + 2015 + jbd2_journal_unregister_shrinker(journal); 2211 2016 2212 2017 if (journal->j_proc_entry) 2213 2018 jbd2_stats_proc_exit(journal); ··· 2445 2246 /** 2446 2247 * jbd2_journal_flush() - Flush journal 2447 2248 * @journal: Journal to act on. 2249 + * @flags: optional operation on the journal blocks after the flush (see below) 2448 2250 * 2449 2251 * Flush all data for a given journal to disk and empty the journal. 2450 2252 * Filesystems can use this when remounting readonly to ensure that 2451 - * recovery does not need to happen on remount. 2253 + * recovery does not need to happen on remount. Optionally, a discard or zeroout 2254 + * can be issued on the journal blocks after flushing. 2255 + * 2256 + * flags: 2257 + * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks 2258 + * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks 2452 2259 */ 2453 - 2454 - int jbd2_journal_flush(journal_t *journal) 2260 + int jbd2_journal_flush(journal_t *journal, unsigned int flags) 2455 2261 { 2456 2262 int err = 0; 2457 2263 transaction_t *transaction = NULL; ··· 2510 2306 * commits of data to the journal will restore the current 2511 2307 * s_start value. */ 2512 2308 jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); 2309 + 2310 + if (flags) 2311 + err = __jbd2_journal_erase(journal, flags); 2312 + 2513 2313 mutex_unlock(&journal->j_checkpoint_mutex); 2514 2314 write_lock(&journal->j_state_lock); 2515 2315 J_ASSERT(!journal->j_running_transaction);
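Most of the new journal.c code above is plumbing, but `__jbd2_journal_erase()` contains one self-contained algorithm: walk the journal's logical blocks, map each to a physical block with `jbd2_journal_bmap()`, and issue a single discard or zeroout per physically contiguous run. Here is a minimal userspace sketch of that coalescing walk; `coalesce_regions()` and its array-based mapping are hypothetical stand-ins for the bmap lookup, not kernel functions:

```c
#include <assert.h>

/*
 * Scan logical blocks 0..nblocks-1, translating each to a physical
 * block via map[], and record one (start, count) pair per physically
 * contiguous run.  Returns the number of runs written to starts[] and
 * counts[] — each run is one discard/zeroout request in the kernel.
 */
static int coalesce_regions(const unsigned long long *map, int nblocks,
                            unsigned long long *starts, int *counts)
{
    int nruns = 0;
    unsigned long long run_start = 0, run_stop = 0;
    int in_run = 0;

    for (int i = 0; i < nblocks; i++) {
        unsigned long long phys = map[i];

        if (!in_run) {
            run_start = run_stop = phys;     /* open a new run */
            in_run = 1;
        } else if (phys == run_stop + 1) {
            run_stop = phys;                 /* still contiguous */
        } else {
            /* discontinuity: emit the finished run, restart here */
            starts[nruns] = run_start;
            counts[nruns] = (int)(run_stop - run_start + 1);
            nruns++;
            run_start = run_stop = phys;
        }
    }
    if (in_run) {                            /* flush the final run */
        starts[nruns] = run_start;
        counts[nruns] = (int)(run_stop - run_start + 1);
        nruns++;
    }
    return nruns;
}
```

The kernel version converts each run to byte and sector offsets before calling `blkdev_issue_discard()` or `blkdev_issue_zeroout()`, and folds the "last block of the journal" case into the same emit path rather than flushing after the loop.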
-17
fs/jbd2/transaction.c
··· 2123 2123 { 2124 2124 struct buffer_head *head; 2125 2125 struct buffer_head *bh; 2126 - bool has_write_io_error = false; 2127 2126 int ret = 0; 2128 2127 2129 2128 J_ASSERT(PageLocked(page)); ··· 2147 2148 jbd2_journal_put_journal_head(jh); 2148 2149 if (buffer_jbd(bh)) 2149 2150 goto busy; 2150 - 2151 - /* 2152 - * If we free a metadata buffer which has been failed to 2153 - * write out, the jbd2 checkpoint procedure will not detect 2154 - * this failure and may lead to filesystem inconsistency 2155 - * after cleanup journal tail. 2156 - */ 2157 - if (buffer_write_io_error(bh)) { 2158 - pr_err("JBD2: Error while async write back metadata bh %llu.", 2159 - (unsigned long long)bh->b_blocknr); 2160 - has_write_io_error = true; 2161 - } 2162 2151 } while ((bh = bh->b_this_page) != head); 2163 2152 2164 2153 ret = try_to_free_buffers(page); 2165 - 2166 2154 busy: 2167 - if (has_write_io_error) 2168 - jbd2_journal_abort(journal, -EIO); 2169 - 2170 2155 return ret; 2171 2156 } 2172 2157
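The deletion above moves IO-error handling out of `jbd2_journal_try_to_free_buffers()`: instead of aborting the journal while scanning page buffers, the error is latched as `JBD2_CHECKPOINT_IO_ERROR` in `j_atomic_flags` (see the checkpoint.c hunk) and acted on later from a context where aborting is safe. That detect-now/act-later pattern can be sketched with C11 atomics; `toy_journal`, `note_write_error`, and `finish_checkpoint` are hypothetical names standing in for the journal, the checkpoint removal path, and the superblock-update/destroy paths:

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define CHECKPOINT_IO_ERROR_BIT 0x1u    /* models JBD2_CHECKPOINT_IO_ERROR */

struct toy_journal {
    atomic_uint atomic_flags;           /* models j_atomic_flags */
    bool aborted;
};

/*
 * Hot path: detection only.  Mirrors __jbd2_journal_remove_checkpoint()
 * setting the flag bit instead of calling jbd2_journal_abort() while
 * holding j_list_lock.
 */
static void note_write_error(struct toy_journal *j, bool write_io_error)
{
    if (write_io_error)
        atomic_fetch_or(&j->atomic_flags, CHECKPOINT_IO_ERROR_BIT);
}

/*
 * Safe context: act on the recorded error, mirroring the checks added
 * in jbd2_journal_update_sb_log_tail() and jbd2_journal_destroy().
 */
static int finish_checkpoint(struct toy_journal *j)
{
    if (atomic_load(&j->atomic_flags) & CHECKPOINT_IO_ERROR_BIT) {
        j->aborted = true;              /* stands in for jbd2_journal_abort() */
        return -5;                      /* -EIO */
    }
    return 0;
}
```

The design point is that the hot path does the cheapest possible thing (one atomic OR) and never takes the heavyweight abort machinery while `j_list_lock` is held, avoiding the races with `jbd2_journal_destroy()` that the commit comment describes.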
+1 -1
fs/ocfs2/alloc.c
··· 6018 6018 * Then truncate log will be replayed resulting in cluster double free. 6019 6019 */ 6020 6020 jbd2_journal_lock_updates(journal->j_journal); 6021 - status = jbd2_journal_flush(journal->j_journal); 6021 + status = jbd2_journal_flush(journal->j_journal, 0); 6022 6022 jbd2_journal_unlock_updates(journal->j_journal); 6023 6023 if (status < 0) { 6024 6024 mlog_errno(status);
+4 -4
fs/ocfs2/journal.c
··· 308 308 } 309 309 310 310 jbd2_journal_lock_updates(journal->j_journal); 311 - status = jbd2_journal_flush(journal->j_journal); 311 + status = jbd2_journal_flush(journal->j_journal, 0); 312 312 jbd2_journal_unlock_updates(journal->j_journal); 313 313 if (status < 0) { 314 314 up_write(&journal->j_trans_barrier); ··· 1000 1000 1001 1001 if (ocfs2_mount_local(osb)) { 1002 1002 jbd2_journal_lock_updates(journal->j_journal); 1003 - status = jbd2_journal_flush(journal->j_journal); 1003 + status = jbd2_journal_flush(journal->j_journal, 0); 1004 1004 jbd2_journal_unlock_updates(journal->j_journal); 1005 1005 if (status < 0) 1006 1006 mlog_errno(status); ··· 1070 1070 1071 1071 if (replayed) { 1072 1072 jbd2_journal_lock_updates(journal->j_journal); 1073 - status = jbd2_journal_flush(journal->j_journal); 1073 + status = jbd2_journal_flush(journal->j_journal, 0); 1074 1074 jbd2_journal_unlock_updates(journal->j_journal); 1075 1075 if (status < 0) 1076 1076 mlog_errno(status); ··· 1666 1666 1667 1667 /* wipe the journal */ 1668 1668 jbd2_journal_lock_updates(journal); 1669 - status = jbd2_journal_flush(journal); 1669 + status = jbd2_journal_flush(journal, 0); 1670 1670 jbd2_journal_unlock_updates(journal); 1671 1671 if (status < 0) 1672 1672 mlog_errno(status);
-1
include/linux/fs.h
··· 2171 2171 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 2172 2172 struct dquot **(*get_dquots)(struct inode *); 2173 2173 #endif 2174 - int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); 2175 2174 long (*nr_cached_objects)(struct super_block *, 2176 2175 struct shrink_control *); 2177 2176 long (*free_cached_objects)(struct super_block *,
+42 -1
include/linux/jbd2.h
··· 780 780 unsigned long j_flags; 781 781 782 782 /** 783 + * @j_atomic_flags: Atomic journaling state flags. 784 + */ 785 + unsigned long j_atomic_flags; 786 + 787 + /** 783 788 * @j_errno: 784 789 * 785 790 * Is there an outstanding uncleared error on the journal (from a prior ··· 908 903 * @j_checkpoint_mutex. [j_checkpoint_mutex] 909 904 */ 910 905 struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; 906 + 907 + /** 908 + * @j_shrinker: 909 + * 910 + * Journal head shrinker, reclaim buffer's journal head which 911 + * has been written back. 912 + */ 913 + struct shrinker j_shrinker; 914 + 915 + /** 916 + * @j_jh_shrink_count: 917 + * 918 + * Number of journal buffers on the checkpoint list. [j_list_lock] 919 + */ 920 + struct percpu_counter j_jh_shrink_count; 921 + 922 + /** 923 + * @j_shrink_transaction: 924 + * 925 + * Record next transaction will shrink on the checkpoint list. 926 + * [j_list_lock] 927 + */ 928 + transaction_t *j_shrink_transaction; 911 929 912 930 /** 913 931 * @j_head: ··· 1398 1370 * mode */ 1399 1371 #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ 1400 1372 #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ 1373 + #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 1374 + #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 1375 + #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ 1376 + JBD2_JOURNAL_FLUSH_ZEROOUT) 1377 + 1378 + /* 1379 + * Journal atomic flag definitions 1380 + */ 1381 + #define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing 1382 + * buffer back to disk */ 1401 1383 1402 1384 /* 1403 1385 * Function declarations for the journaling transaction and buffer ··· 1445 1407 1446 1408 /* Checkpoint list management */ 1447 1409 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); 1410 + unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); 1448 1411 int __jbd2_journal_remove_checkpoint(struct journal_head *); 1449 1412 
void jbd2_journal_destroy_checkpoint(journal_t *journal); 1450 1413 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); ··· 1539 1500 struct page *, unsigned int, unsigned int); 1540 1501 extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); 1541 1502 extern int jbd2_journal_stop(handle_t *); 1542 - extern int jbd2_journal_flush (journal_t *); 1503 + extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); 1543 1504 extern void jbd2_journal_lock_updates (journal_t *); 1544 1505 extern void jbd2_journal_unlock_updates (journal_t *); 1545 1506 ··· 1556 1517 (journal_t *, unsigned long, unsigned long, unsigned long); 1557 1518 extern void jbd2_journal_clear_features 1558 1519 (journal_t *, unsigned long, unsigned long, unsigned long); 1520 + extern int jbd2_journal_register_shrinker(journal_t *journal); 1521 + extern void jbd2_journal_unregister_shrinker(journal_t *journal); 1559 1522 extern int jbd2_journal_load (journal_t *journal); 1560 1523 extern int jbd2_journal_destroy (journal_t *); 1561 1524 extern int jbd2_journal_recover (journal_t *journal);
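The three `JBD2_JOURNAL_FLUSH_*` macros above form a small flags contract for the new `jbd2_journal_flush()` signature: callers pass zero (flush only, no erase) or exactly one of `DISCARD`/`ZEROOUT`, and `__jbd2_journal_erase()` rejects anything else with `-EINVAL`. A standalone sketch of that validity check follows; `erase_flags_valid()` is an illustrative helper, not a kernel function:

```c
#include <assert.h>
#include <stdbool.h>

/* Flag values as defined in include/linux/jbd2.h. */
#define JBD2_JOURNAL_FLUSH_DISCARD 0x0001
#define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002
#define JBD2_JOURNAL_FLUSH_VALID   (JBD2_JOURNAL_FLUSH_DISCARD | \
                                    JBD2_JOURNAL_FLUSH_ZEROOUT)

/*
 * The validity check from __jbd2_journal_erase(): no unknown bits,
 * at least one operation chosen, and the two operations are mutually
 * exclusive.
 */
static bool erase_flags_valid(unsigned int flags)
{
    if (flags & ~JBD2_JOURNAL_FLUSH_VALID)      /* unknown bits set */
        return false;
    if (!flags)                                 /* no operation chosen */
        return false;
    if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
        (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))
        return false;                           /* mutually exclusive */
    return true;
}
```

Note that the zero-flags case is only an error inside `__jbd2_journal_erase()`; `jbd2_journal_flush()` itself treats `flags == 0` as "no erase" and skips the erase call entirely, which is why the ocfs2 call sites above simply pass `0`.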
+101
include/trace/events/jbd2.h
··· 394 394 __entry->stall_ms) 395 395 ); 396 396 397 + DECLARE_EVENT_CLASS(jbd2_journal_shrink, 398 + 399 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, 400 + unsigned long count), 401 + 402 + TP_ARGS(journal, nr_to_scan, count), 403 + 404 + TP_STRUCT__entry( 405 + __field(dev_t, dev) 406 + __field(unsigned long, nr_to_scan) 407 + __field(unsigned long, count) 408 + ), 409 + 410 + TP_fast_assign( 411 + __entry->dev = journal->j_fs_dev->bd_dev; 412 + __entry->nr_to_scan = nr_to_scan; 413 + __entry->count = count; 414 + ), 415 + 416 + TP_printk("dev %d,%d nr_to_scan %lu count %lu", 417 + MAJOR(__entry->dev), MINOR(__entry->dev), 418 + __entry->nr_to_scan, __entry->count) 419 + ); 420 + 421 + DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, 422 + 423 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), 424 + 425 + TP_ARGS(journal, nr_to_scan, count) 426 + ); 427 + 428 + DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, 429 + 430 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), 431 + 432 + TP_ARGS(journal, nr_to_scan, count) 433 + ); 434 + 435 + TRACE_EVENT(jbd2_shrink_scan_exit, 436 + 437 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, 438 + unsigned long nr_shrunk, unsigned long count), 439 + 440 + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), 441 + 442 + TP_STRUCT__entry( 443 + __field(dev_t, dev) 444 + __field(unsigned long, nr_to_scan) 445 + __field(unsigned long, nr_shrunk) 446 + __field(unsigned long, count) 447 + ), 448 + 449 + TP_fast_assign( 450 + __entry->dev = journal->j_fs_dev->bd_dev; 451 + __entry->nr_to_scan = nr_to_scan; 452 + __entry->nr_shrunk = nr_shrunk; 453 + __entry->count = count; 454 + ), 455 + 456 + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", 457 + MAJOR(__entry->dev), MINOR(__entry->dev), 458 + __entry->nr_to_scan, __entry->nr_shrunk, 459 + __entry->count) 460 + ); 461 + 462 + TRACE_EVENT(jbd2_shrink_checkpoint_list, 463 + 464 + 
TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, 465 + unsigned long nr_freed, unsigned long nr_scanned, 466 + tid_t next_tid), 467 + 468 + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, 469 + nr_scanned, next_tid), 470 + 471 + TP_STRUCT__entry( 472 + __field(dev_t, dev) 473 + __field(tid_t, first_tid) 474 + __field(tid_t, tid) 475 + __field(tid_t, last_tid) 476 + __field(unsigned long, nr_freed) 477 + __field(unsigned long, nr_scanned) 478 + __field(tid_t, next_tid) 479 + ), 480 + 481 + TP_fast_assign( 482 + __entry->dev = journal->j_fs_dev->bd_dev; 483 + __entry->first_tid = first_tid; 484 + __entry->tid = tid; 485 + __entry->last_tid = last_tid; 486 + __entry->nr_freed = nr_freed; 487 + __entry->nr_scanned = nr_scanned; 488 + __entry->next_tid = next_tid; 489 + ), 490 + 491 + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " 492 + "scanned %lu next transaction %u", 493 + MAJOR(__entry->dev), MINOR(__entry->dev), 494 + __entry->first_tid, __entry->tid, __entry->last_tid, 495 + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) 496 + ); 497 + 397 498 #endif /* _TRACE_JBD2_H */ 398 499 399 500 /* This part must be outside protection */