Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"New features for ext4 this cycle include support for encrypted
casefold. We also now ensure that deleted file names are cleared in
directory blocks by zeroing directory entries when they are unlinked
or moved as part of a hash tree node split, and improve the block
allocator's performance on a freshly mounted file system by
prefetching block bitmaps.

There are also the usual cleanups and bug fixes, including fixing a
page cache invalidation race when there is mixed buffered and direct
I/O and the block size is less than page size, and allowing the dax
flag to be set and cleared on inline directories"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits)
ext4: wipe ext4_dir_entry2 upon file deletion
ext4: Fix occasional generic/418 failure
fs: fix reporting supported extra file attributes for statx()
ext4: allow the dax flag to be set and cleared on inline directories
ext4: fix debug format string warning
ext4: fix trailing whitespace
ext4: fix various seppling typos
ext4: fix error return code in ext4_fc_perform_commit()
ext4: annotate data race in jbd2_journal_dirty_metadata()
ext4: annotate data race in start_this_handle()
ext4: fix ext4_error_err save negative errno into superblock
ext4: fix error code in ext4_commit_super
ext4: always panic when errors=panic is specified
ext4: delete redundant uptodate check for buffer
ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()
ext4: make prefetch_block_bitmaps default
ext4: add proc files to monitor new structures
ext4: improve cr 0 / cr 1 group scanning
ext4: add MB_NUM_ORDERS macro
ext4: add mballoc stats proc file
...

+1144 -427
+27
Documentation/filesystems/ext4/directory.rst
··· 121 121 * - 0x7 122 122 - Symbolic link. 123 123 124 + To support directories that are both encrypted and casefolded directories, we 125 + must also include hash information in the directory entry. We append 126 + ``ext4_extended_dir_entry_2`` to ``ext4_dir_entry_2`` except for the entries 127 + for dot and dotdot, which are kept the same. The structure follows immediately 128 + after ``name`` and is included in the size listed by ``rec_len`` If a directory 129 + entry uses this extension, it may be up to 271 bytes. 130 + 131 + .. list-table:: 132 + :widths: 8 8 24 40 133 + :header-rows: 1 134 + 135 + * - Offset 136 + - Size 137 + - Name 138 + - Description 139 + * - 0x0 140 + - \_\_le32 141 + - hash 142 + - The hash of the directory name 143 + * - 0x4 144 + - \_\_le32 145 + - minor\_hash 146 + - The minor hash of the directory name 147 + 148 + 124 149 In order to add checksums to these classic directory blocks, a phony 125 150 ``struct ext4_dir_entry`` is placed at the end of each leaf block to 126 151 hold the checksum. The directory entry is 12 bytes long. The inode ··· 347 322 - Half MD4, unsigned. 348 323 * - 0x5 349 324 - Tea, unsigned. 325 + * - 0x6 326 + - Siphash. 350 327 351 328 Interior nodes of an htree are recorded as ``struct dx_node``, which is 352 329 also the full length of a data block:
+1 -1
fs/ext4/balloc.c
··· 239 239 ext4_group_t block_group, 240 240 struct ext4_group_desc *gdp) 241 241 { 242 - return num_clusters_in_group(sb, block_group) - 242 + return num_clusters_in_group(sb, block_group) - 243 243 ext4_num_overhead_clusters(sb, block_group, gdp); 244 244 } 245 245
+30 -11
fs/ext4/dir.c
··· 55 55 return 0; 56 56 } 57 57 58 + static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) 59 + { 60 + /* Check if . or .. , or skip if namelen is 0 */ 61 + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && 62 + (de->name[1] == '.' || de->name[1] == '\0')) 63 + return true; 64 + /* Check if this is a csum entry */ 65 + if (de->file_type == EXT4_FT_DIR_CSUM) 66 + return true; 67 + return false; 68 + } 69 + 58 70 /* 59 71 * Return 0 if the directory entry is OK, and 1 if there is a problem 60 72 * ··· 85 73 const int rlen = ext4_rec_len_from_disk(de->rec_len, 86 74 dir->i_sb->s_blocksize); 87 75 const int next_offset = ((char *) de - buf) + rlen; 76 + bool fake = is_fake_dir_entry(de); 77 + bool has_csum = ext4_has_metadata_csum(dir->i_sb); 88 78 89 - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) 79 + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) 90 80 error_msg = "rec_len is smaller than minimal"; 91 81 else if (unlikely(rlen % 4 != 0)) 92 82 error_msg = "rec_len % 4 != 0"; 93 - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 83 + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, 84 + fake ? NULL : dir))) 94 85 error_msg = "rec_len is too small for name_len"; 95 86 else if (unlikely(next_offset > size)) 96 87 error_msg = "directory entry overrun"; 97 - else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && 88 + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, 89 + has_csum ? 
NULL : dir) && 98 90 next_offset != size)) 99 91 error_msg = "directory entry too close to block end"; 100 92 else if (unlikely(le32_to_cpu(de->inode) > ··· 110 94 if (filp) 111 95 ext4_error_file(filp, function, line, bh->b_blocknr, 112 96 "bad entry in directory: %s - offset=%u, " 113 - "inode=%u, rec_len=%d, name_len=%d, size=%d", 97 + "inode=%u, rec_len=%d, size=%d fake=%d", 114 98 error_msg, offset, le32_to_cpu(de->inode), 115 - rlen, de->name_len, size); 99 + rlen, size, fake); 116 100 else 117 101 ext4_error_inode(dir, function, line, bh->b_blocknr, 118 102 "bad entry in directory: %s - offset=%u, " 119 - "inode=%u, rec_len=%d, name_len=%d, size=%d", 103 + "inode=%u, rec_len=%d, size=%d fake=%d", 120 104 error_msg, offset, le32_to_cpu(de->inode), 121 - rlen, de->name_len, size); 105 + rlen, size, fake); 122 106 123 107 return 1; 124 108 } ··· 140 124 141 125 if (is_dx_dir(inode)) { 142 126 err = ext4_dx_readdir(file, ctx); 143 - if (err != ERR_BAD_DX_DIR) { 127 + if (err != ERR_BAD_DX_DIR) 144 128 return err; 145 - } 129 + 146 130 /* Can we just clear INDEX flag to ignore htree information? */ 147 131 if (!ext4_has_metadata_csum(sb)) { 148 132 /* ··· 240 224 * failure will be detected in the 241 225 * dirent test below. */ 242 226 if (ext4_rec_len_from_disk(de->rec_len, 243 - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) 227 + sb->s_blocksize) < ext4_dir_rec_len(1, 228 + inode)) 244 229 break; 245 230 i += ext4_rec_len_from_disk(de->rec_len, 246 231 sb->s_blocksize); ··· 282 265 283 266 /* Directory is encrypted */ 284 267 err = fscrypt_fname_disk_to_usr(inode, 285 - 0, 0, &de_name, &fstr); 268 + EXT4_DIRENT_HASH(de), 269 + EXT4_DIRENT_MINOR_HASH(de), 270 + &de_name, &fstr); 286 271 de_name = fstr; 287 272 fstr.len = save_len; 288 273 if (err)
+85 -22
fs/ext4/ext4.h
··· 162 162 #define EXT4_MB_USE_RESERVED 0x2000 163 163 /* Do strict check for free blocks while retrying block allocation */ 164 164 #define EXT4_MB_STRICT_CHECK 0x4000 165 - 165 + /* Large fragment size list lookup succeeded at least once for cr = 0 */ 166 + #define EXT4_MB_CR0_OPTIMIZED 0x8000 167 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ 168 + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 169 + /* Perform linear traversal for one group */ 170 + #define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 166 171 struct ext4_allocation_request { 167 172 /* target inode for block we're allocating */ 168 173 struct inode *inode; ··· 1218 1213 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 1219 1214 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 1220 1215 #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ 1221 - #define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 1216 + #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 1222 1217 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 1223 1218 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 1224 1219 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ ··· 1243 1238 #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ 1244 1239 #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ 1245 1240 #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ 1246 - 1241 + #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group 1242 + * scanning in mballoc 1243 + */ 1247 1244 1248 1245 #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1249 1246 ~EXT4_MOUNT_##opt ··· 1526 1519 unsigned int s_mb_free_pending; 1527 1520 struct list_head s_freed_data_list; /* List of blocks to be freed 1528 1521 after commit completed */ 1522 + struct rb_root s_mb_avg_fragment_size_root; 1523 + rwlock_t s_mb_rb_lock; 1524 + 
struct list_head *s_mb_largest_free_orders; 1525 + rwlock_t *s_mb_largest_free_orders_locks; 1529 1526 1530 1527 /* tunables */ 1531 1528 unsigned long s_stripe; 1529 + unsigned int s_mb_max_linear_groups; 1532 1530 unsigned int s_mb_stream_request; 1533 1531 unsigned int s_mb_max_to_scan; 1534 1532 unsigned int s_mb_min_to_scan; ··· 1553 1541 atomic_t s_bal_success; /* we found long enough chunks */ 1554 1542 atomic_t s_bal_allocated; /* in blocks */ 1555 1543 atomic_t s_bal_ex_scanned; /* total extents scanned */ 1544 + atomic_t s_bal_groups_scanned; /* number of groups scanned */ 1556 1545 atomic_t s_bal_goals; /* goal hits */ 1557 1546 atomic_t s_bal_breaks; /* too long searches */ 1558 1547 atomic_t s_bal_2orders; /* 2^order hits */ 1559 - spinlock_t s_bal_lock; 1560 - unsigned long s_mb_buddies_generated; 1561 - unsigned long long s_mb_generation_time; 1548 + atomic_t s_bal_cr0_bad_suggestions; 1549 + atomic_t s_bal_cr1_bad_suggestions; 1550 + atomic64_t s_bal_cX_groups_considered[4]; 1551 + atomic64_t s_bal_cX_hits[4]; 1552 + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ 1553 + atomic_t s_mb_buddies_generated; /* number of buddies generated */ 1554 + atomic64_t s_mb_generation_time; 1562 1555 atomic_t s_mb_lost_chunks; 1563 1556 atomic_t s_mb_preallocated; 1564 1557 atomic_t s_mb_discarded; ··· 2204 2187 char name[EXT4_NAME_LEN]; /* File name */ 2205 2188 }; 2206 2189 2190 + 2191 + /* 2192 + * Encrypted Casefolded entries require saving the hash on disk. This structure 2193 + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned 2194 + * boundary. 2195 + */ 2196 + struct ext4_dir_entry_hash { 2197 + __le32 hash; 2198 + __le32 minor_hash; 2199 + }; 2200 + 2207 2201 /* 2208 2202 * The new version of the directory entry. 
Since EXT4 structures are 2209 2203 * stored in intel byte order, and the name_len field could never be ··· 2228 2200 __u8 file_type; /* See file type macros EXT4_FT_* below */ 2229 2201 char name[EXT4_NAME_LEN]; /* File name */ 2230 2202 }; 2203 + 2204 + /* 2205 + * Access the hashes at the end of ext4_dir_entry_2 2206 + */ 2207 + #define EXT4_DIRENT_HASHES(entry) \ 2208 + ((struct ext4_dir_entry_hash *) \ 2209 + (((void *)(entry)) + \ 2210 + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) 2211 + #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) 2212 + #define EXT4_DIRENT_MINOR_HASH(entry) \ 2213 + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) 2214 + 2215 + static inline bool ext4_hash_in_dirent(const struct inode *inode) 2216 + { 2217 + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); 2218 + } 2231 2219 2232 2220 /* 2233 2221 * This is a bogus directory entry at the end of each leaf block that ··· 2286 2242 */ 2287 2243 #define EXT4_DIR_PAD 4 2288 2244 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) 2289 - #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ 2290 - ~EXT4_DIR_ROUND) 2291 2245 #define EXT4_MAX_REC_LEN ((1<<16)-1) 2246 + 2247 + /* 2248 + * The rec_len is dependent on the type of directory. Directories that are 2249 + * casefolded and encrypted need to store the hash as well, so we add room for 2250 + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should 2251 + * pass NULL for dir, as those entries do not use the extra fields. 
2252 + */ 2253 + static inline unsigned int ext4_dir_rec_len(__u8 name_len, 2254 + const struct inode *dir) 2255 + { 2256 + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); 2257 + 2258 + if (dir && ext4_hash_in_dirent(dir)) 2259 + rec_len += sizeof(struct ext4_dir_entry_hash); 2260 + return (rec_len & ~EXT4_DIR_ROUND); 2261 + } 2292 2262 2293 2263 /* 2294 2264 * If we ever get support for fs block sizes > page_size, we'll need ··· 2360 2302 #define DX_HASH_LEGACY_UNSIGNED 3 2361 2303 #define DX_HASH_HALF_MD4_UNSIGNED 4 2362 2304 #define DX_HASH_TEA_UNSIGNED 5 2305 + #define DX_HASH_SIPHASH 6 2363 2306 2364 2307 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, 2365 2308 const void *address, unsigned int length) ··· 2415 2356 }; 2416 2357 2417 2358 #define fname_name(p) ((p)->disk_name.name) 2359 + #define fname_usr_name(p) ((p)->usr_fname->name) 2418 2360 #define fname_len(p) ((p)->disk_name.len) 2419 2361 2420 2362 /* ··· 2646 2586 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 2647 2587 2648 2588 #ifdef CONFIG_UNICODE 2649 - extern void ext4_fname_setup_ci_filename(struct inode *dir, 2589 + extern int ext4_fname_setup_ci_filename(struct inode *dir, 2650 2590 const struct qstr *iname, 2651 - struct fscrypt_str *fname); 2591 + struct ext4_filename *fname); 2652 2592 #endif 2653 2593 2654 2594 #ifdef CONFIG_FS_ENCRYPTION ··· 2679 2619 ext4_fname_from_fscrypt_name(fname, &name); 2680 2620 2681 2621 #ifdef CONFIG_UNICODE 2682 - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); 2622 + err = ext4_fname_setup_ci_filename(dir, iname, fname); 2683 2623 #endif 2684 - return 0; 2624 + return err; 2685 2625 } 2686 2626 2687 2627 static inline int ext4_fname_prepare_lookup(struct inode *dir, ··· 2698 2638 ext4_fname_from_fscrypt_name(fname, &name); 2699 2639 2700 2640 #ifdef CONFIG_UNICODE 2701 - ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); 2641 + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); 2702 2642 
#endif 2703 - return 0; 2643 + return err; 2704 2644 } 2705 2645 2706 2646 static inline void ext4_fname_free_filename(struct ext4_filename *fname) ··· 2725 2665 int lookup, 2726 2666 struct ext4_filename *fname) 2727 2667 { 2668 + int err = 0; 2728 2669 fname->usr_fname = iname; 2729 2670 fname->disk_name.name = (unsigned char *) iname->name; 2730 2671 fname->disk_name.len = iname->len; 2731 2672 2732 2673 #ifdef CONFIG_UNICODE 2733 - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); 2674 + err = ext4_fname_setup_ci_filename(dir, iname, fname); 2734 2675 #endif 2735 2676 2736 - return 0; 2677 + return err; 2737 2678 } 2738 2679 2739 2680 static inline int ext4_fname_prepare_lookup(struct inode *dir, ··· 2759 2698 struct ext4_dir_entry_2 *, 2760 2699 struct buffer_head *, char *, int, 2761 2700 unsigned int); 2762 - #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ 2701 + #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ 2763 2702 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 2764 - (de), (bh), (buf), (size), (offset))) 2703 + (de), (bh), (buf), (size), (offset))) 2765 2704 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 2766 2705 __u32 minor_hash, 2767 2706 struct ext4_dir_entry_2 *dirent, ··· 2772 2711 void *buf, int buf_size, 2773 2712 struct ext4_filename *fname, 2774 2713 struct ext4_dir_entry_2 **dest_de); 2775 - void ext4_insert_dentry(struct inode *inode, 2714 + void ext4_insert_dentry(struct inode *dir, struct inode *inode, 2776 2715 struct ext4_dir_entry_2 *de, 2777 2716 int buf_size, 2778 2717 struct ext4_filename *fname); ··· 2863 2802 2864 2803 /* mballoc.c */ 2865 2804 extern const struct seq_operations ext4_mb_seq_groups_ops; 2805 + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; 2866 2806 extern long ext4_mb_stats; 2867 2807 extern long ext4_mb_max_to_scan; 2808 + extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); 2868 
2809 extern int ext4_mb_init(struct super_block *); 2869 2810 extern int ext4_mb_release(struct super_block *); 2870 2811 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, ··· 3369 3306 ext4_grpblk_t bb_free; /* total free blocks */ 3370 3307 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 3371 3308 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ 3309 + ext4_group_t bb_group; /* Group number */ 3372 3310 struct list_head bb_prealloc_list; 3373 3311 #ifdef DOUBLE_CHECK 3374 3312 void *bb_bitmap; 3375 3313 #endif 3376 3314 struct rw_semaphore alloc_sem; 3315 + struct rb_node bb_avg_fragment_size_rb; 3316 + struct list_head bb_largest_free_order_node; 3377 3317 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block 3378 3318 * regions, index is order. 3379 3319 * bb_counters[3] = 5 means ··· 3579 3513 unsigned int blocksize); 3580 3514 extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, 3581 3515 struct buffer_head *bh); 3582 - extern int ext4_ci_compare(const struct inode *parent, 3583 - const struct qstr *fname, 3584 - const struct qstr *entry, bool quick); 3585 3516 extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, 3586 3517 struct inode *inode); 3587 3518 extern int __ext4_link(struct inode *dir, struct inode *inode,
+5 -3
fs/ext4/fast_commit.c
··· 66 66 * Fast Commit Ineligibility 67 67 * ------------------------- 68 68 * Not all operations are supported by fast commits today (e.g extended 69 - * attributes). Fast commit ineligiblity is marked by calling one of the 69 + * attributes). Fast commit ineligibility is marked by calling one of the 70 70 * two following functions: 71 71 * 72 72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall ··· 1088 1088 head.fc_tid = cpu_to_le32( 1089 1089 sbi->s_journal->j_running_transaction->t_tid); 1090 1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1091 - (u8 *)&head, &crc)) 1091 + (u8 *)&head, &crc)) { 1092 + ret = -ENOSPC; 1092 1093 goto out; 1094 + } 1093 1095 } 1094 1096 1095 1097 spin_lock(&sbi->s_fc_lock); ··· 1736 1734 } 1737 1735 1738 1736 /* Range is mapped and needs a state change */ 1739 - jbd_debug(1, "Converting from %d to %d %lld", 1737 + jbd_debug(1, "Converting from %ld to %d %lld", 1740 1738 map.m_flags & EXT4_MAP_UNWRITTEN, 1741 1739 ext4_ext_is_unwritten(ex), map.m_pblk); 1742 1740 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+21 -4
fs/ext4/file.c
··· 371 371 static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, 372 372 int error, unsigned int flags) 373 373 { 374 - loff_t offset = iocb->ki_pos; 374 + loff_t pos = iocb->ki_pos; 375 375 struct inode *inode = file_inode(iocb->ki_filp); 376 376 377 377 if (error) 378 378 return error; 379 379 380 - if (size && flags & IOMAP_DIO_UNWRITTEN) 381 - return ext4_convert_unwritten_extents(NULL, inode, 382 - offset, size); 380 + if (size && flags & IOMAP_DIO_UNWRITTEN) { 381 + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); 382 + if (error < 0) 383 + return error; 384 + } 385 + /* 386 + * If we are extending the file, we have to update i_size here before 387 + * page cache gets invalidated in iomap_dio_rw(). Otherwise racing 388 + * buffered reads could zero out too much from page cache pages. Update 389 + * of on-disk size will happen later in ext4_dio_write_iter() where 390 + * we have enough information to also perform orphan list handling etc. 391 + * Note that we perform all extending writes synchronously under 392 + * i_rwsem held exclusively so i_size update is safe here in that case. 393 + * If the write was not extending, we cannot see pos > i_size here 394 + * because operations reducing i_size like truncate wait for all 395 + * outstanding DIO before updating i_size. 396 + */ 397 + pos += size; 398 + if (pos > i_size_read(inode)) 399 + i_size_write(inode, pos); 383 400 384 401 return 0; 385 402 }
+21 -4
fs/ext4/hash.c
··· 197 197 * represented, and whether or not the returned hash is 32 bits or 64 198 198 * bits. 32 bit hashes will return 0 for the minor hash. 199 199 */ 200 - static int __ext4fs_dirhash(const char *name, int len, 200 + static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, 201 201 struct dx_hash_info *hinfo) 202 202 { 203 203 __u32 hash; ··· 259 259 hash = buf[0]; 260 260 minor_hash = buf[1]; 261 261 break; 262 + case DX_HASH_SIPHASH: 263 + { 264 + struct qstr qname = QSTR_INIT(name, len); 265 + __u64 combined_hash; 266 + 267 + if (fscrypt_has_encryption_key(dir)) { 268 + combined_hash = fscrypt_fname_siphash(dir, &qname); 269 + } else { 270 + ext4_warning_inode(dir, "Siphash requires key"); 271 + return -1; 272 + } 273 + 274 + hash = (__u32)(combined_hash >> 32); 275 + minor_hash = (__u32)combined_hash; 276 + break; 277 + } 262 278 default: 263 279 hinfo->hash = 0; 264 280 return -1; ··· 296 280 unsigned char *buff; 297 281 struct qstr qstr = {.name = name, .len = len }; 298 282 299 - if (len && IS_CASEFOLDED(dir) && um) { 283 + if (len && IS_CASEFOLDED(dir) && um && 284 + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { 300 285 buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); 301 286 if (!buff) 302 287 return -ENOMEM; ··· 308 291 goto opaque_seq; 309 292 } 310 293 311 - r = __ext4fs_dirhash(buff, dlen, hinfo); 294 + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); 312 295 313 296 kfree(buff); 314 297 return r; 315 298 } 316 299 opaque_seq: 317 300 #endif 318 - return __ext4fs_dirhash(name, len, hinfo); 301 + return __ext4fs_dirhash(dir, name, len, hinfo); 319 302 }
+33 -16
fs/ext4/ialloc.c
··· 1292 1292 1293 1293 ei->i_extra_isize = sbi->s_want_extra_isize; 1294 1294 ei->i_inline_off = 0; 1295 - if (ext4_has_feature_inline_data(sb)) 1295 + if (ext4_has_feature_inline_data(sb) && 1296 + (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) 1296 1297 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1297 1298 ret = inode; 1298 1299 err = dquot_alloc_inode(inode); ··· 1514 1513 handle_t *handle; 1515 1514 ext4_fsblk_t blk; 1516 1515 int num, ret = 0, used_blks = 0; 1516 + unsigned long used_inos = 0; 1517 1517 1518 1518 /* This should not happen, but just to be sure check this */ 1519 1519 if (sb_rdonly(sb)) { ··· 1545 1543 * used inodes so we need to skip blocks with used inodes in 1546 1544 * inode table. 1547 1545 */ 1548 - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) 1549 - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - 1550 - ext4_itable_unused_count(sb, gdp)), 1551 - sbi->s_inodes_per_block); 1546 + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { 1547 + used_inos = EXT4_INODES_PER_GROUP(sb) - 1548 + ext4_itable_unused_count(sb, gdp); 1549 + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); 1552 1550 1553 - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || 1554 - ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - 1555 - ext4_itable_unused_count(sb, gdp)) < 1556 - EXT4_FIRST_INO(sb)))) { 1557 - ext4_error(sb, "Something is wrong with group %u: " 1558 - "used itable blocks: %d; " 1559 - "itable unused count: %u", 1560 - group, used_blks, 1561 - ext4_itable_unused_count(sb, gdp)); 1562 - ret = 1; 1563 - goto err_out; 1551 + /* Bogus inode unused count? 
*/ 1552 + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { 1553 + ext4_error(sb, "Something is wrong with group %u: " 1554 + "used itable blocks: %d; " 1555 + "itable unused count: %u", 1556 + group, used_blks, 1557 + ext4_itable_unused_count(sb, gdp)); 1558 + ret = 1; 1559 + goto err_out; 1560 + } 1561 + 1562 + used_inos += group * EXT4_INODES_PER_GROUP(sb); 1563 + /* 1564 + * Are there some uninitialized inodes in the inode table 1565 + * before the first normal inode? 1566 + */ 1567 + if ((used_blks != sbi->s_itb_per_group) && 1568 + (used_inos < EXT4_FIRST_INO(sb))) { 1569 + ext4_error(sb, "Something is wrong with group %u: " 1570 + "itable unused count: %u; " 1571 + "itables initialized count: %ld", 1572 + group, ext4_itable_unused_count(sb, gdp), 1573 + used_inos); 1574 + ret = 1; 1575 + goto err_out; 1576 + } 1564 1577 } 1565 1578 1566 1579 blk = ext4_inode_table(sb, gdp) + used_blks;
+1 -1
fs/ext4/indirect.c
··· 705 705 706 706 /* 707 707 * Truncate transactions can be complex and absolutely huge. So we need to 708 - * be able to restart the transaction at a conventient checkpoint to make 708 + * be able to restart the transaction at a convenient checkpoint to make 709 709 * sure we don't overflow the journal. 710 710 * 711 711 * Try to extend this transaction for the purposes of truncation. If
+16 -11
fs/ext4/inline.c
··· 795 795 * clear the inode state safely. 796 796 * 2. The inode has inline data, then we need to read the data, make it 797 797 * update and dirty so that ext4_da_writepages can handle it. We don't 798 - * need to start the journal since the file's metatdata isn't changed now. 798 + * need to start the journal since the file's metadata isn't changed now. 799 799 */ 800 800 static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, 801 801 struct inode *inode, ··· 1031 1031 err = ext4_journal_get_write_access(handle, iloc->bh); 1032 1032 if (err) 1033 1033 return err; 1034 - ext4_insert_dentry(inode, de, inline_size, fname); 1034 + ext4_insert_dentry(dir, inode, de, inline_size, fname); 1035 1035 1036 1036 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 1037 1037 ··· 1100 1100 int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; 1101 1101 int new_size = get_max_inline_xattr_value_size(dir, iloc); 1102 1102 1103 - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) 1103 + if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) 1104 1104 return -ENOSPC; 1105 1105 1106 1106 ret = ext4_update_inline_data(handle, dir, ··· 1380 1380 fake.name_len = 1; 1381 1381 strcpy(fake.name, "."); 1382 1382 fake.rec_len = ext4_rec_len_to_disk( 1383 - EXT4_DIR_REC_LEN(fake.name_len), 1384 - inline_size); 1383 + ext4_dir_rec_len(fake.name_len, NULL), 1384 + inline_size); 1385 1385 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1386 1386 de = &fake; 1387 1387 pos = EXT4_INLINE_DOTDOT_OFFSET; ··· 1390 1390 fake.name_len = 2; 1391 1391 strcpy(fake.name, ".."); 1392 1392 fake.rec_len = ext4_rec_len_to_disk( 1393 - EXT4_DIR_REC_LEN(fake.name_len), 1394 - inline_size); 1393 + ext4_dir_rec_len(fake.name_len, NULL), 1394 + inline_size); 1395 1395 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1396 1396 de = &fake; 1397 1397 pos = EXT4_INLINE_DOTDOT_SIZE; ··· 1406 1406 } 1407 1407 } 1408 1408 1409 - ext4fs_dirhash(dir, de->name, de->name_len, 
hinfo); 1409 + if (ext4_hash_in_dirent(dir)) { 1410 + hinfo->hash = EXT4_DIRENT_HASH(de); 1411 + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); 1412 + } else { 1413 + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1414 + } 1410 1415 if ((hinfo->hash < start_hash) || 1411 1416 ((hinfo->hash == start_hash) && 1412 1417 (hinfo->minor_hash < start_minor_hash))) ··· 1493 1488 * So we will use extra_offset and extra_size to indicate them 1494 1489 * during the inline dir iteration. 1495 1490 */ 1496 - dotdot_offset = EXT4_DIR_REC_LEN(1); 1497 - dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); 1491 + dotdot_offset = ext4_dir_rec_len(1, NULL); 1492 + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); 1498 1493 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; 1499 1494 extra_size = extra_offset + inline_size; 1500 1495 ··· 1529 1524 * failure will be detected in the 1530 1525 * dirent test below. */ 1531 1526 if (ext4_rec_len_from_disk(de->rec_len, extra_size) 1532 - < EXT4_DIR_REC_LEN(1)) 1527 + < ext4_dir_rec_len(1, NULL)) 1533 1528 break; 1534 1529 i += ext4_rec_len_from_disk(de->rec_len, 1535 1530 extra_size);
+3 -5
fs/ext4/inode.c
··· 1066 1066 block_end = block_start + blocksize; 1067 1067 if (block_end <= from || block_start >= to) { 1068 1068 if (PageUptodate(page)) { 1069 - if (!buffer_uptodate(bh)) 1070 - set_buffer_uptodate(bh); 1069 + set_buffer_uptodate(bh); 1071 1070 } 1072 1071 continue; 1073 1072 } ··· 1091 1092 } 1092 1093 } 1093 1094 if (PageUptodate(page)) { 1094 - if (!buffer_uptodate(bh)) 1095 - set_buffer_uptodate(bh); 1095 + set_buffer_uptodate(bh); 1096 1096 continue; 1097 1097 } 1098 1098 if (!buffer_uptodate(bh) && !buffer_delay(bh) && ··· 3822 3824 * starting from file offset 'from'. The range to be zero'd must 3823 3825 * be contained with in one block. If the specified range exceeds 3824 3826 * the end of the block it will be shortened to end of the block 3825 - * that cooresponds to 'from' 3827 + * that corresponds to 'from' 3826 3828 */ 3827 3829 static int ext4_block_zero_page_range(handle_t *handle, 3828 3830 struct address_space *mapping, loff_t from, loff_t length)
+6
fs/ext4/ioctl.c
··· 316 316 static bool dax_compatible(struct inode *inode, unsigned int oldflags, 317 317 unsigned int flags) 318 318 { 319 + /* Allow the DAX flag to be changed on inline directories */ 320 + if (S_ISDIR(inode->i_mode)) { 321 + flags &= ~EXT4_INLINE_DATA_FL; 322 + oldflags &= ~EXT4_INLINE_DATA_FL; 323 + } 324 + 319 325 if (flags & EXT4_DAX_FL) { 320 326 if ((oldflags & EXT4_DAX_MUT_EXCL) || 321 327 ext4_test_inode_state(inode,
+562 -30
fs/ext4/mballoc.c
··· 127 127 * smallest multiple of the stripe value (sbi->s_stripe) which is 128 128 * greater than the default mb_group_prealloc. 129 129 * 130 + * If "mb_optimize_scan" mount option is set, we maintain in memory group info 131 + * structures in two data structures: 132 + * 133 + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) 134 + * 135 + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) 136 + * 137 + * This is an array of lists where the index in the array represents the 138 + * largest free order in the buddy bitmap of the participating group infos of 139 + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total 140 + * number of buddy bitmap orders possible) number of lists. Group-infos are 141 + * placed in appropriate lists. 142 + * 143 + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) 144 + * 145 + * Locking: sbi->s_mb_rb_lock (rwlock) 146 + * 147 + * This is a red black tree consisting of group infos and the tree is sorted 148 + * by average fragment sizes (which is calculated as ext4_group_info->bb_free 149 + * / ext4_group_info->bb_fragments). 150 + * 151 + * When "mb_optimize_scan" mount option is set, mballoc consults the above data 152 + * structures to decide the order in which groups are to be traversed for 153 + * fulfilling an allocation request. 154 + * 155 + * At CR = 0, we look for groups which have the largest_free_order >= the order 156 + * of the request. We directly look at the largest free order list in the data 157 + * structure (1) above where largest_free_order = order of the request. If that 158 + * list is empty, we look at remaining list in the increasing order of 159 + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. 160 + * 161 + * At CR = 1, we only consider groups where average fragment size > request 162 + * size. 
So, we lookup a group which has average fragment size just above or 163 + * equal to request size using our rb tree (data structure 2) in O(log N) time. 164 + * 165 + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 166 + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 167 + * 130 168 * The regular allocator (using the buddy cache) supports a few tunables. 131 169 * 132 170 * /sys/fs/ext4/<partition>/mb_min_to_scan 133 171 * /sys/fs/ext4/<partition>/mb_max_to_scan 134 172 * /sys/fs/ext4/<partition>/mb_order2_req 173 + * /sys/fs/ext4/<partition>/mb_linear_limit 135 174 * 136 175 * The regular allocator uses buddy scan only if the request len is power of 137 176 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The ··· 187 148 * ac_g_ex. Each group is first checked based on the criteria whether it 188 149 * can be used for allocation. ext4_mb_good_group explains how the groups are 189 150 * checked. 151 + * 152 + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not 153 + * get traversed linearly. That may result in subsequent allocations being not 154 + * close to each other. And so, the underlying device may get filled up in a 155 + * non-linear fashion. While that may not matter on non-rotational devices, for 156 + * rotational devices that may result in higher seek times. "mb_linear_limit" 157 + * tells mballoc how many groups mballoc should search linearly before 158 + * performing consulting above data structures for more efficient lookups. For 159 + * non rotational devices, this value defaults to 0 and for rotational devices 160 + * this is set to MB_DEFAULT_LINEAR_LIMIT. 190 161 * 191 162 * Both the prealloc space are getting populated as above. 
So for the first 192 163 * request we will hit the buddy cache which will result in this prealloc ··· 348 299 * - bitlock on a group (group) 349 300 * - object (inode/locality) (object) 350 301 * - per-pa lock (pa) 302 + * - cr0 lists lock (cr0) 303 + * - cr1 tree lock (cr1) 351 304 * 352 305 * Paths: 353 306 * - new pa ··· 379 328 * group 380 329 * object 381 330 * 331 + * - allocation path (ext4_mb_regular_allocator) 332 + * group 333 + * cr0/cr1 382 334 */ 383 335 static struct kmem_cache *ext4_pspace_cachep; 384 336 static struct kmem_cache *ext4_ac_cachep; ··· 404 350 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 405 351 ext4_group_t group); 406 352 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 353 + 354 + static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 355 + ext4_group_t group, int cr); 407 356 408 357 /* 409 358 * The algorithm using this percpu seq counter goes below: ··· 801 744 } 802 745 } 803 746 747 + static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, 748 + int (*cmp)(struct rb_node *, struct rb_node *)) 749 + { 750 + struct rb_node **iter = &root->rb_node, *parent = NULL; 751 + 752 + while (*iter) { 753 + parent = *iter; 754 + if (cmp(new, *iter) > 0) 755 + iter = &((*iter)->rb_left); 756 + else 757 + iter = &((*iter)->rb_right); 758 + } 759 + 760 + rb_link_node(new, parent, iter); 761 + rb_insert_color(new, root); 762 + } 763 + 764 + static int 765 + ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) 766 + { 767 + struct ext4_group_info *grp1 = rb_entry(rb1, 768 + struct ext4_group_info, 769 + bb_avg_fragment_size_rb); 770 + struct ext4_group_info *grp2 = rb_entry(rb2, 771 + struct ext4_group_info, 772 + bb_avg_fragment_size_rb); 773 + int num_frags_1, num_frags_2; 774 + 775 + num_frags_1 = grp1->bb_fragments ? 776 + grp1->bb_free / grp1->bb_fragments : 0; 777 + num_frags_2 = grp2->bb_fragments ? 
778 + grp2->bb_free / grp2->bb_fragments : 0; 779 + 780 + return (num_frags_2 - num_frags_1); 781 + } 782 + 783 + /* 784 + * Reinsert grpinfo into the avg_fragment_size tree with new average 785 + * fragment size. 786 + */ 787 + static void 788 + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 789 + { 790 + struct ext4_sb_info *sbi = EXT4_SB(sb); 791 + 792 + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 793 + return; 794 + 795 + write_lock(&sbi->s_mb_rb_lock); 796 + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { 797 + rb_erase(&grp->bb_avg_fragment_size_rb, 798 + &sbi->s_mb_avg_fragment_size_root); 799 + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); 800 + } 801 + 802 + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, 803 + &grp->bb_avg_fragment_size_rb, 804 + ext4_mb_avg_fragment_size_cmp); 805 + write_unlock(&sbi->s_mb_rb_lock); 806 + } 807 + 808 + /* 809 + * Choose next group by traversing largest_free_order lists. Updates *new_cr if 810 + * cr level needs an update. 
811 + */ 812 + static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, 813 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 814 + { 815 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 816 + struct ext4_group_info *iter, *grp; 817 + int i; 818 + 819 + if (ac->ac_status == AC_STATUS_FOUND) 820 + return; 821 + 822 + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) 823 + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); 824 + 825 + grp = NULL; 826 + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 827 + if (list_empty(&sbi->s_mb_largest_free_orders[i])) 828 + continue; 829 + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 830 + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 831 + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 832 + continue; 833 + } 834 + grp = NULL; 835 + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 836 + bb_largest_free_order_node) { 837 + if (sbi->s_mb_stats) 838 + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); 839 + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { 840 + grp = iter; 841 + break; 842 + } 843 + } 844 + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 845 + if (grp) 846 + break; 847 + } 848 + 849 + if (!grp) { 850 + /* Increment cr and search again */ 851 + *new_cr = 1; 852 + } else { 853 + *group = grp->bb_group; 854 + ac->ac_last_optimal_group = *group; 855 + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 856 + } 857 + } 858 + 859 + /* 860 + * Choose next group by traversing average fragment size tree. Updates *new_cr 861 + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that 862 + * the linear search should continue for one iteration since there's lock 863 + * contention on the rb tree lock. 
864 + */ 865 + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 866 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 867 + { 868 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 869 + int avg_fragment_size, best_so_far; 870 + struct rb_node *node, *found; 871 + struct ext4_group_info *grp; 872 + 873 + /* 874 + * If there is contention on the lock, instead of waiting for the lock 875 + * to become available, just continue searching lineraly. We'll resume 876 + * our rb tree search later starting at ac->ac_last_optimal_group. 877 + */ 878 + if (!read_trylock(&sbi->s_mb_rb_lock)) { 879 + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; 880 + return; 881 + } 882 + 883 + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 884 + if (sbi->s_mb_stats) 885 + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 886 + /* We have found something at CR 1 in the past */ 887 + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); 888 + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; 889 + found = rb_next(found)) { 890 + grp = rb_entry(found, struct ext4_group_info, 891 + bb_avg_fragment_size_rb); 892 + if (sbi->s_mb_stats) 893 + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 894 + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) 895 + break; 896 + } 897 + goto done; 898 + } 899 + 900 + node = sbi->s_mb_avg_fragment_size_root.rb_node; 901 + best_so_far = 0; 902 + found = NULL; 903 + 904 + while (node) { 905 + grp = rb_entry(node, struct ext4_group_info, 906 + bb_avg_fragment_size_rb); 907 + avg_fragment_size = 0; 908 + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { 909 + avg_fragment_size = grp->bb_fragments ? 
910 + grp->bb_free / grp->bb_fragments : 0; 911 + if (!best_so_far || avg_fragment_size < best_so_far) { 912 + best_so_far = avg_fragment_size; 913 + found = node; 914 + } 915 + } 916 + if (avg_fragment_size > ac->ac_g_ex.fe_len) 917 + node = node->rb_right; 918 + else 919 + node = node->rb_left; 920 + } 921 + 922 + done: 923 + if (found) { 924 + grp = rb_entry(found, struct ext4_group_info, 925 + bb_avg_fragment_size_rb); 926 + *group = grp->bb_group; 927 + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 928 + } else { 929 + *new_cr = 2; 930 + } 931 + 932 + read_unlock(&sbi->s_mb_rb_lock); 933 + ac->ac_last_optimal_group = *group; 934 + } 935 + 936 + static inline int should_optimize_scan(struct ext4_allocation_context *ac) 937 + { 938 + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) 939 + return 0; 940 + if (ac->ac_criteria >= 2) 941 + return 0; 942 + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) 943 + return 0; 944 + return 1; 945 + } 946 + 947 + /* 948 + * Return next linear group for allocation. If linear traversal should not be 949 + * performed, this function just returns the same group 950 + */ 951 + static int 952 + next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) 953 + { 954 + if (!should_optimize_scan(ac)) 955 + goto inc_and_return; 956 + 957 + if (ac->ac_groups_linear_remaining) { 958 + ac->ac_groups_linear_remaining--; 959 + goto inc_and_return; 960 + } 961 + 962 + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { 963 + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; 964 + goto inc_and_return; 965 + } 966 + 967 + return group; 968 + inc_and_return: 969 + /* 970 + * Artificially restricted ngroups for non-extent 971 + * files makes group > ngroups possible on first loop. 972 + */ 973 + return group + 1 >= ngroups ? 0 : group + 1; 974 + } 975 + 976 + /* 977 + * ext4_mb_choose_next_group: choose next group for allocation. 978 + * 979 + * @ac Allocation Context 980 + * @new_cr This is an output parameter. 
If the there is no good group 981 + * available at current CR level, this field is updated to indicate 982 + * the new cr level that should be used. 983 + * @group This is an input / output parameter. As an input it indicates the 984 + * next group that the allocator intends to use for allocation. As 985 + * output, this field indicates the next group that should be used as 986 + * determined by the optimization functions. 987 + * @ngroups Total number of groups 988 + */ 989 + static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 990 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 991 + { 992 + *new_cr = ac->ac_criteria; 993 + 994 + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) 995 + return; 996 + 997 + if (*new_cr == 0) { 998 + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 999 + } else if (*new_cr == 1) { 1000 + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1001 + } else { 1002 + /* 1003 + * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1004 + * bb_free. But until that happens, we should never come here. 1005 + */ 1006 + WARN_ON(1); 1007 + } 1008 + } 1009 + 804 1010 /* 805 1011 * Cache the order of the largest free extent we have available in this block 806 1012 * group. 
··· 1071 751 static void 1072 752 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1073 753 { 754 + struct ext4_sb_info *sbi = EXT4_SB(sb); 1074 755 int i; 1075 - int bits; 1076 756 757 + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { 758 + write_lock(&sbi->s_mb_largest_free_orders_locks[ 759 + grp->bb_largest_free_order]); 760 + list_del_init(&grp->bb_largest_free_order_node); 761 + write_unlock(&sbi->s_mb_largest_free_orders_locks[ 762 + grp->bb_largest_free_order]); 763 + } 1077 764 grp->bb_largest_free_order = -1; /* uninit */ 1078 765 1079 - bits = sb->s_blocksize_bits + 1; 1080 - for (i = bits; i >= 0; i--) { 766 + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { 1081 767 if (grp->bb_counters[i] > 0) { 1082 768 grp->bb_largest_free_order = i; 1083 769 break; 1084 770 } 771 + } 772 + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && 773 + grp->bb_largest_free_order >= 0 && grp->bb_free) { 774 + write_lock(&sbi->s_mb_largest_free_orders_locks[ 775 + grp->bb_largest_free_order]); 776 + list_add_tail(&grp->bb_largest_free_order_node, 777 + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 778 + write_unlock(&sbi->s_mb_largest_free_orders_locks[ 779 + grp->bb_largest_free_order]); 1085 780 } 1086 781 } 1087 782 ··· 1151 816 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1152 817 1153 818 period = get_cycles() - period; 1154 - spin_lock(&sbi->s_bal_lock); 1155 - sbi->s_mb_buddies_generated++; 1156 - sbi->s_mb_generation_time += period; 1157 - spin_unlock(&sbi->s_bal_lock); 819 + atomic_inc(&sbi->s_mb_buddies_generated); 820 + atomic64_add(period, &sbi->s_mb_generation_time); 821 + mb_update_avg_fragment_size(sb, grp); 1158 822 } 1159 823 1160 824 /* The buddy information is attached the buddy cache inode ··· 1293 959 grinfo->bb_fragments = 0; 1294 960 memset(grinfo->bb_counters, 0, 1295 961 sizeof(*grinfo->bb_counters) * 1296 - (sb->s_blocksize_bits+2)); 962 + (MB_NUM_ORDERS(sb))); 1297 963 /* 1298 
964 * incore got set to the group block bitmap below 1299 965 */ ··· 1853 1519 1854 1520 done: 1855 1521 mb_set_largest_free_order(sb, e4b->bd_info); 1522 + mb_update_avg_fragment_size(sb, e4b->bd_info); 1856 1523 mb_check_buddy(e4b); 1857 1524 } 1858 1525 ··· 1990 1655 } 1991 1656 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1992 1657 1658 + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 1993 1659 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1994 1660 mb_check_buddy(e4b); 1995 1661 ··· 2266 1930 int max; 2267 1931 2268 1932 BUG_ON(ac->ac_2order <= 0); 2269 - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1933 + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { 2270 1934 if (grp->bb_counters[i] == 0) 2271 1935 continue; 2272 1936 ··· 2445 2109 if (free < ac->ac_g_ex.fe_len) 2446 2110 return false; 2447 2111 2448 - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) 2112 + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) 2449 2113 return true; 2450 2114 2451 2115 if (grp->bb_largest_free_order < ac->ac_2order) ··· 2484 2148 ext4_grpblk_t free; 2485 2149 int ret = 0; 2486 2150 2151 + if (sbi->s_mb_stats) 2152 + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); 2487 2153 if (should_lock) 2488 2154 ext4_lock_group(sb, group); 2489 2155 free = grp->bb_free; ··· 2653 2315 * We also support searching for power-of-two requests only for 2654 2316 * requests upto maximum buddy size we have constructed. 
2655 2317 */ 2656 - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { 2318 + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { 2657 2319 /* 2658 2320 * This should tell if fe_len is exactly power of 2 2659 2321 */ 2660 2322 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2661 2323 ac->ac_2order = array_index_nospec(i - 1, 2662 - sb->s_blocksize_bits + 2); 2324 + MB_NUM_ORDERS(sb)); 2663 2325 } 2664 2326 2665 2327 /* if stream allocation is enabled, use global goal */ ··· 2685 2347 * from the goal value specified 2686 2348 */ 2687 2349 group = ac->ac_g_ex.fe_group; 2350 + ac->ac_last_optimal_group = group; 2351 + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2688 2352 prefetch_grp = group; 2689 2353 2690 - for (i = 0; i < ngroups; group++, i++) { 2691 - int ret = 0; 2354 + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), 2355 + i++) { 2356 + int ret = 0, new_cr; 2357 + 2692 2358 cond_resched(); 2693 - /* 2694 - * Artificially restricted ngroups for non-extent 2695 - * files makes group > ngroups possible on first loop. 
2696 - */ 2697 - if (group >= ngroups) 2698 - group = 0; 2359 + 2360 + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); 2361 + if (new_cr != cr) { 2362 + cr = new_cr; 2363 + goto repeat; 2364 + } 2699 2365 2700 2366 /* 2701 2367 * Batch reads of the block allocation bitmaps ··· 2764 2422 if (ac->ac_status != AC_STATUS_CONTINUE) 2765 2423 break; 2766 2424 } 2425 + /* Processed all groups and haven't found blocks */ 2426 + if (sbi->s_mb_stats && i == ngroups) 2427 + atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2767 2428 } 2768 2429 2769 2430 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && ··· 2796 2451 goto repeat; 2797 2452 } 2798 2453 } 2454 + 2455 + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2456 + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2799 2457 out: 2800 2458 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 2801 2459 err = first_err; ··· 2898 2550 .show = ext4_mb_seq_groups_show, 2899 2551 }; 2900 2552 2553 + int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 2554 + { 2555 + struct super_block *sb = (struct super_block *)seq->private; 2556 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2557 + 2558 + seq_puts(seq, "mballoc:\n"); 2559 + if (!sbi->s_mb_stats) { 2560 + seq_puts(seq, "\tmb stats collection turned off.\n"); 2561 + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 2562 + return 0; 2563 + } 2564 + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 2565 + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 2566 + 2567 + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); 2568 + 2569 + seq_puts(seq, "\tcr0_stats:\n"); 2570 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); 2571 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2572 + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); 2573 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2574 + 
atomic64_read(&sbi->s_bal_cX_failed[0])); 2575 + seq_printf(seq, "\t\tbad_suggestions: %u\n", 2576 + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); 2577 + 2578 + seq_puts(seq, "\tcr1_stats:\n"); 2579 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); 2580 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2581 + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); 2582 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2583 + atomic64_read(&sbi->s_bal_cX_failed[1])); 2584 + seq_printf(seq, "\t\tbad_suggestions: %u\n", 2585 + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2586 + 2587 + seq_puts(seq, "\tcr2_stats:\n"); 2588 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); 2589 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2590 + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); 2591 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2592 + atomic64_read(&sbi->s_bal_cX_failed[2])); 2593 + 2594 + seq_puts(seq, "\tcr3_stats:\n"); 2595 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); 2596 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2597 + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); 2598 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2599 + atomic64_read(&sbi->s_bal_cX_failed[3])); 2600 + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); 2601 + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 2602 + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 2603 + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 2604 + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 2605 + 2606 + seq_printf(seq, "\tbuddies_generated: %u/%u\n", 2607 + atomic_read(&sbi->s_mb_buddies_generated), 2608 + ext4_get_groups_count(sb)); 2609 + seq_printf(seq, "\tbuddies_time_used: %llu\n", 2610 + atomic64_read(&sbi->s_mb_generation_time)); 2611 + seq_printf(seq, "\tpreallocated: %u\n", 
2612 + atomic_read(&sbi->s_mb_preallocated)); 2613 + seq_printf(seq, "\tdiscarded: %u\n", 2614 + atomic_read(&sbi->s_mb_discarded)); 2615 + return 0; 2616 + } 2617 + 2618 + static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 2619 + { 2620 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2621 + unsigned long position; 2622 + 2623 + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); 2624 + 2625 + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 2626 + return NULL; 2627 + position = *pos + 1; 2628 + return (void *) ((unsigned long) position); 2629 + } 2630 + 2631 + static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 2632 + { 2633 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2634 + unsigned long position; 2635 + 2636 + ++*pos; 2637 + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 2638 + return NULL; 2639 + position = *pos + 1; 2640 + return (void *) ((unsigned long) position); 2641 + } 2642 + 2643 + static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 2644 + { 2645 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2646 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2647 + unsigned long position = ((unsigned long) v); 2648 + struct ext4_group_info *grp; 2649 + struct rb_node *n; 2650 + unsigned int count, min, max; 2651 + 2652 + position--; 2653 + if (position >= MB_NUM_ORDERS(sb)) { 2654 + seq_puts(seq, "fragment_size_tree:\n"); 2655 + n = rb_first(&sbi->s_mb_avg_fragment_size_root); 2656 + if (!n) { 2657 + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); 2658 + return 0; 2659 + } 2660 + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2661 + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; 2662 + count = 1; 2663 + while (rb_next(n)) { 2664 + count++; 2665 + n = rb_next(n); 2666 + } 2667 + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2668 + max = grp->bb_fragments ? 
grp->bb_free / grp->bb_fragments : 0; 2669 + 2670 + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", 2671 + min, max, count); 2672 + return 0; 2673 + } 2674 + 2675 + if (position == 0) { 2676 + seq_printf(seq, "optimize_scan: %d\n", 2677 + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); 2678 + seq_puts(seq, "max_free_order_lists:\n"); 2679 + } 2680 + count = 0; 2681 + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 2682 + bb_largest_free_order_node) 2683 + count++; 2684 + seq_printf(seq, "\tlist_order_%u_groups: %u\n", 2685 + (unsigned int)position, count); 2686 + 2687 + return 0; 2688 + } 2689 + 2690 + static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 2691 + { 2692 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2693 + 2694 + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); 2695 + } 2696 + 2697 + const struct seq_operations ext4_mb_seq_structs_summary_ops = { 2698 + .start = ext4_mb_seq_structs_summary_start, 2699 + .next = ext4_mb_seq_structs_summary_next, 2700 + .stop = ext4_mb_seq_structs_summary_stop, 2701 + .show = ext4_mb_seq_structs_summary_show, 2702 + }; 2703 + 2901 2704 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 2902 2705 { 2903 2706 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; ··· 3089 2590 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 3090 2591 if (old_groupinfo) 3091 2592 ext4_kvfree_array_rcu(old_groupinfo); 3092 - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 2593 + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 3093 2594 sbi->s_group_info_size); 3094 2595 return 0; 3095 2596 } ··· 3151 2652 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3152 2653 init_rwsem(&meta_group_info[i]->alloc_sem); 3153 2654 meta_group_info[i]->bb_free_root = RB_ROOT; 2655 + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 2656 + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); 3154 2657 
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 2658 + meta_group_info[i]->bb_group = group; 3155 2659 3156 2660 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); 3157 2661 return 0; ··· 3315 2813 unsigned max; 3316 2814 int ret; 3317 2815 3318 - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2816 + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); 3319 2817 3320 2818 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 3321 2819 if (sbi->s_mb_offsets == NULL) { ··· 3323 2821 goto out; 3324 2822 } 3325 2823 3326 - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2824 + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); 3327 2825 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 3328 2826 if (sbi->s_mb_maxs == NULL) { 3329 2827 ret = -ENOMEM; ··· 3349 2847 offset_incr = offset_incr >> 1; 3350 2848 max = max >> 1; 3351 2849 i++; 3352 - } while (i <= sb->s_blocksize_bits + 1); 2850 + } while (i < MB_NUM_ORDERS(sb)); 2851 + 2852 + sbi->s_mb_avg_fragment_size_root = RB_ROOT; 2853 + sbi->s_mb_largest_free_orders = 2854 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 2855 + GFP_KERNEL); 2856 + if (!sbi->s_mb_largest_free_orders) { 2857 + ret = -ENOMEM; 2858 + goto out; 2859 + } 2860 + sbi->s_mb_largest_free_orders_locks = 2861 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 2862 + GFP_KERNEL); 2863 + if (!sbi->s_mb_largest_free_orders_locks) { 2864 + ret = -ENOMEM; 2865 + goto out; 2866 + } 2867 + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 2868 + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 2869 + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 2870 + } 2871 + rwlock_init(&sbi->s_mb_rb_lock); 3353 2872 3354 2873 spin_lock_init(&sbi->s_md_lock); 3355 - spin_lock_init(&sbi->s_bal_lock); 3356 2874 sbi->s_mb_free_pending = 0; 3357 2875 INIT_LIST_HEAD(&sbi->s_freed_data_list); 3358 2876 ··· 3423 2901 spin_lock_init(&lg->lg_prealloc_lock); 3424 2902 } 3425 2903 2904 + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) 2905 + 
sbi->s_mb_max_linear_groups = 0; 2906 + else 2907 + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; 3426 2908 /* init file for buddy data */ 3427 2909 ret = ext4_mb_init_backend(sb); 3428 2910 if (ret != 0) ··· 3438 2912 free_percpu(sbi->s_locality_groups); 3439 2913 sbi->s_locality_groups = NULL; 3440 2914 out: 2915 + kfree(sbi->s_mb_largest_free_orders); 2916 + kfree(sbi->s_mb_largest_free_orders_locks); 3441 2917 kfree(sbi->s_mb_offsets); 3442 2918 sbi->s_mb_offsets = NULL; 3443 2919 kfree(sbi->s_mb_maxs); ··· 3496 2968 kvfree(group_info); 3497 2969 rcu_read_unlock(); 3498 2970 } 2971 + kfree(sbi->s_mb_largest_free_orders); 2972 + kfree(sbi->s_mb_largest_free_orders_locks); 3499 2973 kfree(sbi->s_mb_offsets); 3500 2974 kfree(sbi->s_mb_maxs); 3501 2975 iput(sbi->s_buddy_cache); ··· 3508 2978 atomic_read(&sbi->s_bal_reqs), 3509 2979 atomic_read(&sbi->s_bal_success)); 3510 2980 ext4_msg(sb, KERN_INFO, 3511 - "mballoc: %u extents scanned, %u goal hits, " 2981 + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " 3512 2982 "%u 2^N hits, %u breaks, %u lost", 3513 2983 atomic_read(&sbi->s_bal_ex_scanned), 2984 + atomic_read(&sbi->s_bal_groups_scanned), 3514 2985 atomic_read(&sbi->s_bal_goals), 3515 2986 atomic_read(&sbi->s_bal_2orders), 3516 2987 atomic_read(&sbi->s_bal_breaks), 3517 2988 atomic_read(&sbi->s_mb_lost_chunks)); 3518 2989 ext4_msg(sb, KERN_INFO, 3519 - "mballoc: %lu generated and it took %Lu", 3520 - sbi->s_mb_buddies_generated, 3521 - sbi->s_mb_generation_time); 2990 + "mballoc: %u generated and it took %llu", 2991 + atomic_read(&sbi->s_mb_buddies_generated), 2992 + atomic64_read(&sbi->s_mb_generation_time)); 3522 2993 ext4_msg(sb, KERN_INFO, 3523 2994 "mballoc: %u preallocated, %u discarded", 3524 2995 atomic_read(&sbi->s_mb_preallocated), ··· 4114 3583 { 4115 3584 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4116 3585 4117 - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3586 + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { 
4118 3587 atomic_inc(&sbi->s_bal_reqs); 4119 3588 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 4120 3589 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 4121 3590 atomic_inc(&sbi->s_bal_success); 4122 3591 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3592 + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); 4123 3593 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4124 3594 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4125 3595 atomic_inc(&sbi->s_bal_goals);
+22 -2
fs/ext4/mballoc.h
··· 59 59 * by the stream allocator, which purpose is to pack requests 60 60 * as close each to other as possible to produce smooth I/O traffic 61 61 * We use locality group prealloc space for stream request. 62 - * We can tune the same via /proc/fs/ext4/<parition>/stream_req 62 + * We can tune the same via /proc/fs/ext4/<partition>/stream_req 63 63 */ 64 64 #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ 65 65 ··· 77 77 * maximum length of inode prealloc list 78 78 */ 79 79 #define MB_DEFAULT_MAX_INODE_PREALLOC 512 80 + 81 + /* 82 + * Number of groups to search linearly before performing group scanning 83 + * optimization. 84 + */ 85 + #define MB_DEFAULT_LINEAR_LIMIT 4 86 + 87 + /* 88 + * Minimum number of groups that should be present in the file system to perform 89 + * group scanning optimizations. 90 + */ 91 + #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 92 + 93 + /* 94 + * Number of valid buddy orders 95 + */ 96 + #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) 80 97 81 98 struct ext4_free_data { 82 99 /* this links the free block information from sb_info */ ··· 178 161 /* copy of the best found extent taken before preallocation efforts */ 179 162 struct ext4_free_extent ac_f_ex; 180 163 164 + ext4_group_t ac_last_optimal_group; 165 + __u32 ac_groups_considered; 166 + __u32 ac_flags; /* allocation hints */ 181 167 __u16 ac_groups_scanned; 168 + __u16 ac_groups_linear_remaining; 182 169 __u16 ac_found; 183 170 __u16 ac_tail; 184 171 __u16 ac_buddy; 185 - __u16 ac_flags; /* allocation hints */ 186 172 __u8 ac_status; 187 173 __u8 ac_criteria; 188 174 __u8 ac_2order; /* if request is to allocate 2^N blocks and
+3 -3
fs/ext4/migrate.c
··· 32 32 newext.ee_block = cpu_to_le32(lb->first_block); 33 33 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); 34 34 ext4_ext_store_pblock(&newext, lb->first_pblock); 35 - /* Locking only for convinience since we are operating on temp inode */ 35 + /* Locking only for convenience since we are operating on temp inode */ 36 36 down_write(&EXT4_I(inode)->i_data_sem); 37 37 path = ext4_find_extent(inode, lb->first_block, NULL, 0); 38 38 if (IS_ERR(path)) { ··· 43 43 44 44 /* 45 45 * Calculate the credit needed to inserting this extent 46 - * Since we are doing this in loop we may accumalate extra 47 - * credit. But below we try to not accumalate too much 46 + * Since we are doing this in loop we may accumulate extra 47 + * credit. But below we try to not accumulate too much 48 48 * of them by restarting the journal. 49 49 */ 50 50 needed = ext4_ext_calc_credits_for_single_extent(inode,
+1 -1
fs/ext4/mmp.c
··· 56 56 wait_on_buffer(bh); 57 57 sb_end_write(sb); 58 58 if (unlikely(!buffer_uptodate(bh))) 59 - return 1; 59 + return -EIO; 60 60 61 61 return 0; 62 62 }
+182 -65
fs/ext4/namei.c
··· 280 280 unsigned blocksize, struct dx_hash_info *hinfo, 281 281 struct dx_map_entry map[]); 282 282 static void dx_sort_map(struct dx_map_entry *map, unsigned count); 283 - static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 284 - struct dx_map_entry *offsets, int count, unsigned blocksize); 285 - static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); 283 + static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, 284 + char *to, struct dx_map_entry *offsets, 285 + int count, unsigned int blocksize); 286 + static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, 287 + unsigned int blocksize); 286 288 static void dx_insert_block(struct dx_frame *frame, 287 289 u32 hash, ext4_lblk_t block); 288 290 static int ext4_htree_next_block(struct inode *dir, __u32 hash, ··· 576 574 577 575 static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) 578 576 { 579 - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 580 - EXT4_DIR_REC_LEN(2) - infosize; 577 + unsigned int entry_space = dir->i_sb->s_blocksize - 578 + ext4_dir_rec_len(1, NULL) - 579 + ext4_dir_rec_len(2, NULL) - infosize; 581 580 582 581 if (ext4_has_metadata_csum(dir->i_sb)) 583 582 entry_space -= sizeof(struct dx_tail); ··· 587 584 588 585 static inline unsigned dx_node_limit(struct inode *dir) 589 586 { 590 - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 587 + unsigned int entry_space = dir->i_sb->s_blocksize - 588 + ext4_dir_rec_len(0, dir); 591 589 592 590 if (ext4_has_metadata_csum(dir->i_sb)) 593 591 entry_space -= sizeof(struct dx_tail); ··· 677 673 name = fname_crypto_str.name; 678 674 len = fname_crypto_str.len; 679 675 } 680 - ext4fs_dirhash(dir, de->name, 676 + if (IS_CASEFOLDED(dir)) 677 + h.hash = EXT4_DIRENT_HASH(de); 678 + else 679 + ext4fs_dirhash(dir, de->name, 681 680 de->name_len, &h); 682 681 printk("%*.s:(E)%x.%u ", len, name, 683 682 h.hash, 
(unsigned) ((char *) de ··· 696 689 (unsigned) ((char *) de - base)); 697 690 #endif 698 691 } 699 - space += EXT4_DIR_REC_LEN(de->name_len); 692 + space += ext4_dir_rec_len(de->name_len, dir); 700 693 names++; 701 694 } 702 695 de = ext4_next_entry(de, size); ··· 791 784 root = (struct dx_root *) frame->bh->b_data; 792 785 if (root->info.hash_version != DX_HASH_TEA && 793 786 root->info.hash_version != DX_HASH_HALF_MD4 && 794 - root->info.hash_version != DX_HASH_LEGACY) { 787 + root->info.hash_version != DX_HASH_LEGACY && 788 + root->info.hash_version != DX_HASH_SIPHASH) { 795 789 ext4_warning_inode(dir, "Unrecognised inode hash code %u", 796 790 root->info.hash_version); 797 791 goto fail; 792 + } 793 + if (ext4_hash_in_dirent(dir)) { 794 + if (root->info.hash_version != DX_HASH_SIPHASH) { 795 + ext4_warning_inode(dir, 796 + "Hash in dirent, but hash is not SIPHASH"); 797 + goto fail; 798 + } 799 + } else { 800 + if (root->info.hash_version == DX_HASH_SIPHASH) { 801 + ext4_warning_inode(dir, 802 + "Hash code is SIPHASH, but hash not in dirent"); 803 + goto fail; 804 + } 798 805 } 799 806 if (fname) 800 807 hinfo = &fname->hinfo; ··· 816 795 if (hinfo->hash_version <= DX_HASH_TEA) 817 796 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 818 797 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 819 - if (fname && fname_name(fname)) 798 + /* hash is already computed for encrypted casefolded directory */ 799 + if (fname && fname_name(fname) && 800 + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) 820 801 ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); 821 802 hash = hinfo->hash; 822 803 ··· 979 956 * If the hash is 1, then continue only if the next page has a 980 957 * continuation hash of any value. This is used for readdir 981 958 * handling. Otherwise, check to see if the hash matches the 982 - * desired contiuation hash. If it doesn't, return since 959 + * desired continuation hash. 
If it doesn't, return since 983 960 * there's no point to read in the successive index pages. 984 961 */ 985 962 bhash = dx_get_hash(p->at); ··· 1020 997 struct ext4_dir_entry_2 *de, *top; 1021 998 int err = 0, count = 0; 1022 999 struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; 1000 + int csum = ext4_has_metadata_csum(dir->i_sb); 1023 1001 1024 1002 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 1025 1003 (unsigned long)block)); ··· 1029 1005 return PTR_ERR(bh); 1030 1006 1031 1007 de = (struct ext4_dir_entry_2 *) bh->b_data; 1008 + /* csum entries are not larger in the casefolded encrypted case */ 1032 1009 top = (struct ext4_dir_entry_2 *) ((char *) de + 1033 1010 dir->i_sb->s_blocksize - 1034 - EXT4_DIR_REC_LEN(0)); 1011 + ext4_dir_rec_len(0, 1012 + csum ? NULL : dir)); 1035 1013 /* Check if the directory is encrypted */ 1036 1014 if (IS_ENCRYPTED(dir)) { 1037 1015 err = fscrypt_prepare_readdir(dir); ··· 1057 1031 /* silently ignore the rest of the block */ 1058 1032 break; 1059 1033 } 1060 - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1034 + if (ext4_hash_in_dirent(dir)) { 1035 + if (de->name_len && de->inode) { 1036 + hinfo->hash = EXT4_DIRENT_HASH(de); 1037 + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); 1038 + } else { 1039 + hinfo->hash = 0; 1040 + hinfo->minor_hash = 0; 1041 + } 1042 + } else { 1043 + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1044 + } 1061 1045 if ((hinfo->hash < start_hash) || 1062 1046 ((hinfo->hash == start_hash) && 1063 1047 (hinfo->minor_hash < start_minor_hash))) ··· 1136 1100 start_hash, start_minor_hash)); 1137 1101 dir = file_inode(dir_file); 1138 1102 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { 1139 - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1103 + if (ext4_hash_in_dirent(dir)) 1104 + hinfo.hash_version = DX_HASH_SIPHASH; 1105 + else 1106 + hinfo.hash_version = 1107 + EXT4_SB(dir->i_sb)->s_def_hash_version; 1140 1108 if (hinfo.hash_version <= 
DX_HASH_TEA) 1141 1109 hinfo.hash_version += 1142 1110 EXT4_SB(dir->i_sb)->s_hash_unsigned; ··· 1258 1218 1259 1219 while ((char *) de < base + blocksize) { 1260 1220 if (de->name_len && de->inode) { 1261 - ext4fs_dirhash(dir, de->name, de->name_len, &h); 1221 + if (ext4_hash_in_dirent(dir)) 1222 + h.hash = EXT4_DIRENT_HASH(de); 1223 + else 1224 + ext4fs_dirhash(dir, de->name, de->name_len, &h); 1262 1225 map_tail--; 1263 1226 map_tail->hash = h.hash; 1264 1227 map_tail->offs = ((char *) de - base)>>2; ··· 1325 1282 * Returns: 0 if the directory entry matches, more than 0 if it 1326 1283 * doesn't match or less than zero on error. 1327 1284 */ 1328 - int ext4_ci_compare(const struct inode *parent, const struct qstr *name, 1329 - const struct qstr *entry, bool quick) 1285 + static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, 1286 + u8 *de_name, size_t de_name_len, bool quick) 1330 1287 { 1331 1288 const struct super_block *sb = parent->i_sb; 1332 1289 const struct unicode_map *um = sb->s_encoding; 1290 + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); 1291 + struct qstr entry = QSTR_INIT(de_name, de_name_len); 1333 1292 int ret; 1334 1293 1335 - if (quick) 1336 - ret = utf8_strncasecmp_folded(um, name, entry); 1337 - else 1338 - ret = utf8_strncasecmp(um, name, entry); 1294 + if (IS_ENCRYPTED(parent)) { 1295 + const struct fscrypt_str encrypted_name = 1296 + FSTR_INIT(de_name, de_name_len); 1339 1297 1298 + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); 1299 + if (!decrypted_name.name) 1300 + return -ENOMEM; 1301 + ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, 1302 + &decrypted_name); 1303 + if (ret < 0) 1304 + goto out; 1305 + entry.name = decrypted_name.name; 1306 + entry.len = decrypted_name.len; 1307 + } 1308 + 1309 + if (quick) 1310 + ret = utf8_strncasecmp_folded(um, name, &entry); 1311 + else 1312 + ret = utf8_strncasecmp(um, name, &entry); 1340 1313 if (ret < 0) { 1341 1314 /* Handle 
invalid character sequence as either an error 1342 1315 * or as an opaque byte sequence. 1343 1316 */ 1344 1317 if (sb_has_strict_encoding(sb)) 1345 - return -EINVAL; 1346 - 1347 - if (name->len != entry->len) 1348 - return 1; 1349 - 1350 - return !!memcmp(name->name, entry->name, name->len); 1318 + ret = -EINVAL; 1319 + else if (name->len != entry.len) 1320 + ret = 1; 1321 + else 1322 + ret = !!memcmp(name->name, entry.name, entry.len); 1351 1323 } 1352 - 1324 + out: 1325 + kfree(decrypted_name.name); 1353 1326 return ret; 1354 1327 } 1355 1328 1356 - void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, 1357 - struct fscrypt_str *cf_name) 1329 + int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, 1330 + struct ext4_filename *name) 1358 1331 { 1332 + struct fscrypt_str *cf_name = &name->cf_name; 1333 + struct dx_hash_info *hinfo = &name->hinfo; 1359 1334 int len; 1360 1335 1361 1336 if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { 1362 1337 cf_name->name = NULL; 1363 - return; 1338 + return 0; 1364 1339 } 1365 1340 1366 1341 cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); 1367 1342 if (!cf_name->name) 1368 - return; 1343 + return -ENOMEM; 1369 1344 1370 1345 len = utf8_casefold(dir->i_sb->s_encoding, 1371 1346 iname, cf_name->name, ··· 1391 1330 if (len <= 0) { 1392 1331 kfree(cf_name->name); 1393 1332 cf_name->name = NULL; 1394 - return; 1395 1333 } 1396 1334 cf_name->len = (unsigned) len; 1335 + if (!IS_ENCRYPTED(dir)) 1336 + return 0; 1397 1337 1338 + hinfo->hash_version = DX_HASH_SIPHASH; 1339 + hinfo->seed = NULL; 1340 + if (cf_name->name) 1341 + ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); 1342 + else 1343 + ext4fs_dirhash(dir, iname->name, iname->len, hinfo); 1344 + return 0; 1398 1345 } 1399 1346 #endif 1400 1347 ··· 1411 1342 * 1412 1343 * Return: %true if the directory entry matches, otherwise %false. 
1413 1344 */ 1414 - static inline bool ext4_match(const struct inode *parent, 1345 + static bool ext4_match(struct inode *parent, 1415 1346 const struct ext4_filename *fname, 1416 - const struct ext4_dir_entry_2 *de) 1347 + struct ext4_dir_entry_2 *de) 1417 1348 { 1418 1349 struct fscrypt_name f; 1419 - #ifdef CONFIG_UNICODE 1420 - const struct qstr entry = {.name = de->name, .len = de->name_len}; 1421 - #endif 1422 1350 1423 1351 if (!de->inode) 1424 1352 return false; ··· 1431 1365 if (fname->cf_name.name) { 1432 1366 struct qstr cf = {.name = fname->cf_name.name, 1433 1367 .len = fname->cf_name.len}; 1434 - return !ext4_ci_compare(parent, &cf, &entry, true); 1368 + if (IS_ENCRYPTED(parent)) { 1369 + if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || 1370 + fname->hinfo.minor_hash != 1371 + EXT4_DIRENT_MINOR_HASH(de)) { 1372 + 1373 + return 0; 1374 + } 1375 + } 1376 + return !ext4_ci_compare(parent, &cf, de->name, 1377 + de->name_len, true); 1435 1378 } 1436 - return !ext4_ci_compare(parent, fname->usr_fname, &entry, 1437 - false); 1379 + return !ext4_ci_compare(parent, fname->usr_fname, de->name, 1380 + de->name_len, false); 1438 1381 } 1439 1382 #endif 1440 1383 ··· 1840 1765 * Returns pointer to last entry moved. 
1841 1766 */ 1842 1767 static struct ext4_dir_entry_2 * 1843 - dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, 1768 + dx_move_dirents(struct inode *dir, char *from, char *to, 1769 + struct dx_map_entry *map, int count, 1844 1770 unsigned blocksize) 1845 1771 { 1846 1772 unsigned rec_len = 0; ··· 1849 1773 while (count--) { 1850 1774 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1851 1775 (from + (map->offs<<2)); 1852 - rec_len = EXT4_DIR_REC_LEN(de->name_len); 1776 + rec_len = ext4_dir_rec_len(de->name_len, dir); 1777 + 1853 1778 memcpy (to, de, rec_len); 1854 1779 ((struct ext4_dir_entry_2 *) to)->rec_len = 1855 1780 ext4_rec_len_to_disk(rec_len, blocksize); 1781 + 1782 + /* wipe dir_entry excluding the rec_len field */ 1856 1783 de->inode = 0; 1784 + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, 1785 + blocksize) - 1786 + offsetof(struct ext4_dir_entry_2, 1787 + name_len)); 1788 + 1857 1789 map++; 1858 1790 to += rec_len; 1859 1791 } ··· 1872 1788 * Compact each dir entry in the range to the minimal rec_len. 1873 1789 * Returns pointer to last entry in range. 
1874 1790 */ 1875 - static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) 1791 + static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, 1792 + unsigned int blocksize) 1876 1793 { 1877 1794 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1878 1795 unsigned rec_len = 0; ··· 1882 1797 while ((char*)de < base + blocksize) { 1883 1798 next = ext4_next_entry(de, blocksize); 1884 1799 if (de->inode && de->name_len) { 1885 - rec_len = EXT4_DIR_REC_LEN(de->name_len); 1800 + rec_len = ext4_dir_rec_len(de->name_len, dir); 1886 1801 if (de > to) 1887 1802 memmove(to, de, rec_len); 1888 1803 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); ··· 1972 1887 hash2, split, count-split)); 1973 1888 1974 1889 /* Fancy dance to stay within two buffers */ 1975 - de2 = dx_move_dirents(data1, data2, map + split, count - split, 1890 + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, 1976 1891 blocksize); 1977 - de = dx_pack_dirents(data1, blocksize); 1892 + de = dx_pack_dirents(dir, data1, blocksize); 1978 1893 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - 1979 1894 (char *) de, 1980 1895 blocksize); ··· 2022 1937 struct ext4_dir_entry_2 **dest_de) 2023 1938 { 2024 1939 struct ext4_dir_entry_2 *de; 2025 - unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); 1940 + unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); 2026 1941 int nlen, rlen; 2027 1942 unsigned int offset = 0; 2028 1943 char *top; ··· 2035 1950 return -EFSCORRUPTED; 2036 1951 if (ext4_match(dir, fname, de)) 2037 1952 return -EEXIST; 2038 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1953 + nlen = ext4_dir_rec_len(de->name_len, dir); 2039 1954 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 2040 1955 if ((de->inode ? 
rlen - nlen : rlen) >= reclen) 2041 1956 break; ··· 2049 1964 return 0; 2050 1965 } 2051 1966 2052 - void ext4_insert_dentry(struct inode *inode, 1967 + void ext4_insert_dentry(struct inode *dir, 1968 + struct inode *inode, 2053 1969 struct ext4_dir_entry_2 *de, 2054 1970 int buf_size, 2055 1971 struct ext4_filename *fname) ··· 2058 1972 2059 1973 int nlen, rlen; 2060 1974 2061 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1975 + nlen = ext4_dir_rec_len(de->name_len, dir); 2062 1976 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 2063 1977 if (de->inode) { 2064 1978 struct ext4_dir_entry_2 *de1 = ··· 2072 1986 ext4_set_de_type(inode->i_sb, de, inode->i_mode); 2073 1987 de->name_len = fname_len(fname); 2074 1988 memcpy(de->name, fname_name(fname), fname_len(fname)); 1989 + if (ext4_hash_in_dirent(dir)) { 1990 + struct dx_hash_info *hinfo = &fname->hinfo; 1991 + 1992 + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); 1993 + EXT4_DIRENT_HASHES(de)->minor_hash = 1994 + cpu_to_le32(hinfo->minor_hash); 1995 + } 2075 1996 } 2076 1997 2077 1998 /* ··· 2115 2022 } 2116 2023 2117 2024 /* By now the buffer is marked for journaling */ 2118 - ext4_insert_dentry(inode, de, blocksize, fname); 2025 + ext4_insert_dentry(dir, inode, de, blocksize, fname); 2119 2026 2120 2027 /* 2121 2028 * XXX shouldn't update any times until successful ··· 2195 2102 data2 = bh2->b_data; 2196 2103 2197 2104 memcpy(data2, de, len); 2105 + memset(de, 0, len); /* wipe old data */ 2198 2106 de = (struct ext4_dir_entry_2 *) data2; 2199 2107 top = data2 + len; 2200 2108 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) ··· 2208 2114 2209 2115 /* Initialize the root; the dot dirents already exist */ 2210 2116 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 2211 - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), 2212 - blocksize); 2117 + de->rec_len = ext4_rec_len_to_disk( 2118 + blocksize - ext4_dir_rec_len(2, NULL), blocksize); 2213 2119 memset (&root->info, 0, 
sizeof(root->info)); 2214 2120 root->info.info_length = sizeof(root->info); 2215 - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 2121 + if (ext4_hash_in_dirent(dir)) 2122 + root->info.hash_version = DX_HASH_SIPHASH; 2123 + else 2124 + root->info.hash_version = 2125 + EXT4_SB(dir->i_sb)->s_def_hash_version; 2126 + 2216 2127 entries = root->entries; 2217 2128 dx_set_block(entries, 1); 2218 2129 dx_set_count(entries, 1); ··· 2228 2129 if (fname->hinfo.hash_version <= DX_HASH_TEA) 2229 2130 fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 2230 2131 fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 2231 - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); 2132 + 2133 + /* casefolded encrypted hashes are computed on fname setup */ 2134 + if (!ext4_hash_in_dirent(dir)) 2135 + ext4fs_dirhash(dir, fname_name(fname), 2136 + fname_len(fname), &fname->hinfo); 2232 2137 2233 2138 memset(frames, 0, sizeof(frames)); 2234 2139 frame = frames; ··· 2242 2139 2243 2140 retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 2244 2141 if (retval) 2245 - goto out_frames; 2142 + goto out_frames; 2246 2143 retval = ext4_handle_dirty_dirblock(handle, dir, bh2); 2247 2144 if (retval) 2248 - goto out_frames; 2145 + goto out_frames; 2249 2146 2250 2147 de = do_split(handle,dir, &bh2, frame, &fname->hinfo); 2251 2148 if (IS_ERR(de)) { ··· 2585 2482 entry_buf, buf_size, i)) 2586 2483 return -EFSCORRUPTED; 2587 2484 if (de == de_del) { 2588 - if (pde) 2485 + if (pde) { 2589 2486 pde->rec_len = ext4_rec_len_to_disk( 2590 2487 ext4_rec_len_from_disk(pde->rec_len, 2591 2488 blocksize) + 2592 2489 ext4_rec_len_from_disk(de->rec_len, 2593 2490 blocksize), 2594 2491 blocksize); 2595 - else 2492 + 2493 + /* wipe entire dir_entry */ 2494 + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, 2495 + blocksize)); 2496 + } else { 2497 + /* wipe dir_entry excluding the rec_len field */ 2596 2498 de->inode = 0; 2499 + memset(&de->name_len, 0, 
2500 + ext4_rec_len_from_disk(de->rec_len, 2501 + blocksize) - 2502 + offsetof(struct ext4_dir_entry_2, 2503 + name_len)); 2504 + } 2505 + 2597 2506 inode_inc_iversion(dir); 2598 2507 return 0; 2599 2508 } ··· 2837 2722 { 2838 2723 de->inode = cpu_to_le32(inode->i_ino); 2839 2724 de->name_len = 1; 2840 - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), 2725 + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), 2841 2726 blocksize); 2842 2727 strcpy(de->name, "."); 2843 2728 ext4_set_de_type(inode->i_sb, de, S_IFDIR); ··· 2847 2732 de->name_len = 2; 2848 2733 if (!dotdot_real_len) 2849 2734 de->rec_len = ext4_rec_len_to_disk(blocksize - 2850 - (csum_size + EXT4_DIR_REC_LEN(1)), 2735 + (csum_size + ext4_dir_rec_len(1, NULL)), 2851 2736 blocksize); 2852 2737 else 2853 2738 de->rec_len = ext4_rec_len_to_disk( 2854 - EXT4_DIR_REC_LEN(de->name_len), blocksize); 2739 + ext4_dir_rec_len(de->name_len, NULL), 2740 + blocksize); 2855 2741 strcpy(de->name, ".."); 2856 2742 ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2857 2743 ··· 2985 2869 } 2986 2870 2987 2871 sb = inode->i_sb; 2988 - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { 2872 + if (inode->i_size < ext4_dir_rec_len(1, NULL) + 2873 + ext4_dir_rec_len(2, NULL)) { 2989 2874 EXT4_ERROR_INODE(inode, "invalid size"); 2990 2875 return true; 2991 2876 } ··· 3489 3372 * for transaction commit if we are running out of space 3490 3373 * and thus we deadlock. So we have to stop transaction now 3491 3374 * and restart it when symlink contents is written. 3492 - * 3375 + * 3493 3376 * To keep fs consistent in case of crash, we have to put inode 3494 3377 * to orphan list in the mean time. 3495 3378 */
+73 -43
fs/ext4/super.c
··· 667 667 ext4_commit_super(sb); 668 668 } 669 669 670 - if (sb_rdonly(sb) || continue_fs) 671 - return; 672 - 673 670 /* 674 671 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 675 672 * could panic during 'reboot -f' as the underlying device got already ··· 676 679 panic("EXT4-fs (device %s): panic forced after error\n", 677 680 sb->s_id); 678 681 } 682 + 683 + if (sb_rdonly(sb) || continue_fs) 684 + return; 685 + 679 686 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 680 687 /* 681 688 * Make sure updated value of ->s_mount_flags will be visible before ··· 1689 1688 Opt_dioread_nolock, Opt_dioread_lock, 1690 1689 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1691 1690 Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, 1692 - Opt_prefetch_block_bitmaps, 1691 + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, 1693 1692 #ifdef CONFIG_EXT4_DEBUG 1694 1693 Opt_fc_debug_max_replay, Opt_fc_debug_force 1695 1694 #endif ··· 1789 1788 {Opt_inlinecrypt, "inlinecrypt"}, 1790 1789 {Opt_nombcache, "nombcache"}, 1791 1790 {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ 1792 - {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, 1791 + {Opt_removed, "prefetch_block_bitmaps"}, 1792 + {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, 1793 + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, 1793 1794 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1794 1795 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1795 1796 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ ··· 1824 1821 } 1825 1822 1826 1823 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1824 + #define DEFAULT_MB_OPTIMIZE_SCAN (-1) 1825 + 1827 1826 static const char deprecated_msg[] = 1828 1827 "Mount option \"%s\" will be removed by %s\n" 1829 1828 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; ··· 2012 2007 {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 2013 
2008 {Opt_test_dummy_encryption, 0, MOPT_STRING}, 2014 2009 {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, 2015 - {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, 2010 + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, 2016 2011 MOPT_SET}, 2012 + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, 2017 2013 #ifdef CONFIG_EXT4_DEBUG 2018 2014 {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, 2019 2015 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, ··· 2096 2090 return 1; 2097 2091 } 2098 2092 2093 + struct ext4_parsed_options { 2094 + unsigned long journal_devnum; 2095 + unsigned int journal_ioprio; 2096 + int mb_optimize_scan; 2097 + }; 2098 + 2099 2099 static int handle_mount_opt(struct super_block *sb, char *opt, int token, 2100 - substring_t *args, unsigned long *journal_devnum, 2101 - unsigned int *journal_ioprio, int is_remount) 2100 + substring_t *args, struct ext4_parsed_options *parsed_opts, 2101 + int is_remount) 2102 2102 { 2103 2103 struct ext4_sb_info *sbi = EXT4_SB(sb); 2104 2104 const struct mount_opts *m; ··· 2261 2249 "Cannot specify journal on remount"); 2262 2250 return -1; 2263 2251 } 2264 - *journal_devnum = arg; 2252 + parsed_opts->journal_devnum = arg; 2265 2253 } else if (token == Opt_journal_path) { 2266 2254 char *journal_path; 2267 2255 struct inode *journal_inode; ··· 2297 2285 return -1; 2298 2286 } 2299 2287 2300 - *journal_devnum = new_encode_dev(journal_inode->i_rdev); 2288 + parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); 2301 2289 path_put(&path); 2302 2290 kfree(journal_path); 2303 2291 } else if (token == Opt_journal_ioprio) { ··· 2306 2294 " (must be 0-7)"); 2307 2295 return -1; 2308 2296 } 2309 - *journal_ioprio = 2297 + parsed_opts->journal_ioprio = 2310 2298 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); 2311 2299 } else if (token == Opt_test_dummy_encryption) { 2312 2300 return ext4_set_test_dummy_encryption(sb, opt, &args[0], ··· 2396 2384 sbi->s_mount_opt |= 
m->mount_opt; 2397 2385 } else if (token == Opt_data_err_ignore) { 2398 2386 sbi->s_mount_opt &= ~m->mount_opt; 2387 + } else if (token == Opt_mb_optimize_scan) { 2388 + if (arg != 0 && arg != 1) { 2389 + ext4_msg(sb, KERN_WARNING, 2390 + "mb_optimize_scan should be set to 0 or 1."); 2391 + return -1; 2392 + } 2393 + parsed_opts->mb_optimize_scan = arg; 2399 2394 } else { 2400 2395 if (!args->from) 2401 2396 arg = 1; ··· 2430 2411 } 2431 2412 2432 2413 static int parse_options(char *options, struct super_block *sb, 2433 - unsigned long *journal_devnum, 2434 - unsigned int *journal_ioprio, 2414 + struct ext4_parsed_options *ret_opts, 2435 2415 int is_remount) 2436 2416 { 2437 2417 struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); ··· 2450 2432 */ 2451 2433 args[0].to = args[0].from = NULL; 2452 2434 token = match_token(p, tokens, args); 2453 - if (handle_mount_opt(sb, p, token, args, journal_devnum, 2454 - journal_ioprio, is_remount) < 0) 2435 + if (handle_mount_opt(sb, p, token, args, ret_opts, 2436 + is_remount) < 0) 2455 2437 return 0; 2456 2438 } 2457 2439 #ifdef CONFIG_QUOTA ··· 3041 3023 sb->s_flags &= ~SB_RDONLY; 3042 3024 } 3043 3025 #ifdef CONFIG_QUOTA 3044 - /* Needed for iput() to work correctly and not trash data */ 3045 - sb->s_flags |= SB_ACTIVE; 3046 - 3047 3026 /* 3048 3027 * Turn on quotas which were not enabled for read-only mounts if 3049 3028 * filesystem has quota feature, so that they are updated correctly. 
··· 3706 3691 3707 3692 elr->lr_super = sb; 3708 3693 elr->lr_first_not_zeroed = start; 3709 - if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) 3710 - elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; 3711 - else { 3694 + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { 3712 3695 elr->lr_mode = EXT4_LI_MODE_ITABLE; 3713 3696 elr->lr_next_group = start; 3697 + } else { 3698 + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; 3714 3699 } 3715 3700 3716 3701 /* ··· 3741 3726 goto out; 3742 3727 } 3743 3728 3744 - if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && 3729 + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && 3745 3730 (first_not_zeroed == ngroups || sb_rdonly(sb) || 3746 3731 !test_opt(sb, INIT_INODE_TABLE))) 3747 3732 goto out; ··· 4030 4015 ext4_fsblk_t sb_block = get_sb_block(&data); 4031 4016 ext4_fsblk_t logical_sb_block; 4032 4017 unsigned long offset = 0; 4033 - unsigned long journal_devnum = 0; 4034 4018 unsigned long def_mount_opts; 4035 4019 struct inode *root; 4036 4020 const char *descr; ··· 4040 4026 int needs_recovery, has_huge_files; 4041 4027 __u64 blocks_count; 4042 4028 int err = 0; 4043 - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4044 4029 ext4_group_t first_not_zeroed; 4030 + struct ext4_parsed_options parsed_opts; 4031 + 4032 + /* Set defaults for the variables that will be set during parsing */ 4033 + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4034 + parsed_opts.journal_devnum = 0; 4035 + parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; 4045 4036 4046 4037 if ((data && !orig_data) || !sbi) 4047 4038 goto out_free_base; ··· 4292 4273 GFP_KERNEL); 4293 4274 if (!s_mount_opts) 4294 4275 goto failed_mount; 4295 - if (!parse_options(s_mount_opts, sb, &journal_devnum, 4296 - &journal_ioprio, 0)) { 4276 + if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { 4297 4277 ext4_msg(sb, KERN_WARNING, 4298 4278 "failed to parse options in superblock: %s", 4299 4279 s_mount_opts); ··· 4300 4282 kfree(s_mount_opts); 4301 4283 } 4302 4284 
sbi->s_def_mount_opt = sbi->s_mount_opt; 4303 - if (!parse_options((char *) data, sb, &journal_devnum, 4304 - &journal_ioprio, 0)) 4285 + if (!parse_options((char *) data, sb, &parsed_opts, 0)) 4305 4286 goto failed_mount; 4306 4287 4307 4288 #ifdef CONFIG_UNICODE ··· 4308 4291 const struct ext4_sb_encodings *encoding_info; 4309 4292 struct unicode_map *encoding; 4310 4293 __u16 encoding_flags; 4311 - 4312 - if (ext4_has_feature_encrypt(sb)) { 4313 - ext4_msg(sb, KERN_ERR, 4314 - "Can't mount with encoding and encryption"); 4315 - goto failed_mount; 4316 - } 4317 4294 4318 4295 if (ext4_sb_read_encoding(es, &encoding_info, 4319 4296 &encoding_flags)) { ··· 4785 4774 * root first: it may be modified in the journal! 4786 4775 */ 4787 4776 if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { 4788 - err = ext4_load_journal(sb, es, journal_devnum); 4777 + err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); 4789 4778 if (err) 4790 4779 goto failed_mount3a; 4791 4780 } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ··· 4885 4874 goto failed_mount_wq; 4886 4875 } 4887 4876 4888 - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4877 + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 4889 4878 4890 4879 sbi->s_journal->j_submit_inode_data_buffers = 4891 4880 ext4_journal_submit_inode_data_buffers; ··· 4991 4980 ext4_fc_replay_cleanup(sb); 4992 4981 4993 4982 ext4_ext_init(sb); 4983 + 4984 + /* 4985 + * Enable optimize_scan if number of groups is > threshold. This can be 4986 + * turned off by passing "mb_optimize_scan=0". This can also be 4987 + * turned on forcefully by passing "mb_optimize_scan=1". 
4988 + */ 4989 + if (parsed_opts.mb_optimize_scan == 1) 4990 + set_opt2(sb, MB_OPTIMIZE_SCAN); 4991 + else if (parsed_opts.mb_optimize_scan == 0) 4992 + clear_opt2(sb, MB_OPTIMIZE_SCAN); 4993 + else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) 4994 + set_opt2(sb, MB_OPTIMIZE_SCAN); 4995 + 4994 4996 err = ext4_mb_init(sb); 4995 4997 if (err) { 4996 4998 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", ··· 5020 4996 ext4_journal_commit_callback; 5021 4997 5022 4998 block = ext4_count_free_clusters(sb); 5023 - ext4_free_blocks_count_set(sbi->s_es, 4999 + ext4_free_blocks_count_set(sbi->s_es, 5024 5000 EXT4_C2B(sbi, block)); 5025 5001 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5026 5002 GFP_KERNEL); ··· 5585 5561 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5586 5562 int error = 0; 5587 5563 5588 - if (!sbh || block_device_ejected(sb)) 5589 - return error; 5564 + if (!sbh) 5565 + return -EINVAL; 5566 + if (block_device_ejected(sb)) 5567 + return -ENODEV; 5590 5568 5591 5569 ext4_update_super(sb); 5592 5570 ··· 5839 5813 struct ext4_mount_options old_opts; 5840 5814 int enable_quota = 0; 5841 5815 ext4_group_t g; 5842 - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 5843 5816 int err = 0; 5844 5817 #ifdef CONFIG_QUOTA 5845 5818 int i, j; 5846 5819 char *to_free[EXT4_MAXQUOTAS]; 5847 5820 #endif 5848 5821 char *orig_data = kstrdup(data, GFP_KERNEL); 5822 + struct ext4_parsed_options parsed_opts; 5823 + 5824 + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 5825 + parsed_opts.journal_devnum = 0; 5849 5826 5850 5827 if (data && !orig_data) 5851 5828 return -ENOMEM; ··· 5879 5850 old_opts.s_qf_names[i] = NULL; 5880 5851 #endif 5881 5852 if (sbi->s_journal && sbi->s_journal->j_task->io_context) 5882 - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; 5853 + parsed_opts.journal_ioprio = 5854 + sbi->s_journal->j_task->io_context->ioprio; 5883 5855 5884 5856 /* 5885 5857 * Some options can be enabled by ext4 
and/or by VFS mount flag ··· 5890 5860 vfs_flags = SB_LAZYTIME | SB_I_VERSION; 5891 5861 sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); 5892 5862 5893 - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { 5863 + if (!parse_options(data, sb, &parsed_opts, 1)) { 5894 5864 err = -EINVAL; 5895 5865 goto restore_opts; 5896 5866 } ··· 5940 5910 5941 5911 if (sbi->s_journal) { 5942 5912 ext4_init_journal_params(sb, sbi->s_journal); 5943 - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 5913 + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 5944 5914 } 5945 5915 5946 5916 /* Flush outstanding errors before changing fs state */
+8
fs/ext4/sysfs.c
··· 215 215 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 216 216 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 217 217 EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); 218 + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); 218 219 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 219 220 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); 220 221 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); ··· 264 263 ATTR_LIST(mb_stream_req), 265 264 ATTR_LIST(mb_group_prealloc), 266 265 ATTR_LIST(mb_max_inode_prealloc), 266 + ATTR_LIST(mb_max_linear_groups), 267 267 ATTR_LIST(max_writeback_mb_bump), 268 268 ATTR_LIST(extent_max_zeroout_kb), 269 269 ATTR_LIST(trigger_fs_error), ··· 315 313 #endif 316 314 EXT4_ATTR_FEATURE(metadata_csum_seed); 317 315 EXT4_ATTR_FEATURE(fast_commit); 316 + EXT4_ATTR_FEATURE(encrypted_casefold); 318 317 319 318 static struct attribute *ext4_feat_attrs[] = { 320 319 ATTR_LIST(lazy_itable_init), ··· 333 330 #endif 334 331 ATTR_LIST(metadata_csum_seed), 335 332 ATTR_LIST(fast_commit), 333 + ATTR_LIST(encrypted_casefold), 336 334 NULL, 337 335 }; 338 336 ATTRIBUTE_GROUPS(ext4_feat); ··· 532 528 ext4_fc_info_show, sb); 533 529 proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, 534 530 &ext4_mb_seq_groups_ops, sb); 531 + proc_create_single_data("mb_stats", 0444, sbi->s_proc, 532 + ext4_seq_mb_stats_show, sb); 533 + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, 534 + &ext4_mb_seq_structs_summary_ops, sb); 535 535 } 536 536 return 0; 537 537 }
+2 -8
fs/ext4/verity.c
··· 45 45 size_t n = min_t(size_t, count, 46 46 PAGE_SIZE - offset_in_page(pos)); 47 47 struct page *page; 48 - void *addr; 49 48 50 49 page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, 51 50 NULL); 52 51 if (IS_ERR(page)) 53 52 return PTR_ERR(page); 54 53 55 - addr = kmap_atomic(page); 56 - memcpy(buf, addr + offset_in_page(pos), n); 57 - kunmap_atomic(addr); 54 + memcpy_from_page(buf, page, offset_in_page(pos), n); 58 55 59 56 put_page(page); 60 57 ··· 77 80 PAGE_SIZE - offset_in_page(pos)); 78 81 struct page *page; 79 82 void *fsdata; 80 - void *addr; 81 83 int res; 82 84 83 85 res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, ··· 84 88 if (res) 85 89 return res; 86 90 87 - addr = kmap_atomic(page); 88 - memcpy(addr + offset_in_page(pos), buf, n); 89 - kunmap_atomic(addr); 91 + memcpy_to_page(page, offset_in_page(pos), buf, n); 90 92 91 93 res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, 92 94 page, fsdata);
+1 -1
fs/ext4/xattr.c
··· 1617 1617 * If storing the value in an external inode is an option, 1618 1618 * reserve space for xattr entries/names in the external 1619 1619 * attribute block so that a long value does not occupy the 1620 - * whole space and prevent futher entries being added. 1620 + * whole space and prevent further entries being added. 1621 1621 */ 1622 1622 if (ext4_has_feature_ea_inode(inode->i_sb) && 1623 1623 new_size && is_block &&
+2 -3
fs/jbd2/recovery.c
··· 245 245 return 0; 246 246 247 247 while (next_fc_block <= journal->j_fc_last) { 248 - jbd_debug(3, "Fast commit replay: next block %ld", 248 + jbd_debug(3, "Fast commit replay: next block %ld\n", 249 249 next_fc_block); 250 250 err = jread(&bh, journal, next_fc_block); 251 251 if (err) { 252 - jbd_debug(3, "Fast commit replay: read error"); 252 + jbd_debug(3, "Fast commit replay: read error\n"); 253 253 break; 254 254 } 255 255 256 - jbd_debug(3, "Processing fast commit blk with seq %d"); 257 256 err = journal->j_fc_replay_callback(journal, bh, pass, 258 257 next_fc_block - journal->j_fc_first, 259 258 expected_commit_id);
+10 -5
fs/jbd2/transaction.c
··· 349 349 } 350 350 351 351 alloc_transaction: 352 - if (!journal->j_running_transaction) { 352 + /* 353 + * This check is racy but it is just an optimization of allocating new 354 + * transaction early if there are high chances we'll need it. If we 355 + * guess wrong, we'll retry or free unused transaction. 356 + */ 357 + if (!data_race(journal->j_running_transaction)) { 353 358 /* 354 359 * If __GFP_FS is not present, then we may be being called from 355 360 * inside the fs writeback layer, so we MUST NOT fail. ··· 1479 1474 * crucial to catch bugs so let's do a reliable check until the 1480 1475 * lockless handling is fully proven. 1481 1476 */ 1482 - if (jh->b_transaction != transaction && 1483 - jh->b_next_transaction != transaction) { 1477 + if (data_race(jh->b_transaction != transaction && 1478 + jh->b_next_transaction != transaction)) { 1484 1479 spin_lock(&jh->b_state_lock); 1485 1480 J_ASSERT_JH(jh, jh->b_transaction == transaction || 1486 1481 jh->b_next_transaction == transaction); ··· 1488 1483 } 1489 1484 if (jh->b_modified == 1) { 1490 1485 /* If it's in our transaction it must be in BJ_Metadata list. */ 1491 - if (jh->b_transaction == transaction && 1492 - jh->b_jlist != BJ_Metadata) { 1486 + if (data_race(jh->b_transaction == transaction && 1487 + jh->b_jlist != BJ_Metadata)) { 1493 1488 spin_lock(&jh->b_state_lock); 1494 1489 if (jh->b_transaction == transaction && 1495 1490 jh->b_jlist != BJ_Metadata)
+8
fs/stat.c
··· 86 86 /* SB_NOATIME means filesystem supplies dummy atime value */ 87 87 if (inode->i_sb->s_flags & SB_NOATIME) 88 88 stat->result_mask &= ~STATX_ATIME; 89 + 90 + /* 91 + * Note: If you add another clause to set an attribute flag, please 92 + * update attributes_mask below. 93 + */ 89 94 if (IS_AUTOMOUNT(inode)) 90 95 stat->attributes |= STATX_ATTR_AUTOMOUNT; 91 96 92 97 if (IS_DAX(inode)) 93 98 stat->attributes |= STATX_ATTR_DAX; 99 + 100 + stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | 101 + STATX_ATTR_DAX); 94 102 95 103 mnt_userns = mnt_user_ns(path->mnt); 96 104 if (inode->i_op->getattr)
+21 -12
include/linux/jbd2.h
··· 61 61 #define jbd_debug(n, fmt, a...) \ 62 62 __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) 63 63 #else 64 - #define jbd_debug(n, fmt, a...) /**/ 64 + #define jbd_debug(n, fmt, a...) no_printk(fmt, ##a) 65 65 #endif 66 66 67 67 extern void *jbd2_alloc(size_t size, gfp_t flags); ··· 594 594 */ 595 595 unsigned long t_log_start; 596 596 597 - /* Number of buffers on the t_buffers list [j_list_lock] */ 597 + /* 598 + * Number of buffers on the t_buffers list [j_list_lock, no locks 599 + * needed for jbd2 thread] 600 + */ 598 601 int t_nr_buffers; 599 602 600 603 /* 601 604 * Doubly-linked circular list of all buffers reserved but not yet 602 - * modified by this transaction [j_list_lock] 605 + * modified by this transaction [j_list_lock, no locks needed fo 606 + * jbd2 thread] 603 607 */ 604 608 struct journal_head *t_reserved_list; 605 609 606 610 /* 607 611 * Doubly-linked circular list of all metadata buffers owned by this 608 - * transaction [j_list_lock] 612 + * transaction [j_list_lock, no locks needed for jbd2 thread] 609 613 */ 610 614 struct journal_head *t_buffers; 611 615 ··· 633 629 struct journal_head *t_checkpoint_io_list; 634 630 635 631 /* 636 - * Doubly-linked circular list of metadata buffers being shadowed by log 637 - * IO. The IO buffers on the iobuf list and the shadow buffers on this 638 - * list match each other one for one at all times. [j_list_lock] 632 + * Doubly-linked circular list of metadata buffers being 633 + * shadowed by log IO. The IO buffers on the iobuf list and 634 + * the shadow buffers on this list match each other one for 635 + * one at all times. 
[j_list_lock, no locks needed for jbd2 636 + * thread] 639 637 */ 640 638 struct journal_head *t_shadow_list; 641 639 ··· 774 768 struct journal_s 775 769 { 776 770 /** 777 - * @j_flags: General journaling state flags [j_state_lock] 771 + * @j_flags: General journaling state flags [j_state_lock, 772 + * no lock for quick racy checks] 778 773 */ 779 774 unsigned long j_flags; 780 775 ··· 815 808 /** 816 809 * @j_barrier_count: 817 810 * 818 - * Number of processes waiting to create a barrier lock [j_state_lock] 811 + * Number of processes waiting to create a barrier lock [j_state_lock, 812 + * no lock for quick racy checks] 819 813 */ 820 814 int j_barrier_count; 821 815 ··· 829 821 * @j_running_transaction: 830 822 * 831 823 * Transactions: The current running transaction... 832 - * [j_state_lock] [caller holding open handle] 824 + * [j_state_lock, no lock for quick racy checks] [caller holding 825 + * open handle] 833 826 */ 834 827 transaction_t *j_running_transaction; 835 828 ··· 1042 1033 * @j_commit_sequence: 1043 1034 * 1044 1035 * Sequence number of the most recently committed transaction 1045 - * [j_state_lock]. 1036 + * [j_state_lock, no lock for quick racy checks] 1046 1037 */ 1047 1038 tid_t j_commit_sequence; 1048 1039 ··· 1050 1041 * @j_commit_request: 1051 1042 * 1052 1043 * Sequence number of the most recent transaction wanting commit 1053 - * [j_state_lock] 1044 + * [j_state_lock, no lock for quick racy checks] 1054 1045 */ 1055 1046 tid_t j_commit_request; 1056 1047
-176
include/trace/events/ext4.h
··· 1358 1358 __entry->group, __entry->prefetch) 1359 1359 ); 1360 1360 1361 - TRACE_EVENT(ext4_direct_IO_enter, 1362 - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), 1363 - 1364 - TP_ARGS(inode, offset, len, rw), 1365 - 1366 - TP_STRUCT__entry( 1367 - __field( dev_t, dev ) 1368 - __field( ino_t, ino ) 1369 - __field( loff_t, pos ) 1370 - __field( unsigned long, len ) 1371 - __field( int, rw ) 1372 - ), 1373 - 1374 - TP_fast_assign( 1375 - __entry->dev = inode->i_sb->s_dev; 1376 - __entry->ino = inode->i_ino; 1377 - __entry->pos = offset; 1378 - __entry->len = len; 1379 - __entry->rw = rw; 1380 - ), 1381 - 1382 - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", 1383 - MAJOR(__entry->dev), MINOR(__entry->dev), 1384 - (unsigned long) __entry->ino, 1385 - __entry->pos, __entry->len, __entry->rw) 1386 - ); 1387 - 1388 - TRACE_EVENT(ext4_direct_IO_exit, 1389 - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, 1390 - int rw, int ret), 1391 - 1392 - TP_ARGS(inode, offset, len, rw, ret), 1393 - 1394 - TP_STRUCT__entry( 1395 - __field( dev_t, dev ) 1396 - __field( ino_t, ino ) 1397 - __field( loff_t, pos ) 1398 - __field( unsigned long, len ) 1399 - __field( int, rw ) 1400 - __field( int, ret ) 1401 - ), 1402 - 1403 - TP_fast_assign( 1404 - __entry->dev = inode->i_sb->s_dev; 1405 - __entry->ino = inode->i_ino; 1406 - __entry->pos = offset; 1407 - __entry->len = len; 1408 - __entry->rw = rw; 1409 - __entry->ret = ret; 1410 - ), 1411 - 1412 - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", 1413 - MAJOR(__entry->dev), MINOR(__entry->dev), 1414 - (unsigned long) __entry->ino, 1415 - __entry->pos, __entry->len, 1416 - __entry->rw, __entry->ret) 1417 - ); 1418 - 1419 1361 DECLARE_EVENT_CLASS(ext4__fallocate_mode, 1420 1362 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1421 1363 ··· 1902 1960 MAJOR(__entry->dev), MINOR(__entry->dev), 1903 1961 __entry->lblk, (unsigned long long) __entry->pblk, 1904 
1962 __entry->len, show_mflags(__entry->flags), __entry->ret) 1905 - ); 1906 - 1907 - TRACE_EVENT(ext4_ext_put_in_cache, 1908 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, 1909 - ext4_fsblk_t start), 1910 - 1911 - TP_ARGS(inode, lblk, len, start), 1912 - 1913 - TP_STRUCT__entry( 1914 - __field( dev_t, dev ) 1915 - __field( ino_t, ino ) 1916 - __field( ext4_lblk_t, lblk ) 1917 - __field( unsigned int, len ) 1918 - __field( ext4_fsblk_t, start ) 1919 - ), 1920 - 1921 - TP_fast_assign( 1922 - __entry->dev = inode->i_sb->s_dev; 1923 - __entry->ino = inode->i_ino; 1924 - __entry->lblk = lblk; 1925 - __entry->len = len; 1926 - __entry->start = start; 1927 - ), 1928 - 1929 - TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", 1930 - MAJOR(__entry->dev), MINOR(__entry->dev), 1931 - (unsigned long) __entry->ino, 1932 - (unsigned) __entry->lblk, 1933 - __entry->len, 1934 - (unsigned long long) __entry->start) 1935 - ); 1936 - 1937 - TRACE_EVENT(ext4_ext_in_cache, 1938 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), 1939 - 1940 - TP_ARGS(inode, lblk, ret), 1941 - 1942 - TP_STRUCT__entry( 1943 - __field( dev_t, dev ) 1944 - __field( ino_t, ino ) 1945 - __field( ext4_lblk_t, lblk ) 1946 - __field( int, ret ) 1947 - ), 1948 - 1949 - TP_fast_assign( 1950 - __entry->dev = inode->i_sb->s_dev; 1951 - __entry->ino = inode->i_ino; 1952 - __entry->lblk = lblk; 1953 - __entry->ret = ret; 1954 - ), 1955 - 1956 - TP_printk("dev %d,%d ino %lu lblk %u ret %d", 1957 - MAJOR(__entry->dev), MINOR(__entry->dev), 1958 - (unsigned long) __entry->ino, 1959 - (unsigned) __entry->lblk, 1960 - __entry->ret) 1961 - 1962 - ); 1963 - 1964 - TRACE_EVENT(ext4_find_delalloc_range, 1965 - TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, 1966 - int reverse, int found, ext4_lblk_t found_blk), 1967 - 1968 - TP_ARGS(inode, from, to, reverse, found, found_blk), 1969 - 1970 - TP_STRUCT__entry( 1971 - __field( dev_t, dev ) 1972 - __field( ino_t, ino ) 1973 - 
__field( ext4_lblk_t, from ) 1974 - __field( ext4_lblk_t, to ) 1975 - __field( int, reverse ) 1976 - __field( int, found ) 1977 - __field( ext4_lblk_t, found_blk ) 1978 - ), 1979 - 1980 - TP_fast_assign( 1981 - __entry->dev = inode->i_sb->s_dev; 1982 - __entry->ino = inode->i_ino; 1983 - __entry->from = from; 1984 - __entry->to = to; 1985 - __entry->reverse = reverse; 1986 - __entry->found = found; 1987 - __entry->found_blk = found_blk; 1988 - ), 1989 - 1990 - TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " 1991 - "(blk = %u)", 1992 - MAJOR(__entry->dev), MINOR(__entry->dev), 1993 - (unsigned long) __entry->ino, 1994 - (unsigned) __entry->from, (unsigned) __entry->to, 1995 - __entry->reverse, __entry->found, 1996 - (unsigned) __entry->found_blk) 1997 - ); 1998 - 1999 - TRACE_EVENT(ext4_get_reserved_cluster_alloc, 2000 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), 2001 - 2002 - TP_ARGS(inode, lblk, len), 2003 - 2004 - TP_STRUCT__entry( 2005 - __field( dev_t, dev ) 2006 - __field( ino_t, ino ) 2007 - __field( ext4_lblk_t, lblk ) 2008 - __field( unsigned int, len ) 2009 - ), 2010 - 2011 - TP_fast_assign( 2012 - __entry->dev = inode->i_sb->s_dev; 2013 - __entry->ino = inode->i_ino; 2014 - __entry->lblk = lblk; 2015 - __entry->len = len; 2016 - ), 2017 - 2018 - TP_printk("dev %d,%d ino %lu lblk %u len %u", 2019 - MAJOR(__entry->dev), MINOR(__entry->dev), 2020 - (unsigned long) __entry->ino, 2021 - (unsigned) __entry->lblk, 2022 - __entry->len) 2023 1963 ); 2024 1964 2025 1965 TRACE_EVENT(ext4_ext_show_extent,