Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"New features for ext4 this cycle include support for encrypted
casefold. We also now ensure that deleted file names are cleared in
directory blocks by zeroing directory entries when they are unlinked
or moved as part of a hash tree node split, and improve the block
allocator's performance on a freshly mounted file system by
prefetching block bitmaps.

There are also the usual cleanups and bug fixes, including fixing a
page cache invalidation race when there is mixed buffered and direct
I/O and the block size is less than page size, and allowing the dax
flag to be set and cleared on inline directories"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits)
ext4: wipe ext4_dir_entry2 upon file deletion
ext4: Fix occasional generic/418 failure
fs: fix reporting supported extra file attributes for statx()
ext4: allow the dax flag to be set and cleared on inline directories
ext4: fix debug format string warning
ext4: fix trailing whitespace
ext4: fix various seppling typos
ext4: fix error return code in ext4_fc_perform_commit()
ext4: annotate data race in jbd2_journal_dirty_metadata()
ext4: annotate data race in start_this_handle()
ext4: fix ext4_error_err save negative errno into superblock
ext4: fix error code in ext4_commit_super
ext4: always panic when errors=panic is specified
ext4: delete redundant uptodate check for buffer
ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()
ext4: make prefetch_block_bitmaps default
ext4: add proc files to monitor new structures
ext4: improve cr 0 / cr 1 group scanning
ext4: add MB_NUM_ORDERS macro
ext4: add mballoc stats proc file
...

+1144 -427
+27
Documentation/filesystems/ext4/directory.rst
··· 121 121 * - 0x7 122 122 - Symbolic link. 123 123 124 + To support directories that are both encrypted and casefolded directories, we 125 + must also include hash information in the directory entry. We append 126 + ``ext4_extended_dir_entry_2`` to ``ext4_dir_entry_2`` except for the entries 127 + for dot and dotdot, which are kept the same. The structure follows immediately 128 + after ``name`` and is included in the size listed by ``rec_len`` If a directory 129 + entry uses this extension, it may be up to 271 bytes. 130 + 131 + .. list-table:: 132 + :widths: 8 8 24 40 133 + :header-rows: 1 134 + 135 + * - Offset 136 + - Size 137 + - Name 138 + - Description 139 + * - 0x0 140 + - \_\_le32 141 + - hash 142 + - The hash of the directory name 143 + * - 0x4 144 + - \_\_le32 145 + - minor\_hash 146 + - The minor hash of the directory name 147 + 148 + 124 149 In order to add checksums to these classic directory blocks, a phony 125 150 ``struct ext4_dir_entry`` is placed at the end of each leaf block to 126 151 hold the checksum. The directory entry is 12 bytes long. The inode ··· 347 322 - Half MD4, unsigned. 348 323 * - 0x5 349 324 - Tea, unsigned. 325 + * - 0x6 326 + - Siphash. 350 327 351 328 Interior nodes of an htree are recorded as ``struct dx_node``, which is 352 329 also the full length of a data block:
+1 -1
fs/ext4/balloc.c
··· 239 239 ext4_group_t block_group, 240 240 struct ext4_group_desc *gdp) 241 241 { 242 - return num_clusters_in_group(sb, block_group) - 242 + return num_clusters_in_group(sb, block_group) - 243 243 ext4_num_overhead_clusters(sb, block_group, gdp); 244 244 } 245 245
+30 -11
fs/ext4/dir.c
··· 55 55 return 0; 56 56 } 57 57 58 + static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) 59 + { 60 + /* Check if . or .. , or skip if namelen is 0 */ 61 + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && 62 + (de->name[1] == '.' || de->name[1] == '\0')) 63 + return true; 64 + /* Check if this is a csum entry */ 65 + if (de->file_type == EXT4_FT_DIR_CSUM) 66 + return true; 67 + return false; 68 + } 69 + 58 70 /* 59 71 * Return 0 if the directory entry is OK, and 1 if there is a problem 60 72 * ··· 85 73 const int rlen = ext4_rec_len_from_disk(de->rec_len, 86 74 dir->i_sb->s_blocksize); 87 75 const int next_offset = ((char *) de - buf) + rlen; 76 + bool fake = is_fake_dir_entry(de); 77 + bool has_csum = ext4_has_metadata_csum(dir->i_sb); 88 78 89 - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) 79 + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) 90 80 error_msg = "rec_len is smaller than minimal"; 91 81 else if (unlikely(rlen % 4 != 0)) 92 82 error_msg = "rec_len % 4 != 0"; 93 - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 83 + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, 84 + fake ? NULL : dir))) 94 85 error_msg = "rec_len is too small for name_len"; 95 86 else if (unlikely(next_offset > size)) 96 87 error_msg = "directory entry overrun"; 97 - else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && 88 + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, 89 + has_csum ? 
NULL : dir) && 98 90 next_offset != size)) 99 91 error_msg = "directory entry too close to block end"; 100 92 else if (unlikely(le32_to_cpu(de->inode) > ··· 110 94 if (filp) 111 95 ext4_error_file(filp, function, line, bh->b_blocknr, 112 96 "bad entry in directory: %s - offset=%u, " 113 - "inode=%u, rec_len=%d, name_len=%d, size=%d", 97 + "inode=%u, rec_len=%d, size=%d fake=%d", 114 98 error_msg, offset, le32_to_cpu(de->inode), 115 - rlen, de->name_len, size); 99 + rlen, size, fake); 116 100 else 117 101 ext4_error_inode(dir, function, line, bh->b_blocknr, 118 102 "bad entry in directory: %s - offset=%u, " 119 - "inode=%u, rec_len=%d, name_len=%d, size=%d", 103 + "inode=%u, rec_len=%d, size=%d fake=%d", 120 104 error_msg, offset, le32_to_cpu(de->inode), 121 - rlen, de->name_len, size); 105 + rlen, size, fake); 122 106 123 107 return 1; 124 108 } ··· 140 124 141 125 if (is_dx_dir(inode)) { 142 126 err = ext4_dx_readdir(file, ctx); 143 - if (err != ERR_BAD_DX_DIR) { 127 + if (err != ERR_BAD_DX_DIR) 144 128 return err; 145 - } 129 + 146 130 /* Can we just clear INDEX flag to ignore htree information? */ 147 131 if (!ext4_has_metadata_csum(sb)) { 148 132 /* ··· 240 224 * failure will be detected in the 241 225 * dirent test below. */ 242 226 if (ext4_rec_len_from_disk(de->rec_len, 243 - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) 227 + sb->s_blocksize) < ext4_dir_rec_len(1, 228 + inode)) 244 229 break; 245 230 i += ext4_rec_len_from_disk(de->rec_len, 246 231 sb->s_blocksize); ··· 282 265 283 266 /* Directory is encrypted */ 284 267 err = fscrypt_fname_disk_to_usr(inode, 285 - 0, 0, &de_name, &fstr); 268 + EXT4_DIRENT_HASH(de), 269 + EXT4_DIRENT_MINOR_HASH(de), 270 + &de_name, &fstr); 286 271 de_name = fstr; 287 272 fstr.len = save_len; 288 273 if (err)
+85 -22
fs/ext4/ext4.h
··· 162 162 #define EXT4_MB_USE_RESERVED 0x2000 163 163 /* Do strict check for free blocks while retrying block allocation */ 164 164 #define EXT4_MB_STRICT_CHECK 0x4000 165 - 165 + /* Large fragment size list lookup succeeded at least once for cr = 0 */ 166 + #define EXT4_MB_CR0_OPTIMIZED 0x8000 167 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ 168 + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 169 + /* Perform linear traversal for one group */ 170 + #define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 166 171 struct ext4_allocation_request { 167 172 /* target inode for block we're allocating */ 168 173 struct inode *inode; ··· 1218 1213 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 1219 1214 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 1220 1215 #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ 1221 - #define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 1216 + #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 1222 1217 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 1223 1218 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 1224 1219 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ ··· 1243 1238 #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ 1244 1239 #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ 1245 1240 #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ 1246 - 1241 + #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group 1242 + * scanning in mballoc 1243 + */ 1247 1244 1248 1245 #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1249 1246 ~EXT4_MOUNT_##opt ··· 1526 1519 unsigned int s_mb_free_pending; 1527 1520 struct list_head s_freed_data_list; /* List of blocks to be freed 1528 1521 after commit completed */ 1522 + struct rb_root s_mb_avg_fragment_size_root; 1523 + rwlock_t s_mb_rb_lock; 1524 + 
struct list_head *s_mb_largest_free_orders; 1525 + rwlock_t *s_mb_largest_free_orders_locks; 1529 1526 1530 1527 /* tunables */ 1531 1528 unsigned long s_stripe; 1529 + unsigned int s_mb_max_linear_groups; 1532 1530 unsigned int s_mb_stream_request; 1533 1531 unsigned int s_mb_max_to_scan; 1534 1532 unsigned int s_mb_min_to_scan; ··· 1553 1541 atomic_t s_bal_success; /* we found long enough chunks */ 1554 1542 atomic_t s_bal_allocated; /* in blocks */ 1555 1543 atomic_t s_bal_ex_scanned; /* total extents scanned */ 1544 + atomic_t s_bal_groups_scanned; /* number of groups scanned */ 1556 1545 atomic_t s_bal_goals; /* goal hits */ 1557 1546 atomic_t s_bal_breaks; /* too long searches */ 1558 1547 atomic_t s_bal_2orders; /* 2^order hits */ 1559 - spinlock_t s_bal_lock; 1560 - unsigned long s_mb_buddies_generated; 1561 - unsigned long long s_mb_generation_time; 1548 + atomic_t s_bal_cr0_bad_suggestions; 1549 + atomic_t s_bal_cr1_bad_suggestions; 1550 + atomic64_t s_bal_cX_groups_considered[4]; 1551 + atomic64_t s_bal_cX_hits[4]; 1552 + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ 1553 + atomic_t s_mb_buddies_generated; /* number of buddies generated */ 1554 + atomic64_t s_mb_generation_time; 1562 1555 atomic_t s_mb_lost_chunks; 1563 1556 atomic_t s_mb_preallocated; 1564 1557 atomic_t s_mb_discarded; ··· 2204 2187 char name[EXT4_NAME_LEN]; /* File name */ 2205 2188 }; 2206 2189 2190 + 2191 + /* 2192 + * Encrypted Casefolded entries require saving the hash on disk. This structure 2193 + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned 2194 + * boundary. 2195 + */ 2196 + struct ext4_dir_entry_hash { 2197 + __le32 hash; 2198 + __le32 minor_hash; 2199 + }; 2200 + 2207 2201 /* 2208 2202 * The new version of the directory entry. 
Since EXT4 structures are 2209 2203 * stored in intel byte order, and the name_len field could never be ··· 2228 2200 __u8 file_type; /* See file type macros EXT4_FT_* below */ 2229 2201 char name[EXT4_NAME_LEN]; /* File name */ 2230 2202 }; 2203 + 2204 + /* 2205 + * Access the hashes at the end of ext4_dir_entry_2 2206 + */ 2207 + #define EXT4_DIRENT_HASHES(entry) \ 2208 + ((struct ext4_dir_entry_hash *) \ 2209 + (((void *)(entry)) + \ 2210 + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) 2211 + #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) 2212 + #define EXT4_DIRENT_MINOR_HASH(entry) \ 2213 + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) 2214 + 2215 + static inline bool ext4_hash_in_dirent(const struct inode *inode) 2216 + { 2217 + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); 2218 + } 2231 2219 2232 2220 /* 2233 2221 * This is a bogus directory entry at the end of each leaf block that ··· 2286 2242 */ 2287 2243 #define EXT4_DIR_PAD 4 2288 2244 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) 2289 - #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ 2290 - ~EXT4_DIR_ROUND) 2291 2245 #define EXT4_MAX_REC_LEN ((1<<16)-1) 2246 + 2247 + /* 2248 + * The rec_len is dependent on the type of directory. Directories that are 2249 + * casefolded and encrypted need to store the hash as well, so we add room for 2250 + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should 2251 + * pass NULL for dir, as those entries do not use the extra fields. 
2252 + */ 2253 + static inline unsigned int ext4_dir_rec_len(__u8 name_len, 2254 + const struct inode *dir) 2255 + { 2256 + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); 2257 + 2258 + if (dir && ext4_hash_in_dirent(dir)) 2259 + rec_len += sizeof(struct ext4_dir_entry_hash); 2260 + return (rec_len & ~EXT4_DIR_ROUND); 2261 + } 2292 2262 2293 2263 /* 2294 2264 * If we ever get support for fs block sizes > page_size, we'll need ··· 2360 2302 #define DX_HASH_LEGACY_UNSIGNED 3 2361 2303 #define DX_HASH_HALF_MD4_UNSIGNED 4 2362 2304 #define DX_HASH_TEA_UNSIGNED 5 2305 + #define DX_HASH_SIPHASH 6 2363 2306 2364 2307 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, 2365 2308 const void *address, unsigned int length) ··· 2415 2356 }; 2416 2357 2417 2358 #define fname_name(p) ((p)->disk_name.name) 2359 + #define fname_usr_name(p) ((p)->usr_fname->name) 2418 2360 #define fname_len(p) ((p)->disk_name.len) 2419 2361 2420 2362 /* ··· 2646 2586 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 2647 2587 2648 2588 #ifdef CONFIG_UNICODE 2649 - extern void ext4_fname_setup_ci_filename(struct inode *dir, 2589 + extern int ext4_fname_setup_ci_filename(struct inode *dir, 2650 2590 const struct qstr *iname, 2651 - struct fscrypt_str *fname); 2591 + struct ext4_filename *fname); 2652 2592 #endif 2653 2593 2654 2594 #ifdef CONFIG_FS_ENCRYPTION ··· 2679 2619 ext4_fname_from_fscrypt_name(fname, &name); 2680 2620 2681 2621 #ifdef CONFIG_UNICODE 2682 - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); 2622 + err = ext4_fname_setup_ci_filename(dir, iname, fname); 2683 2623 #endif 2684 - return 0; 2624 + return err; 2685 2625 } 2686 2626 2687 2627 static inline int ext4_fname_prepare_lookup(struct inode *dir, ··· 2698 2638 ext4_fname_from_fscrypt_name(fname, &name); 2699 2639 2700 2640 #ifdef CONFIG_UNICODE 2701 - ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); 2641 + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); 2702 2642 
#endif 2703 - return 0; 2643 + return err; 2704 2644 } 2705 2645 2706 2646 static inline void ext4_fname_free_filename(struct ext4_filename *fname) ··· 2725 2665 int lookup, 2726 2666 struct ext4_filename *fname) 2727 2667 { 2668 + int err = 0; 2728 2669 fname->usr_fname = iname; 2729 2670 fname->disk_name.name = (unsigned char *) iname->name; 2730 2671 fname->disk_name.len = iname->len; 2731 2672 2732 2673 #ifdef CONFIG_UNICODE 2733 - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); 2674 + err = ext4_fname_setup_ci_filename(dir, iname, fname); 2734 2675 #endif 2735 2676 2736 - return 0; 2677 + return err; 2737 2678 } 2738 2679 2739 2680 static inline int ext4_fname_prepare_lookup(struct inode *dir, ··· 2759 2698 struct ext4_dir_entry_2 *, 2760 2699 struct buffer_head *, char *, int, 2761 2700 unsigned int); 2762 - #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ 2701 + #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ 2763 2702 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 2764 - (de), (bh), (buf), (size), (offset))) 2703 + (de), (bh), (buf), (size), (offset))) 2765 2704 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 2766 2705 __u32 minor_hash, 2767 2706 struct ext4_dir_entry_2 *dirent, ··· 2772 2711 void *buf, int buf_size, 2773 2712 struct ext4_filename *fname, 2774 2713 struct ext4_dir_entry_2 **dest_de); 2775 - void ext4_insert_dentry(struct inode *inode, 2714 + void ext4_insert_dentry(struct inode *dir, struct inode *inode, 2776 2715 struct ext4_dir_entry_2 *de, 2777 2716 int buf_size, 2778 2717 struct ext4_filename *fname); ··· 2863 2802 2864 2803 /* mballoc.c */ 2865 2804 extern const struct seq_operations ext4_mb_seq_groups_ops; 2805 + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; 2866 2806 extern long ext4_mb_stats; 2867 2807 extern long ext4_mb_max_to_scan; 2808 + extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); 2868 
2809 extern int ext4_mb_init(struct super_block *); 2869 2810 extern int ext4_mb_release(struct super_block *); 2870 2811 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, ··· 3369 3306 ext4_grpblk_t bb_free; /* total free blocks */ 3370 3307 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 3371 3308 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ 3309 + ext4_group_t bb_group; /* Group number */ 3372 3310 struct list_head bb_prealloc_list; 3373 3311 #ifdef DOUBLE_CHECK 3374 3312 void *bb_bitmap; 3375 3313 #endif 3376 3314 struct rw_semaphore alloc_sem; 3315 + struct rb_node bb_avg_fragment_size_rb; 3316 + struct list_head bb_largest_free_order_node; 3377 3317 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block 3378 3318 * regions, index is order. 3379 3319 * bb_counters[3] = 5 means ··· 3579 3513 unsigned int blocksize); 3580 3514 extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, 3581 3515 struct buffer_head *bh); 3582 - extern int ext4_ci_compare(const struct inode *parent, 3583 - const struct qstr *fname, 3584 - const struct qstr *entry, bool quick); 3585 3516 extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, 3586 3517 struct inode *inode); 3587 3518 extern int __ext4_link(struct inode *dir, struct inode *inode,
+5 -3
fs/ext4/fast_commit.c
··· 66 66 * Fast Commit Ineligibility 67 67 * ------------------------- 68 68 * Not all operations are supported by fast commits today (e.g extended 69 - * attributes). Fast commit ineligiblity is marked by calling one of the 69 + * attributes). Fast commit ineligibility is marked by calling one of the 70 70 * two following functions: 71 71 * 72 72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall ··· 1088 1088 head.fc_tid = cpu_to_le32( 1089 1089 sbi->s_journal->j_running_transaction->t_tid); 1090 1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1091 - (u8 *)&head, &crc)) 1091 + (u8 *)&head, &crc)) { 1092 + ret = -ENOSPC; 1092 1093 goto out; 1094 + } 1093 1095 } 1094 1096 1095 1097 spin_lock(&sbi->s_fc_lock); ··· 1736 1734 } 1737 1735 1738 1736 /* Range is mapped and needs a state change */ 1739 - jbd_debug(1, "Converting from %d to %d %lld", 1737 + jbd_debug(1, "Converting from %ld to %d %lld", 1740 1738 map.m_flags & EXT4_MAP_UNWRITTEN, 1741 1739 ext4_ext_is_unwritten(ex), map.m_pblk); 1742 1740 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+21 -4
fs/ext4/file.c
··· 371 371 static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, 372 372 int error, unsigned int flags) 373 373 { 374 - loff_t offset = iocb->ki_pos; 374 + loff_t pos = iocb->ki_pos; 375 375 struct inode *inode = file_inode(iocb->ki_filp); 376 376 377 377 if (error) 378 378 return error; 379 379 380 - if (size && flags & IOMAP_DIO_UNWRITTEN) 381 - return ext4_convert_unwritten_extents(NULL, inode, 382 - offset, size); 380 + if (size && flags & IOMAP_DIO_UNWRITTEN) { 381 + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); 382 + if (error < 0) 383 + return error; 384 + } 385 + /* 386 + * If we are extending the file, we have to update i_size here before 387 + * page cache gets invalidated in iomap_dio_rw(). Otherwise racing 388 + * buffered reads could zero out too much from page cache pages. Update 389 + * of on-disk size will happen later in ext4_dio_write_iter() where 390 + * we have enough information to also perform orphan list handling etc. 391 + * Note that we perform all extending writes synchronously under 392 + * i_rwsem held exclusively so i_size update is safe here in that case. 393 + * If the write was not extending, we cannot see pos > i_size here 394 + * because operations reducing i_size like truncate wait for all 395 + * outstanding DIO before updating i_size. 396 + */ 397 + pos += size; 398 + if (pos > i_size_read(inode)) 399 + i_size_write(inode, pos); 383 400 384 401 return 0; 385 402 }
+21 -4
fs/ext4/hash.c
··· 197 197 * represented, and whether or not the returned hash is 32 bits or 64 198 198 * bits. 32 bit hashes will return 0 for the minor hash. 199 199 */ 200 - static int __ext4fs_dirhash(const char *name, int len, 200 + static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, 201 201 struct dx_hash_info *hinfo) 202 202 { 203 203 __u32 hash; ··· 259 259 hash = buf[0]; 260 260 minor_hash = buf[1]; 261 261 break; 262 + case DX_HASH_SIPHASH: 263 + { 264 + struct qstr qname = QSTR_INIT(name, len); 265 + __u64 combined_hash; 266 + 267 + if (fscrypt_has_encryption_key(dir)) { 268 + combined_hash = fscrypt_fname_siphash(dir, &qname); 269 + } else { 270 + ext4_warning_inode(dir, "Siphash requires key"); 271 + return -1; 272 + } 273 + 274 + hash = (__u32)(combined_hash >> 32); 275 + minor_hash = (__u32)combined_hash; 276 + break; 277 + } 262 278 default: 263 279 hinfo->hash = 0; 264 280 return -1; ··· 296 280 unsigned char *buff; 297 281 struct qstr qstr = {.name = name, .len = len }; 298 282 299 - if (len && IS_CASEFOLDED(dir) && um) { 283 + if (len && IS_CASEFOLDED(dir) && um && 284 + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { 300 285 buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); 301 286 if (!buff) 302 287 return -ENOMEM; ··· 308 291 goto opaque_seq; 309 292 } 310 293 311 - r = __ext4fs_dirhash(buff, dlen, hinfo); 294 + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); 312 295 313 296 kfree(buff); 314 297 return r; 315 298 } 316 299 opaque_seq: 317 300 #endif 318 - return __ext4fs_dirhash(name, len, hinfo); 301 + return __ext4fs_dirhash(dir, name, len, hinfo); 319 302 }
+33 -16
fs/ext4/ialloc.c
··· 1292 1292 1293 1293 ei->i_extra_isize = sbi->s_want_extra_isize; 1294 1294 ei->i_inline_off = 0; 1295 - if (ext4_has_feature_inline_data(sb)) 1295 + if (ext4_has_feature_inline_data(sb) && 1296 + (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) 1296 1297 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1297 1298 ret = inode; 1298 1299 err = dquot_alloc_inode(inode); ··· 1514 1513 handle_t *handle; 1515 1514 ext4_fsblk_t blk; 1516 1515 int num, ret = 0, used_blks = 0; 1516 + unsigned long used_inos = 0; 1517 1517 1518 1518 /* This should not happen, but just to be sure check this */ 1519 1519 if (sb_rdonly(sb)) { ··· 1545 1543 * used inodes so we need to skip blocks with used inodes in 1546 1544 * inode table. 1547 1545 */ 1548 - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) 1549 - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - 1550 - ext4_itable_unused_count(sb, gdp)), 1551 - sbi->s_inodes_per_block); 1546 + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { 1547 + used_inos = EXT4_INODES_PER_GROUP(sb) - 1548 + ext4_itable_unused_count(sb, gdp); 1549 + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); 1552 1550 1553 - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || 1554 - ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - 1555 - ext4_itable_unused_count(sb, gdp)) < 1556 - EXT4_FIRST_INO(sb)))) { 1557 - ext4_error(sb, "Something is wrong with group %u: " 1558 - "used itable blocks: %d; " 1559 - "itable unused count: %u", 1560 - group, used_blks, 1561 - ext4_itable_unused_count(sb, gdp)); 1562 - ret = 1; 1563 - goto err_out; 1551 + /* Bogus inode unused count? 
*/ 1552 + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { 1553 + ext4_error(sb, "Something is wrong with group %u: " 1554 + "used itable blocks: %d; " 1555 + "itable unused count: %u", 1556 + group, used_blks, 1557 + ext4_itable_unused_count(sb, gdp)); 1558 + ret = 1; 1559 + goto err_out; 1560 + } 1561 + 1562 + used_inos += group * EXT4_INODES_PER_GROUP(sb); 1563 + /* 1564 + * Are there some uninitialized inodes in the inode table 1565 + * before the first normal inode? 1566 + */ 1567 + if ((used_blks != sbi->s_itb_per_group) && 1568 + (used_inos < EXT4_FIRST_INO(sb))) { 1569 + ext4_error(sb, "Something is wrong with group %u: " 1570 + "itable unused count: %u; " 1571 + "itables initialized count: %ld", 1572 + group, ext4_itable_unused_count(sb, gdp), 1573 + used_inos); 1574 + ret = 1; 1575 + goto err_out; 1576 + } 1564 1577 } 1565 1578 1566 1579 blk = ext4_inode_table(sb, gdp) + used_blks;
+1 -1
fs/ext4/indirect.c
··· 705 705 706 706 /* 707 707 * Truncate transactions can be complex and absolutely huge. So we need to 708 - * be able to restart the transaction at a conventient checkpoint to make 708 + * be able to restart the transaction at a convenient checkpoint to make 709 709 * sure we don't overflow the journal. 710 710 * 711 711 * Try to extend this transaction for the purposes of truncation. If
+16 -11
fs/ext4/inline.c
··· 795 795 * clear the inode state safely. 796 796 * 2. The inode has inline data, then we need to read the data, make it 797 797 * update and dirty so that ext4_da_writepages can handle it. We don't 798 - * need to start the journal since the file's metatdata isn't changed now. 798 + * need to start the journal since the file's metadata isn't changed now. 799 799 */ 800 800 static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, 801 801 struct inode *inode, ··· 1031 1031 err = ext4_journal_get_write_access(handle, iloc->bh); 1032 1032 if (err) 1033 1033 return err; 1034 - ext4_insert_dentry(inode, de, inline_size, fname); 1034 + ext4_insert_dentry(dir, inode, de, inline_size, fname); 1035 1035 1036 1036 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 1037 1037 ··· 1100 1100 int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; 1101 1101 int new_size = get_max_inline_xattr_value_size(dir, iloc); 1102 1102 1103 - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) 1103 + if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) 1104 1104 return -ENOSPC; 1105 1105 1106 1106 ret = ext4_update_inline_data(handle, dir, ··· 1380 1380 fake.name_len = 1; 1381 1381 strcpy(fake.name, "."); 1382 1382 fake.rec_len = ext4_rec_len_to_disk( 1383 - EXT4_DIR_REC_LEN(fake.name_len), 1384 - inline_size); 1383 + ext4_dir_rec_len(fake.name_len, NULL), 1384 + inline_size); 1385 1385 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1386 1386 de = &fake; 1387 1387 pos = EXT4_INLINE_DOTDOT_OFFSET; ··· 1390 1390 fake.name_len = 2; 1391 1391 strcpy(fake.name, ".."); 1392 1392 fake.rec_len = ext4_rec_len_to_disk( 1393 - EXT4_DIR_REC_LEN(fake.name_len), 1394 - inline_size); 1393 + ext4_dir_rec_len(fake.name_len, NULL), 1394 + inline_size); 1395 1395 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1396 1396 de = &fake; 1397 1397 pos = EXT4_INLINE_DOTDOT_SIZE; ··· 1406 1406 } 1407 1407 } 1408 1408 1409 - ext4fs_dirhash(dir, de->name, de->name_len, 
hinfo); 1409 + if (ext4_hash_in_dirent(dir)) { 1410 + hinfo->hash = EXT4_DIRENT_HASH(de); 1411 + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); 1412 + } else { 1413 + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1414 + } 1410 1415 if ((hinfo->hash < start_hash) || 1411 1416 ((hinfo->hash == start_hash) && 1412 1417 (hinfo->minor_hash < start_minor_hash))) ··· 1493 1488 * So we will use extra_offset and extra_size to indicate them 1494 1489 * during the inline dir iteration. 1495 1490 */ 1496 - dotdot_offset = EXT4_DIR_REC_LEN(1); 1497 - dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); 1491 + dotdot_offset = ext4_dir_rec_len(1, NULL); 1492 + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); 1498 1493 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; 1499 1494 extra_size = extra_offset + inline_size; 1500 1495 ··· 1529 1524 * failure will be detected in the 1530 1525 * dirent test below. */ 1531 1526 if (ext4_rec_len_from_disk(de->rec_len, extra_size) 1532 - < EXT4_DIR_REC_LEN(1)) 1527 + < ext4_dir_rec_len(1, NULL)) 1533 1528 break; 1534 1529 i += ext4_rec_len_from_disk(de->rec_len, 1535 1530 extra_size);
+3 -5
fs/ext4/inode.c
··· 1066 1066 block_end = block_start + blocksize; 1067 1067 if (block_end <= from || block_start >= to) { 1068 1068 if (PageUptodate(page)) { 1069 - if (!buffer_uptodate(bh)) 1070 - set_buffer_uptodate(bh); 1069 + set_buffer_uptodate(bh); 1071 1070 } 1072 1071 continue; 1073 1072 } ··· 1091 1092 } 1092 1093 } 1093 1094 if (PageUptodate(page)) { 1094 - if (!buffer_uptodate(bh)) 1095 - set_buffer_uptodate(bh); 1095 + set_buffer_uptodate(bh); 1096 1096 continue; 1097 1097 } 1098 1098 if (!buffer_uptodate(bh) && !buffer_delay(bh) && ··· 3822 3824 * starting from file offset 'from'. The range to be zero'd must 3823 3825 * be contained with in one block. If the specified range exceeds 3824 3826 * the end of the block it will be shortened to end of the block 3825 - * that cooresponds to 'from' 3827 + * that corresponds to 'from' 3826 3828 */ 3827 3829 static int ext4_block_zero_page_range(handle_t *handle, 3828 3830 struct address_space *mapping, loff_t from, loff_t length)
+6
fs/ext4/ioctl.c
··· 316 316 static bool dax_compatible(struct inode *inode, unsigned int oldflags, 317 317 unsigned int flags) 318 318 { 319 + /* Allow the DAX flag to be changed on inline directories */ 320 + if (S_ISDIR(inode->i_mode)) { 321 + flags &= ~EXT4_INLINE_DATA_FL; 322 + oldflags &= ~EXT4_INLINE_DATA_FL; 323 + } 324 + 319 325 if (flags & EXT4_DAX_FL) { 320 326 if ((oldflags & EXT4_DAX_MUT_EXCL) || 321 327 ext4_test_inode_state(inode,
+562 -30
fs/ext4/mballoc.c
··· 127 127 * smallest multiple of the stripe value (sbi->s_stripe) which is 128 128 * greater than the default mb_group_prealloc. 129 129 * 130 + * If "mb_optimize_scan" mount option is set, we maintain in memory group info 131 + * structures in two data structures: 132 + * 133 + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) 134 + * 135 + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) 136 + * 137 + * This is an array of lists where the index in the array represents the 138 + * largest free order in the buddy bitmap of the participating group infos of 139 + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total 140 + * number of buddy bitmap orders possible) number of lists. Group-infos are 141 + * placed in appropriate lists. 142 + * 143 + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) 144 + * 145 + * Locking: sbi->s_mb_rb_lock (rwlock) 146 + * 147 + * This is a red black tree consisting of group infos and the tree is sorted 148 + * by average fragment sizes (which is calculated as ext4_group_info->bb_free 149 + * / ext4_group_info->bb_fragments). 150 + * 151 + * When "mb_optimize_scan" mount option is set, mballoc consults the above data 152 + * structures to decide the order in which groups are to be traversed for 153 + * fulfilling an allocation request. 154 + * 155 + * At CR = 0, we look for groups which have the largest_free_order >= the order 156 + * of the request. We directly look at the largest free order list in the data 157 + * structure (1) above where largest_free_order = order of the request. If that 158 + * list is empty, we look at remaining list in the increasing order of 159 + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. 160 + * 161 + * At CR = 1, we only consider groups where average fragment size > request 162 + * size. 
So, we lookup a group which has average fragment size just above or 163 + * equal to request size using our rb tree (data structure 2) in O(log N) time. 164 + * 165 + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 166 + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 167 + * 130 168 * The regular allocator (using the buddy cache) supports a few tunables. 131 169 * 132 170 * /sys/fs/ext4/<partition>/mb_min_to_scan 133 171 * /sys/fs/ext4/<partition>/mb_max_to_scan 134 172 * /sys/fs/ext4/<partition>/mb_order2_req 173 + * /sys/fs/ext4/<partition>/mb_linear_limit 135 174 * 136 175 * The regular allocator uses buddy scan only if the request len is power of 137 176 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The ··· 187 148 * ac_g_ex. Each group is first checked based on the criteria whether it 188 149 * can be used for allocation. ext4_mb_good_group explains how the groups are 189 150 * checked. 151 + * 152 + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not 153 + * get traversed linearly. That may result in subsequent allocations being not 154 + * close to each other. And so, the underlying device may get filled up in a 155 + * non-linear fashion. While that may not matter on non-rotational devices, for 156 + * rotational devices that may result in higher seek times. "mb_linear_limit" 157 + * tells mballoc how many groups mballoc should search linearly before 158 + * performing consulting above data structures for more efficient lookups. For 159 + * non rotational devices, this value defaults to 0 and for rotational devices 160 + * this is set to MB_DEFAULT_LINEAR_LIMIT. 190 161 * 191 162 * Both the prealloc space are getting populated as above. 
So for the first 192 163 * request we will hit the buddy cache which will result in this prealloc ··· 348 299 * - bitlock on a group (group) 349 300 * - object (inode/locality) (object) 350 301 * - per-pa lock (pa) 302 + * - cr0 lists lock (cr0) 303 + * - cr1 tree lock (cr1) 351 304 * 352 305 * Paths: 353 306 * - new pa ··· 379 328 * group 380 329 * object 381 330 * 331 + * - allocation path (ext4_mb_regular_allocator) 332 + * group 333 + * cr0/cr1 382 334 */ 383 335 static struct kmem_cache *ext4_pspace_cachep; 384 336 static struct kmem_cache *ext4_ac_cachep; ··· 404 350 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 405 351 ext4_group_t group); 406 352 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 353 + 354 + static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 355 + ext4_group_t group, int cr); 407 356 408 357 /* 409 358 * The algorithm using this percpu seq counter goes below: ··· 801 744 } 802 745 } 803 746 747 + static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, 748 + int (*cmp)(struct rb_node *, struct rb_node *)) 749 + { 750 + struct rb_node **iter = &root->rb_node, *parent = NULL; 751 + 752 + while (*iter) { 753 + parent = *iter; 754 + if (cmp(new, *iter) > 0) 755 + iter = &((*iter)->rb_left); 756 + else 757 + iter = &((*iter)->rb_right); 758 + } 759 + 760 + rb_link_node(new, parent, iter); 761 + rb_insert_color(new, root); 762 + } 763 + 764 + static int 765 + ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) 766 + { 767 + struct ext4_group_info *grp1 = rb_entry(rb1, 768 + struct ext4_group_info, 769 + bb_avg_fragment_size_rb); 770 + struct ext4_group_info *grp2 = rb_entry(rb2, 771 + struct ext4_group_info, 772 + bb_avg_fragment_size_rb); 773 + int num_frags_1, num_frags_2; 774 + 775 + num_frags_1 = grp1->bb_fragments ? 776 + grp1->bb_free / grp1->bb_fragments : 0; 777 + num_frags_2 = grp2->bb_fragments ? 
778 + grp2->bb_free / grp2->bb_fragments : 0; 779 + 780 + return (num_frags_2 - num_frags_1); 781 + } 782 + 783 + /* 784 + * Reinsert grpinfo into the avg_fragment_size tree with new average 785 + * fragment size. 786 + */ 787 + static void 788 + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 789 + { 790 + struct ext4_sb_info *sbi = EXT4_SB(sb); 791 + 792 + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 793 + return; 794 + 795 + write_lock(&sbi->s_mb_rb_lock); 796 + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { 797 + rb_erase(&grp->bb_avg_fragment_size_rb, 798 + &sbi->s_mb_avg_fragment_size_root); 799 + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); 800 + } 801 + 802 + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, 803 + &grp->bb_avg_fragment_size_rb, 804 + ext4_mb_avg_fragment_size_cmp); 805 + write_unlock(&sbi->s_mb_rb_lock); 806 + } 807 + 808 + /* 809 + * Choose next group by traversing largest_free_order lists. Updates *new_cr if 810 + * cr level needs an update. 
811 + */ 812 + static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, 813 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 814 + { 815 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 816 + struct ext4_group_info *iter, *grp; 817 + int i; 818 + 819 + if (ac->ac_status == AC_STATUS_FOUND) 820 + return; 821 + 822 + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) 823 + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); 824 + 825 + grp = NULL; 826 + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 827 + if (list_empty(&sbi->s_mb_largest_free_orders[i])) 828 + continue; 829 + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 830 + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 831 + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 832 + continue; 833 + } 834 + grp = NULL; 835 + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 836 + bb_largest_free_order_node) { 837 + if (sbi->s_mb_stats) 838 + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); 839 + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { 840 + grp = iter; 841 + break; 842 + } 843 + } 844 + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 845 + if (grp) 846 + break; 847 + } 848 + 849 + if (!grp) { 850 + /* Increment cr and search again */ 851 + *new_cr = 1; 852 + } else { 853 + *group = grp->bb_group; 854 + ac->ac_last_optimal_group = *group; 855 + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 856 + } 857 + } 858 + 859 + /* 860 + * Choose next group by traversing average fragment size tree. Updates *new_cr 861 + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that 862 + * the linear search should continue for one iteration since there's lock 863 + * contention on the rb tree lock. 
864 + */ 865 + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 866 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 867 + { 868 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 869 + int avg_fragment_size, best_so_far; 870 + struct rb_node *node, *found; 871 + struct ext4_group_info *grp; 872 + 873 + /* 874 + * If there is contention on the lock, instead of waiting for the lock 875 + * to become available, just continue searching lineraly. We'll resume 876 + * our rb tree search later starting at ac->ac_last_optimal_group. 877 + */ 878 + if (!read_trylock(&sbi->s_mb_rb_lock)) { 879 + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; 880 + return; 881 + } 882 + 883 + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 884 + if (sbi->s_mb_stats) 885 + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 886 + /* We have found something at CR 1 in the past */ 887 + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); 888 + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; 889 + found = rb_next(found)) { 890 + grp = rb_entry(found, struct ext4_group_info, 891 + bb_avg_fragment_size_rb); 892 + if (sbi->s_mb_stats) 893 + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 894 + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) 895 + break; 896 + } 897 + goto done; 898 + } 899 + 900 + node = sbi->s_mb_avg_fragment_size_root.rb_node; 901 + best_so_far = 0; 902 + found = NULL; 903 + 904 + while (node) { 905 + grp = rb_entry(node, struct ext4_group_info, 906 + bb_avg_fragment_size_rb); 907 + avg_fragment_size = 0; 908 + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { 909 + avg_fragment_size = grp->bb_fragments ? 
910 + grp->bb_free / grp->bb_fragments : 0; 911 + if (!best_so_far || avg_fragment_size < best_so_far) { 912 + best_so_far = avg_fragment_size; 913 + found = node; 914 + } 915 + } 916 + if (avg_fragment_size > ac->ac_g_ex.fe_len) 917 + node = node->rb_right; 918 + else 919 + node = node->rb_left; 920 + } 921 + 922 + done: 923 + if (found) { 924 + grp = rb_entry(found, struct ext4_group_info, 925 + bb_avg_fragment_size_rb); 926 + *group = grp->bb_group; 927 + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 928 + } else { 929 + *new_cr = 2; 930 + } 931 + 932 + read_unlock(&sbi->s_mb_rb_lock); 933 + ac->ac_last_optimal_group = *group; 934 + } 935 + 936 + static inline int should_optimize_scan(struct ext4_allocation_context *ac) 937 + { 938 + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) 939 + return 0; 940 + if (ac->ac_criteria >= 2) 941 + return 0; 942 + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) 943 + return 0; 944 + return 1; 945 + } 946 + 947 + /* 948 + * Return next linear group for allocation. If linear traversal should not be 949 + * performed, this function just returns the same group 950 + */ 951 + static int 952 + next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) 953 + { 954 + if (!should_optimize_scan(ac)) 955 + goto inc_and_return; 956 + 957 + if (ac->ac_groups_linear_remaining) { 958 + ac->ac_groups_linear_remaining--; 959 + goto inc_and_return; 960 + } 961 + 962 + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { 963 + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; 964 + goto inc_and_return; 965 + } 966 + 967 + return group; 968 + inc_and_return: 969 + /* 970 + * Artificially restricted ngroups for non-extent 971 + * files makes group > ngroups possible on first loop. 972 + */ 973 + return group + 1 >= ngroups ? 0 : group + 1; 974 + } 975 + 976 + /* 977 + * ext4_mb_choose_next_group: choose next group for allocation. 978 + * 979 + * @ac Allocation Context 980 + * @new_cr This is an output parameter. 
If the there is no good group 981 + * available at current CR level, this field is updated to indicate 982 + * the new cr level that should be used. 983 + * @group This is an input / output parameter. As an input it indicates the 984 + * next group that the allocator intends to use for allocation. As 985 + * output, this field indicates the next group that should be used as 986 + * determined by the optimization functions. 987 + * @ngroups Total number of groups 988 + */ 989 + static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 990 + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 991 + { 992 + *new_cr = ac->ac_criteria; 993 + 994 + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) 995 + return; 996 + 997 + if (*new_cr == 0) { 998 + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 999 + } else if (*new_cr == 1) { 1000 + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1001 + } else { 1002 + /* 1003 + * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1004 + * bb_free. But until that happens, we should never come here. 1005 + */ 1006 + WARN_ON(1); 1007 + } 1008 + } 1009 + 804 1010 /* 805 1011 * Cache the order of the largest free extent we have available in this block 806 1012 * group. 
··· 1071 751 static void 1072 752 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1073 753 { 754 + struct ext4_sb_info *sbi = EXT4_SB(sb); 1074 755 int i; 1075 - int bits; 1076 756 757 + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { 758 + write_lock(&sbi->s_mb_largest_free_orders_locks[ 759 + grp->bb_largest_free_order]); 760 + list_del_init(&grp->bb_largest_free_order_node); 761 + write_unlock(&sbi->s_mb_largest_free_orders_locks[ 762 + grp->bb_largest_free_order]); 763 + } 1077 764 grp->bb_largest_free_order = -1; /* uninit */ 1078 765 1079 - bits = sb->s_blocksize_bits + 1; 1080 - for (i = bits; i >= 0; i--) { 766 + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { 1081 767 if (grp->bb_counters[i] > 0) { 1082 768 grp->bb_largest_free_order = i; 1083 769 break; 1084 770 } 771 + } 772 + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && 773 + grp->bb_largest_free_order >= 0 && grp->bb_free) { 774 + write_lock(&sbi->s_mb_largest_free_orders_locks[ 775 + grp->bb_largest_free_order]); 776 + list_add_tail(&grp->bb_largest_free_order_node, 777 + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 778 + write_unlock(&sbi->s_mb_largest_free_orders_locks[ 779 + grp->bb_largest_free_order]); 1085 780 } 1086 781 } 1087 782 ··· 1151 816 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1152 817 1153 818 period = get_cycles() - period; 1154 - spin_lock(&sbi->s_bal_lock); 1155 - sbi->s_mb_buddies_generated++; 1156 - sbi->s_mb_generation_time += period; 1157 - spin_unlock(&sbi->s_bal_lock); 819 + atomic_inc(&sbi->s_mb_buddies_generated); 820 + atomic64_add(period, &sbi->s_mb_generation_time); 821 + mb_update_avg_fragment_size(sb, grp); 1158 822 } 1159 823 1160 824 /* The buddy information is attached the buddy cache inode ··· 1293 959 grinfo->bb_fragments = 0; 1294 960 memset(grinfo->bb_counters, 0, 1295 961 sizeof(*grinfo->bb_counters) * 1296 - (sb->s_blocksize_bits+2)); 962 + (MB_NUM_ORDERS(sb))); 1297 963 /* 1298 
964 * incore got set to the group block bitmap below 1299 965 */ ··· 1853 1519 1854 1520 done: 1855 1521 mb_set_largest_free_order(sb, e4b->bd_info); 1522 + mb_update_avg_fragment_size(sb, e4b->bd_info); 1856 1523 mb_check_buddy(e4b); 1857 1524 } 1858 1525 ··· 1990 1655 } 1991 1656 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1992 1657 1658 + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 1993 1659 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1994 1660 mb_check_buddy(e4b); 1995 1661 ··· 2266 1930 int max; 2267 1931 2268 1932 BUG_ON(ac->ac_2order <= 0); 2269 - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1933 + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { 2270 1934 if (grp->bb_counters[i] == 0) 2271 1935 continue; 2272 1936 ··· 2445 2109 if (free < ac->ac_g_ex.fe_len) 2446 2110 return false; 2447 2111 2448 - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) 2112 + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) 2449 2113 return true; 2450 2114 2451 2115 if (grp->bb_largest_free_order < ac->ac_2order) ··· 2484 2148 ext4_grpblk_t free; 2485 2149 int ret = 0; 2486 2150 2151 + if (sbi->s_mb_stats) 2152 + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); 2487 2153 if (should_lock) 2488 2154 ext4_lock_group(sb, group); 2489 2155 free = grp->bb_free; ··· 2653 2315 * We also support searching for power-of-two requests only for 2654 2316 * requests upto maximum buddy size we have constructed. 
2655 2317 */ 2656 - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { 2318 + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { 2657 2319 /* 2658 2320 * This should tell if fe_len is exactly power of 2 2659 2321 */ 2660 2322 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2661 2323 ac->ac_2order = array_index_nospec(i - 1, 2662 - sb->s_blocksize_bits + 2); 2324 + MB_NUM_ORDERS(sb)); 2663 2325 } 2664 2326 2665 2327 /* if stream allocation is enabled, use global goal */ ··· 2685 2347 * from the goal value specified 2686 2348 */ 2687 2349 group = ac->ac_g_ex.fe_group; 2350 + ac->ac_last_optimal_group = group; 2351 + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2688 2352 prefetch_grp = group; 2689 2353 2690 - for (i = 0; i < ngroups; group++, i++) { 2691 - int ret = 0; 2354 + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), 2355 + i++) { 2356 + int ret = 0, new_cr; 2357 + 2692 2358 cond_resched(); 2693 - /* 2694 - * Artificially restricted ngroups for non-extent 2695 - * files makes group > ngroups possible on first loop. 
2696 - */ 2697 - if (group >= ngroups) 2698 - group = 0; 2359 + 2360 + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); 2361 + if (new_cr != cr) { 2362 + cr = new_cr; 2363 + goto repeat; 2364 + } 2699 2365 2700 2366 /* 2701 2367 * Batch reads of the block allocation bitmaps ··· 2764 2422 if (ac->ac_status != AC_STATUS_CONTINUE) 2765 2423 break; 2766 2424 } 2425 + /* Processed all groups and haven't found blocks */ 2426 + if (sbi->s_mb_stats && i == ngroups) 2427 + atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2767 2428 } 2768 2429 2769 2430 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && ··· 2796 2451 goto repeat; 2797 2452 } 2798 2453 } 2454 + 2455 + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2456 + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2799 2457 out: 2800 2458 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 2801 2459 err = first_err; ··· 2898 2550 .show = ext4_mb_seq_groups_show, 2899 2551 }; 2900 2552 2553 + int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 2554 + { 2555 + struct super_block *sb = (struct super_block *)seq->private; 2556 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2557 + 2558 + seq_puts(seq, "mballoc:\n"); 2559 + if (!sbi->s_mb_stats) { 2560 + seq_puts(seq, "\tmb stats collection turned off.\n"); 2561 + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 2562 + return 0; 2563 + } 2564 + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 2565 + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 2566 + 2567 + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); 2568 + 2569 + seq_puts(seq, "\tcr0_stats:\n"); 2570 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); 2571 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2572 + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); 2573 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2574 + 
atomic64_read(&sbi->s_bal_cX_failed[0])); 2575 + seq_printf(seq, "\t\tbad_suggestions: %u\n", 2576 + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); 2577 + 2578 + seq_puts(seq, "\tcr1_stats:\n"); 2579 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); 2580 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2581 + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); 2582 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2583 + atomic64_read(&sbi->s_bal_cX_failed[1])); 2584 + seq_printf(seq, "\t\tbad_suggestions: %u\n", 2585 + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2586 + 2587 + seq_puts(seq, "\tcr2_stats:\n"); 2588 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); 2589 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2590 + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); 2591 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2592 + atomic64_read(&sbi->s_bal_cX_failed[2])); 2593 + 2594 + seq_puts(seq, "\tcr3_stats:\n"); 2595 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); 2596 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2597 + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); 2598 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2599 + atomic64_read(&sbi->s_bal_cX_failed[3])); 2600 + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); 2601 + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 2602 + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 2603 + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 2604 + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 2605 + 2606 + seq_printf(seq, "\tbuddies_generated: %u/%u\n", 2607 + atomic_read(&sbi->s_mb_buddies_generated), 2608 + ext4_get_groups_count(sb)); 2609 + seq_printf(seq, "\tbuddies_time_used: %llu\n", 2610 + atomic64_read(&sbi->s_mb_generation_time)); 2611 + seq_printf(seq, "\tpreallocated: %u\n", 
2612 + atomic_read(&sbi->s_mb_preallocated)); 2613 + seq_printf(seq, "\tdiscarded: %u\n", 2614 + atomic_read(&sbi->s_mb_discarded)); 2615 + return 0; 2616 + } 2617 + 2618 + static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 2619 + { 2620 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2621 + unsigned long position; 2622 + 2623 + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); 2624 + 2625 + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 2626 + return NULL; 2627 + position = *pos + 1; 2628 + return (void *) ((unsigned long) position); 2629 + } 2630 + 2631 + static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 2632 + { 2633 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2634 + unsigned long position; 2635 + 2636 + ++*pos; 2637 + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 2638 + return NULL; 2639 + position = *pos + 1; 2640 + return (void *) ((unsigned long) position); 2641 + } 2642 + 2643 + static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 2644 + { 2645 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2646 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2647 + unsigned long position = ((unsigned long) v); 2648 + struct ext4_group_info *grp; 2649 + struct rb_node *n; 2650 + unsigned int count, min, max; 2651 + 2652 + position--; 2653 + if (position >= MB_NUM_ORDERS(sb)) { 2654 + seq_puts(seq, "fragment_size_tree:\n"); 2655 + n = rb_first(&sbi->s_mb_avg_fragment_size_root); 2656 + if (!n) { 2657 + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); 2658 + return 0; 2659 + } 2660 + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2661 + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; 2662 + count = 1; 2663 + while (rb_next(n)) { 2664 + count++; 2665 + n = rb_next(n); 2666 + } 2667 + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2668 + max = grp->bb_fragments ? 
grp->bb_free / grp->bb_fragments : 0; 2669 + 2670 + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", 2671 + min, max, count); 2672 + return 0; 2673 + } 2674 + 2675 + if (position == 0) { 2676 + seq_printf(seq, "optimize_scan: %d\n", 2677 + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); 2678 + seq_puts(seq, "max_free_order_lists:\n"); 2679 + } 2680 + count = 0; 2681 + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 2682 + bb_largest_free_order_node) 2683 + count++; 2684 + seq_printf(seq, "\tlist_order_%u_groups: %u\n", 2685 + (unsigned int)position, count); 2686 + 2687 + return 0; 2688 + } 2689 + 2690 + static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 2691 + { 2692 + struct super_block *sb = PDE_DATA(file_inode(seq->file)); 2693 + 2694 + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); 2695 + } 2696 + 2697 + const struct seq_operations ext4_mb_seq_structs_summary_ops = { 2698 + .start = ext4_mb_seq_structs_summary_start, 2699 + .next = ext4_mb_seq_structs_summary_next, 2700 + .stop = ext4_mb_seq_structs_summary_stop, 2701 + .show = ext4_mb_seq_structs_summary_show, 2702 + }; 2703 + 2901 2704 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 2902 2705 { 2903 2706 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; ··· 3089 2590 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 3090 2591 if (old_groupinfo) 3091 2592 ext4_kvfree_array_rcu(old_groupinfo); 3092 - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 2593 + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 3093 2594 sbi->s_group_info_size); 3094 2595 return 0; 3095 2596 } ··· 3151 2652 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3152 2653 init_rwsem(&meta_group_info[i]->alloc_sem); 3153 2654 meta_group_info[i]->bb_free_root = RB_ROOT; 2655 + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 2656 + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); 3154 2657 
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 2658 + meta_group_info[i]->bb_group = group; 3155 2659 3156 2660 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); 3157 2661 return 0; ··· 3315 2813 unsigned max; 3316 2814 int ret; 3317 2815 3318 - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2816 + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); 3319 2817 3320 2818 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 3321 2819 if (sbi->s_mb_offsets == NULL) { ··· 3323 2821 goto out; 3324 2822 } 3325 2823 3326 - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2824 + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); 3327 2825 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 3328 2826 if (sbi->s_mb_maxs == NULL) { 3329 2827 ret = -ENOMEM; ··· 3349 2847 offset_incr = offset_incr >> 1; 3350 2848 max = max >> 1; 3351 2849 i++; 3352 - } while (i <= sb->s_blocksize_bits + 1); 2850 + } while (i < MB_NUM_ORDERS(sb)); 2851 + 2852 + sbi->s_mb_avg_fragment_size_root = RB_ROOT; 2853 + sbi->s_mb_largest_free_orders = 2854 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 2855 + GFP_KERNEL); 2856 + if (!sbi->s_mb_largest_free_orders) { 2857 + ret = -ENOMEM; 2858 + goto out; 2859 + } 2860 + sbi->s_mb_largest_free_orders_locks = 2861 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 2862 + GFP_KERNEL); 2863 + if (!sbi->s_mb_largest_free_orders_locks) { 2864 + ret = -ENOMEM; 2865 + goto out; 2866 + } 2867 + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 2868 + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 2869 + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 2870 + } 2871 + rwlock_init(&sbi->s_mb_rb_lock); 3353 2872 3354 2873 spin_lock_init(&sbi->s_md_lock); 3355 - spin_lock_init(&sbi->s_bal_lock); 3356 2874 sbi->s_mb_free_pending = 0; 3357 2875 INIT_LIST_HEAD(&sbi->s_freed_data_list); 3358 2876 ··· 3423 2901 spin_lock_init(&lg->lg_prealloc_lock); 3424 2902 } 3425 2903 2904 + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) 2905 + 
sbi->s_mb_max_linear_groups = 0; 2906 + else 2907 + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; 3426 2908 /* init file for buddy data */ 3427 2909 ret = ext4_mb_init_backend(sb); 3428 2910 if (ret != 0) ··· 3438 2912 free_percpu(sbi->s_locality_groups); 3439 2913 sbi->s_locality_groups = NULL; 3440 2914 out: 2915 + kfree(sbi->s_mb_largest_free_orders); 2916 + kfree(sbi->s_mb_largest_free_orders_locks); 3441 2917 kfree(sbi->s_mb_offsets); 3442 2918 sbi->s_mb_offsets = NULL; 3443 2919 kfree(sbi->s_mb_maxs); ··· 3496 2968 kvfree(group_info); 3497 2969 rcu_read_unlock(); 3498 2970 } 2971 + kfree(sbi->s_mb_largest_free_orders); 2972 + kfree(sbi->s_mb_largest_free_orders_locks); 3499 2973 kfree(sbi->s_mb_offsets); 3500 2974 kfree(sbi->s_mb_maxs); 3501 2975 iput(sbi->s_buddy_cache); ··· 3508 2978 atomic_read(&sbi->s_bal_reqs), 3509 2979 atomic_read(&sbi->s_bal_success)); 3510 2980 ext4_msg(sb, KERN_INFO, 3511 - "mballoc: %u extents scanned, %u goal hits, " 2981 + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " 3512 2982 "%u 2^N hits, %u breaks, %u lost", 3513 2983 atomic_read(&sbi->s_bal_ex_scanned), 2984 + atomic_read(&sbi->s_bal_groups_scanned), 3514 2985 atomic_read(&sbi->s_bal_goals), 3515 2986 atomic_read(&sbi->s_bal_2orders), 3516 2987 atomic_read(&sbi->s_bal_breaks), 3517 2988 atomic_read(&sbi->s_mb_lost_chunks)); 3518 2989 ext4_msg(sb, KERN_INFO, 3519 - "mballoc: %lu generated and it took %Lu", 3520 - sbi->s_mb_buddies_generated, 3521 - sbi->s_mb_generation_time); 2990 + "mballoc: %u generated and it took %llu", 2991 + atomic_read(&sbi->s_mb_buddies_generated), 2992 + atomic64_read(&sbi->s_mb_generation_time)); 3522 2993 ext4_msg(sb, KERN_INFO, 3523 2994 "mballoc: %u preallocated, %u discarded", 3524 2995 atomic_read(&sbi->s_mb_preallocated), ··· 4114 3583 { 4115 3584 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4116 3585 4117 - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3586 + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { 
4118 3587 atomic_inc(&sbi->s_bal_reqs); 4119 3588 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 4120 3589 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 4121 3590 atomic_inc(&sbi->s_bal_success); 4122 3591 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3592 + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); 4123 3593 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4124 3594 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4125 3595 atomic_inc(&sbi->s_bal_goals);
+22 -2
fs/ext4/mballoc.h
··· 59 59 * by the stream allocator, which purpose is to pack requests 60 60 * as close each to other as possible to produce smooth I/O traffic 61 61 * We use locality group prealloc space for stream request. 62 - * We can tune the same via /proc/fs/ext4/<parition>/stream_req 62 + * We can tune the same via /proc/fs/ext4/<partition>/stream_req 63 63 */ 64 64 #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ 65 65 ··· 77 77 * maximum length of inode prealloc list 78 78 */ 79 79 #define MB_DEFAULT_MAX_INODE_PREALLOC 512 80 + 81 + /* 82 + * Number of groups to search linearly before performing group scanning 83 + * optimization. 84 + */ 85 + #define MB_DEFAULT_LINEAR_LIMIT 4 86 + 87 + /* 88 + * Minimum number of groups that should be present in the file system to perform 89 + * group scanning optimizations. 90 + */ 91 + #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 92 + 93 + /* 94 + * Number of valid buddy orders 95 + */ 96 + #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) 80 97 81 98 struct ext4_free_data { 82 99 /* this links the free block information from sb_info */ ··· 178 161 /* copy of the best found extent taken before preallocation efforts */ 179 162 struct ext4_free_extent ac_f_ex; 180 163 164 + ext4_group_t ac_last_optimal_group; 165 + __u32 ac_groups_considered; 166 + __u32 ac_flags; /* allocation hints */ 181 167 __u16 ac_groups_scanned; 168 + __u16 ac_groups_linear_remaining; 182 169 __u16 ac_found; 183 170 __u16 ac_tail; 184 171 __u16 ac_buddy; 185 - __u16 ac_flags; /* allocation hints */ 186 172 __u8 ac_status; 187 173 __u8 ac_criteria; 188 174 __u8 ac_2order; /* if request is to allocate 2^N blocks and
+3 -3
fs/ext4/migrate.c
··· 32 32 newext.ee_block = cpu_to_le32(lb->first_block); 33 33 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); 34 34 ext4_ext_store_pblock(&newext, lb->first_pblock); 35 - /* Locking only for convinience since we are operating on temp inode */ 35 + /* Locking only for convenience since we are operating on temp inode */ 36 36 down_write(&EXT4_I(inode)->i_data_sem); 37 37 path = ext4_find_extent(inode, lb->first_block, NULL, 0); 38 38 if (IS_ERR(path)) { ··· 43 43 44 44 /* 45 45 * Calculate the credit needed to inserting this extent 46 - * Since we are doing this in loop we may accumalate extra 47 - * credit. But below we try to not accumalate too much 46 + * Since we are doing this in loop we may accumulate extra 47 + * credit. But below we try to not accumulate too much 48 48 * of them by restarting the journal. 49 49 */ 50 50 needed = ext4_ext_calc_credits_for_single_extent(inode,
+1 -1
fs/ext4/mmp.c
··· 56 56 wait_on_buffer(bh); 57 57 sb_end_write(sb); 58 58 if (unlikely(!buffer_uptodate(bh))) 59 - return 1; 59 + return -EIO; 60 60 61 61 return 0; 62 62 }
+182 -65
fs/ext4/namei.c
··· 280 280 unsigned blocksize, struct dx_hash_info *hinfo, 281 281 struct dx_map_entry map[]); 282 282 static void dx_sort_map(struct dx_map_entry *map, unsigned count); 283 - static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 284 - struct dx_map_entry *offsets, int count, unsigned blocksize); 285 - static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); 283 + static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, 284 + char *to, struct dx_map_entry *offsets, 285 + int count, unsigned int blocksize); 286 + static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, 287 + unsigned int blocksize); 286 288 static void dx_insert_block(struct dx_frame *frame, 287 289 u32 hash, ext4_lblk_t block); 288 290 static int ext4_htree_next_block(struct inode *dir, __u32 hash, ··· 576 574 577 575 static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) 578 576 { 579 - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 580 - EXT4_DIR_REC_LEN(2) - infosize; 577 + unsigned int entry_space = dir->i_sb->s_blocksize - 578 + ext4_dir_rec_len(1, NULL) - 579 + ext4_dir_rec_len(2, NULL) - infosize; 581 580 582 581 if (ext4_has_metadata_csum(dir->i_sb)) 583 582 entry_space -= sizeof(struct dx_tail); ··· 587 584 588 585 static inline unsigned dx_node_limit(struct inode *dir) 589 586 { 590 - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 587 + unsigned int entry_space = dir->i_sb->s_blocksize - 588 + ext4_dir_rec_len(0, dir); 591 589 592 590 if (ext4_has_metadata_csum(dir->i_sb)) 593 591 entry_space -= sizeof(struct dx_tail); ··· 677 673 name = fname_crypto_str.name; 678 674 len = fname_crypto_str.len; 679 675 } 680 - ext4fs_dirhash(dir, de->name, 676 + if (IS_CASEFOLDED(dir)) 677 + h.hash = EXT4_DIRENT_HASH(de); 678 + else 679 + ext4fs_dirhash(dir, de->name, 681 680 de->name_len, &h); 682 681 printk("%*.s:(E)%x.%u ", len, name, 683 682 h.hash, 
(unsigned) ((char *) de ··· 696 689 (unsigned) ((char *) de - base)); 697 690 #endif 698 691 } 699 - space += EXT4_DIR_REC_LEN(de->name_len); 692 + space += ext4_dir_rec_len(de->name_len, dir); 700 693 names++; 701 694 } 702 695 de = ext4_next_entry(de, size); ··· 791 784 root = (struct dx_root *) frame->bh->b_data; 792 785 if (root->info.hash_version != DX_HASH_TEA && 793 786 root->info.hash_version != DX_HASH_HALF_MD4 && 794 - root->info.hash_version != DX_HASH_LEGACY) { 787 + root->info.hash_version != DX_HASH_LEGACY && 788 + root->info.hash_version != DX_HASH_SIPHASH) { 795 789 ext4_warning_inode(dir, "Unrecognised inode hash code %u", 796 790 root->info.hash_version); 797 791 goto fail; 792 + } 793 + if (ext4_hash_in_dirent(dir)) { 794 + if (root->info.hash_version != DX_HASH_SIPHASH) { 795 + ext4_warning_inode(dir, 796 + "Hash in dirent, but hash is not SIPHASH"); 797 + goto fail; 798 + } 799 + } else { 800 + if (root->info.hash_version == DX_HASH_SIPHASH) { 801 + ext4_warning_inode(dir, 802 + "Hash code is SIPHASH, but hash not in dirent"); 803 + goto fail; 804 + } 798 805 } 799 806 if (fname) 800 807 hinfo = &fname->hinfo; ··· 816 795 if (hinfo->hash_version <= DX_HASH_TEA) 817 796 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 818 797 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 819 - if (fname && fname_name(fname)) 798 + /* hash is already computed for encrypted casefolded directory */ 799 + if (fname && fname_name(fname) && 800 + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) 820 801 ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); 821 802 hash = hinfo->hash; 822 803 ··· 979 956 * If the hash is 1, then continue only if the next page has a 980 957 * continuation hash of any value. This is used for readdir 981 958 * handling. Otherwise, check to see if the hash matches the 982 - * desired contiuation hash. If it doesn't, return since 959 + * desired continuation hash. 
If it doesn't, return since 983 960 * there's no point to read in the successive index pages. 984 961 */ 985 962 bhash = dx_get_hash(p->at); ··· 1020 997 struct ext4_dir_entry_2 *de, *top; 1021 998 int err = 0, count = 0; 1022 999 struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; 1000 + int csum = ext4_has_metadata_csum(dir->i_sb); 1023 1001 1024 1002 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 1025 1003 (unsigned long)block)); ··· 1029 1005 return PTR_ERR(bh); 1030 1006 1031 1007 de = (struct ext4_dir_entry_2 *) bh->b_data; 1008 + /* csum entries are not larger in the casefolded encrypted case */ 1032 1009 top = (struct ext4_dir_entry_2 *) ((char *) de + 1033 1010 dir->i_sb->s_blocksize - 1034 - EXT4_DIR_REC_LEN(0)); 1011 + ext4_dir_rec_len(0, 1012 + csum ? NULL : dir)); 1035 1013 /* Check if the directory is encrypted */ 1036 1014 if (IS_ENCRYPTED(dir)) { 1037 1015 err = fscrypt_prepare_readdir(dir); ··· 1057 1031 /* silently ignore the rest of the block */ 1058 1032 break; 1059 1033 } 1060 - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1034 + if (ext4_hash_in_dirent(dir)) { 1035 + if (de->name_len && de->inode) { 1036 + hinfo->hash = EXT4_DIRENT_HASH(de); 1037 + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); 1038 + } else { 1039 + hinfo->hash = 0; 1040 + hinfo->minor_hash = 0; 1041 + } 1042 + } else { 1043 + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); 1044 + } 1061 1045 if ((hinfo->hash < start_hash) || 1062 1046 ((hinfo->hash == start_hash) && 1063 1047 (hinfo->minor_hash < start_minor_hash))) ··· 1136 1100 start_hash, start_minor_hash)); 1137 1101 dir = file_inode(dir_file); 1138 1102 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { 1139 - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1103 + if (ext4_hash_in_dirent(dir)) 1104 + hinfo.hash_version = DX_HASH_SIPHASH; 1105 + else 1106 + hinfo.hash_version = 1107 + EXT4_SB(dir->i_sb)->s_def_hash_version; 1140 1108 if (hinfo.hash_version <= 
DX_HASH_TEA) 1141 1109 hinfo.hash_version += 1142 1110 EXT4_SB(dir->i_sb)->s_hash_unsigned; ··· 1258 1218 1259 1219 while ((char *) de < base + blocksize) { 1260 1220 if (de->name_len && de->inode) { 1261 - ext4fs_dirhash(dir, de->name, de->name_len, &h); 1221 + if (ext4_hash_in_dirent(dir)) 1222 + h.hash = EXT4_DIRENT_HASH(de); 1223 + else 1224 + ext4fs_dirhash(dir, de->name, de->name_len, &h); 1262 1225 map_tail--; 1263 1226 map_tail->hash = h.hash; 1264 1227 map_tail->offs = ((char *) de - base)>>2; ··· 1325 1282 * Returns: 0 if the directory entry matches, more than 0 if it 1326 1283 * doesn't match or less than zero on error. 1327 1284 */ 1328 - int ext4_ci_compare(const struct inode *parent, const struct qstr *name, 1329 - const struct qstr *entry, bool quick) 1285 + static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, 1286 + u8 *de_name, size_t de_name_len, bool quick) 1330 1287 { 1331 1288 const struct super_block *sb = parent->i_sb; 1332 1289 const struct unicode_map *um = sb->s_encoding; 1290 + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); 1291 + struct qstr entry = QSTR_INIT(de_name, de_name_len); 1333 1292 int ret; 1334 1293 1335 - if (quick) 1336 - ret = utf8_strncasecmp_folded(um, name, entry); 1337 - else 1338 - ret = utf8_strncasecmp(um, name, entry); 1294 + if (IS_ENCRYPTED(parent)) { 1295 + const struct fscrypt_str encrypted_name = 1296 + FSTR_INIT(de_name, de_name_len); 1339 1297 1298 + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); 1299 + if (!decrypted_name.name) 1300 + return -ENOMEM; 1301 + ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, 1302 + &decrypted_name); 1303 + if (ret < 0) 1304 + goto out; 1305 + entry.name = decrypted_name.name; 1306 + entry.len = decrypted_name.len; 1307 + } 1308 + 1309 + if (quick) 1310 + ret = utf8_strncasecmp_folded(um, name, &entry); 1311 + else 1312 + ret = utf8_strncasecmp(um, name, &entry); 1340 1313 if (ret < 0) { 1341 1314 /* Handle 
invalid character sequence as either an error 1342 1315 * or as an opaque byte sequence. 1343 1316 */ 1344 1317 if (sb_has_strict_encoding(sb)) 1345 - return -EINVAL; 1346 - 1347 - if (name->len != entry->len) 1348 - return 1; 1349 - 1350 - return !!memcmp(name->name, entry->name, name->len); 1318 + ret = -EINVAL; 1319 + else if (name->len != entry.len) 1320 + ret = 1; 1321 + else 1322 + ret = !!memcmp(name->name, entry.name, entry.len); 1351 1323 } 1352 - 1324 + out: 1325 + kfree(decrypted_name.name); 1353 1326 return ret; 1354 1327 } 1355 1328 1356 - void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, 1357 - struct fscrypt_str *cf_name) 1329 + int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, 1330 + struct ext4_filename *name) 1358 1331 { 1332 + struct fscrypt_str *cf_name = &name->cf_name; 1333 + struct dx_hash_info *hinfo = &name->hinfo; 1359 1334 int len; 1360 1335 1361 1336 if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { 1362 1337 cf_name->name = NULL; 1363 - return; 1338 + return 0; 1364 1339 } 1365 1340 1366 1341 cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); 1367 1342 if (!cf_name->name) 1368 - return; 1343 + return -ENOMEM; 1369 1344 1370 1345 len = utf8_casefold(dir->i_sb->s_encoding, 1371 1346 iname, cf_name->name, ··· 1391 1330 if (len <= 0) { 1392 1331 kfree(cf_name->name); 1393 1332 cf_name->name = NULL; 1394 - return; 1395 1333 } 1396 1334 cf_name->len = (unsigned) len; 1335 + if (!IS_ENCRYPTED(dir)) 1336 + return 0; 1397 1337 1338 + hinfo->hash_version = DX_HASH_SIPHASH; 1339 + hinfo->seed = NULL; 1340 + if (cf_name->name) 1341 + ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); 1342 + else 1343 + ext4fs_dirhash(dir, iname->name, iname->len, hinfo); 1344 + return 0; 1398 1345 } 1399 1346 #endif 1400 1347 ··· 1411 1342 * 1412 1343 * Return: %true if the directory entry matches, otherwise %false. 
1413 1344 */ 1414 - static inline bool ext4_match(const struct inode *parent, 1345 + static bool ext4_match(struct inode *parent, 1415 1346 const struct ext4_filename *fname, 1416 - const struct ext4_dir_entry_2 *de) 1347 + struct ext4_dir_entry_2 *de) 1417 1348 { 1418 1349 struct fscrypt_name f; 1419 - #ifdef CONFIG_UNICODE 1420 - const struct qstr entry = {.name = de->name, .len = de->name_len}; 1421 - #endif 1422 1350 1423 1351 if (!de->inode) 1424 1352 return false; ··· 1431 1365 if (fname->cf_name.name) { 1432 1366 struct qstr cf = {.name = fname->cf_name.name, 1433 1367 .len = fname->cf_name.len}; 1434 - return !ext4_ci_compare(parent, &cf, &entry, true); 1368 + if (IS_ENCRYPTED(parent)) { 1369 + if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || 1370 + fname->hinfo.minor_hash != 1371 + EXT4_DIRENT_MINOR_HASH(de)) { 1372 + 1373 + return 0; 1374 + } 1375 + } 1376 + return !ext4_ci_compare(parent, &cf, de->name, 1377 + de->name_len, true); 1435 1378 } 1436 - return !ext4_ci_compare(parent, fname->usr_fname, &entry, 1437 - false); 1379 + return !ext4_ci_compare(parent, fname->usr_fname, de->name, 1380 + de->name_len, false); 1438 1381 } 1439 1382 #endif 1440 1383 ··· 1840 1765 * Returns pointer to last entry moved. 
1841 1766 */ 1842 1767 static struct ext4_dir_entry_2 * 1843 - dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, 1768 + dx_move_dirents(struct inode *dir, char *from, char *to, 1769 + struct dx_map_entry *map, int count, 1844 1770 unsigned blocksize) 1845 1771 { 1846 1772 unsigned rec_len = 0; ··· 1849 1773 while (count--) { 1850 1774 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1851 1775 (from + (map->offs<<2)); 1852 - rec_len = EXT4_DIR_REC_LEN(de->name_len); 1776 + rec_len = ext4_dir_rec_len(de->name_len, dir); 1777 + 1853 1778 memcpy (to, de, rec_len); 1854 1779 ((struct ext4_dir_entry_2 *) to)->rec_len = 1855 1780 ext4_rec_len_to_disk(rec_len, blocksize); 1781 + 1782 + /* wipe dir_entry excluding the rec_len field */ 1856 1783 de->inode = 0; 1784 + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, 1785 + blocksize) - 1786 + offsetof(struct ext4_dir_entry_2, 1787 + name_len)); 1788 + 1857 1789 map++; 1858 1790 to += rec_len; 1859 1791 } ··· 1872 1788 * Compact each dir entry in the range to the minimal rec_len. 1873 1789 * Returns pointer to last entry in range. 
1874 1790 */ 1875 - static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) 1791 + static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, 1792 + unsigned int blocksize) 1876 1793 { 1877 1794 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1878 1795 unsigned rec_len = 0; ··· 1882 1797 while ((char*)de < base + blocksize) { 1883 1798 next = ext4_next_entry(de, blocksize); 1884 1799 if (de->inode && de->name_len) { 1885 - rec_len = EXT4_DIR_REC_LEN(de->name_len); 1800 + rec_len = ext4_dir_rec_len(de->name_len, dir); 1886 1801 if (de > to) 1887 1802 memmove(to, de, rec_len); 1888 1803 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); ··· 1972 1887 hash2, split, count-split)); 1973 1888 1974 1889 /* Fancy dance to stay within two buffers */ 1975 - de2 = dx_move_dirents(data1, data2, map + split, count - split, 1890 + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, 1976 1891 blocksize); 1977 - de = dx_pack_dirents(data1, blocksize); 1892 + de = dx_pack_dirents(dir, data1, blocksize); 1978 1893 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - 1979 1894 (char *) de, 1980 1895 blocksize); ··· 2022 1937 struct ext4_dir_entry_2 **dest_de) 2023 1938 { 2024 1939 struct ext4_dir_entry_2 *de; 2025 - unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); 1940 + unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); 2026 1941 int nlen, rlen; 2027 1942 unsigned int offset = 0; 2028 1943 char *top; ··· 2035 1950 return -EFSCORRUPTED; 2036 1951 if (ext4_match(dir, fname, de)) 2037 1952 return -EEXIST; 2038 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1953 + nlen = ext4_dir_rec_len(de->name_len, dir); 2039 1954 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 2040 1955 if ((de->inode ? 
rlen - nlen : rlen) >= reclen) 2041 1956 break; ··· 2049 1964 return 0; 2050 1965 } 2051 1966 2052 - void ext4_insert_dentry(struct inode *inode, 1967 + void ext4_insert_dentry(struct inode *dir, 1968 + struct inode *inode, 2053 1969 struct ext4_dir_entry_2 *de, 2054 1970 int buf_size, 2055 1971 struct ext4_filename *fname) ··· 2058 1972 2059 1973 int nlen, rlen; 2060 1974 2061 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1975 + nlen = ext4_dir_rec_len(de->name_len, dir); 2062 1976 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 2063 1977 if (de->inode) { 2064 1978 struct ext4_dir_entry_2 *de1 = ··· 2072 1986 ext4_set_de_type(inode->i_sb, de, inode->i_mode); 2073 1987 de->name_len = fname_len(fname); 2074 1988 memcpy(de->name, fname_name(fname), fname_len(fname)); 1989 + if (ext4_hash_in_dirent(dir)) { 1990 + struct dx_hash_info *hinfo = &fname->hinfo; 1991 + 1992 + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); 1993 + EXT4_DIRENT_HASHES(de)->minor_hash = 1994 + cpu_to_le32(hinfo->minor_hash); 1995 + } 2075 1996 } 2076 1997 2077 1998 /* ··· 2115 2022 } 2116 2023 2117 2024 /* By now the buffer is marked for journaling */ 2118 - ext4_insert_dentry(inode, de, blocksize, fname); 2025 + ext4_insert_dentry(dir, inode, de, blocksize, fname); 2119 2026 2120 2027 /* 2121 2028 * XXX shouldn't update any times until successful ··· 2195 2102 data2 = bh2->b_data; 2196 2103 2197 2104 memcpy(data2, de, len); 2105 + memset(de, 0, len); /* wipe old data */ 2198 2106 de = (struct ext4_dir_entry_2 *) data2; 2199 2107 top = data2 + len; 2200 2108 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) ··· 2208 2114 2209 2115 /* Initialize the root; the dot dirents already exist */ 2210 2116 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 2211 - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), 2212 - blocksize); 2117 + de->rec_len = ext4_rec_len_to_disk( 2118 + blocksize - ext4_dir_rec_len(2, NULL), blocksize); 2213 2119 memset (&root->info, 0, 
sizeof(root->info)); 2214 2120 root->info.info_length = sizeof(root->info); 2215 - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 2121 + if (ext4_hash_in_dirent(dir)) 2122 + root->info.hash_version = DX_HASH_SIPHASH; 2123 + else 2124 + root->info.hash_version = 2125 + EXT4_SB(dir->i_sb)->s_def_hash_version; 2126 + 2216 2127 entries = root->entries; 2217 2128 dx_set_block(entries, 1); 2218 2129 dx_set_count(entries, 1); ··· 2228 2129 if (fname->hinfo.hash_version <= DX_HASH_TEA) 2229 2130 fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 2230 2131 fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 2231 - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); 2132 + 2133 + /* casefolded encrypted hashes are computed on fname setup */ 2134 + if (!ext4_hash_in_dirent(dir)) 2135 + ext4fs_dirhash(dir, fname_name(fname), 2136 + fname_len(fname), &fname->hinfo); 2232 2137 2233 2138 memset(frames, 0, sizeof(frames)); 2234 2139 frame = frames; ··· 2242 2139 2243 2140 retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 2244 2141 if (retval) 2245 - goto out_frames; 2142 + goto out_frames; 2246 2143 retval = ext4_handle_dirty_dirblock(handle, dir, bh2); 2247 2144 if (retval) 2248 - goto out_frames; 2145 + goto out_frames; 2249 2146 2250 2147 de = do_split(handle,dir, &bh2, frame, &fname->hinfo); 2251 2148 if (IS_ERR(de)) { ··· 2585 2482 entry_buf, buf_size, i)) 2586 2483 return -EFSCORRUPTED; 2587 2484 if (de == de_del) { 2588 - if (pde) 2485 + if (pde) { 2589 2486 pde->rec_len = ext4_rec_len_to_disk( 2590 2487 ext4_rec_len_from_disk(pde->rec_len, 2591 2488 blocksize) + 2592 2489 ext4_rec_len_from_disk(de->rec_len, 2593 2490 blocksize), 2594 2491 blocksize); 2595 - else 2492 + 2493 + /* wipe entire dir_entry */ 2494 + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, 2495 + blocksize)); 2496 + } else { 2497 + /* wipe dir_entry excluding the rec_len field */ 2596 2498 de->inode = 0; 2499 + memset(&de->name_len, 0, 
2500 + ext4_rec_len_from_disk(de->rec_len, 2501 + blocksize) - 2502 + offsetof(struct ext4_dir_entry_2, 2503 + name_len)); 2504 + } 2505 + 2597 2506 inode_inc_iversion(dir); 2598 2507 return 0; 2599 2508 } ··· 2837 2722 { 2838 2723 de->inode = cpu_to_le32(inode->i_ino); 2839 2724 de->name_len = 1; 2840 - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), 2725 + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), 2841 2726 blocksize); 2842 2727 strcpy(de->name, "."); 2843 2728 ext4_set_de_type(inode->i_sb, de, S_IFDIR); ··· 2847 2732 de->name_len = 2; 2848 2733 if (!dotdot_real_len) 2849 2734 de->rec_len = ext4_rec_len_to_disk(blocksize - 2850 - (csum_size + EXT4_DIR_REC_LEN(1)), 2735 + (csum_size + ext4_dir_rec_len(1, NULL)), 2851 2736 blocksize); 2852 2737 else 2853 2738 de->rec_len = ext4_rec_len_to_disk( 2854 - EXT4_DIR_REC_LEN(de->name_len), blocksize); 2739 + ext4_dir_rec_len(de->name_len, NULL), 2740 + blocksize); 2855 2741 strcpy(de->name, ".."); 2856 2742 ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2857 2743 ··· 2985 2869 } 2986 2870 2987 2871 sb = inode->i_sb; 2988 - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { 2872 + if (inode->i_size < ext4_dir_rec_len(1, NULL) + 2873 + ext4_dir_rec_len(2, NULL)) { 2989 2874 EXT4_ERROR_INODE(inode, "invalid size"); 2990 2875 return true; 2991 2876 } ··· 3489 3372 * for transaction commit if we are running out of space 3490 3373 * and thus we deadlock. So we have to stop transaction now 3491 3374 * and restart it when symlink contents is written. 3492 - * 3375 + * 3493 3376 * To keep fs consistent in case of crash, we have to put inode 3494 3377 * to orphan list in the mean time. 3495 3378 */
+73 -43
fs/ext4/super.c
··· 667 667 ext4_commit_super(sb); 668 668 } 669 669 670 - if (sb_rdonly(sb) || continue_fs) 671 - return; 672 - 673 670 /* 674 671 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 675 672 * could panic during 'reboot -f' as the underlying device got already ··· 676 679 panic("EXT4-fs (device %s): panic forced after error\n", 677 680 sb->s_id); 678 681 } 682 + 683 + if (sb_rdonly(sb) || continue_fs) 684 + return; 685 + 679 686 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 680 687 /* 681 688 * Make sure updated value of ->s_mount_flags will be visible before ··· 1689 1688 Opt_dioread_nolock, Opt_dioread_lock, 1690 1689 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1691 1690 Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, 1692 - Opt_prefetch_block_bitmaps, 1691 + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, 1693 1692 #ifdef CONFIG_EXT4_DEBUG 1694 1693 Opt_fc_debug_max_replay, Opt_fc_debug_force 1695 1694 #endif ··· 1789 1788 {Opt_inlinecrypt, "inlinecrypt"}, 1790 1789 {Opt_nombcache, "nombcache"}, 1791 1790 {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ 1792 - {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, 1791 + {Opt_removed, "prefetch_block_bitmaps"}, 1792 + {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, 1793 + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, 1793 1794 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1794 1795 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1795 1796 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ ··· 1824 1821 } 1825 1822 1826 1823 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1824 + #define DEFAULT_MB_OPTIMIZE_SCAN (-1) 1825 + 1827 1826 static const char deprecated_msg[] = 1828 1827 "Mount option \"%s\" will be removed by %s\n" 1829 1828 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; ··· 2012 2007 {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 2013 
2008 {Opt_test_dummy_encryption, 0, MOPT_STRING}, 2014 2009 {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, 2015 - {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, 2010 + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, 2016 2011 MOPT_SET}, 2012 + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, 2017 2013 #ifdef CONFIG_EXT4_DEBUG 2018 2014 {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, 2019 2015 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, ··· 2096 2090 return 1; 2097 2091 } 2098 2092 2093 + struct ext4_parsed_options { 2094 + unsigned long journal_devnum; 2095 + unsigned int journal_ioprio; 2096 + int mb_optimize_scan; 2097 + }; 2098 + 2099 2099 static int handle_mount_opt(struct super_block *sb, char *opt, int token, 2100 - substring_t *args, unsigned long *journal_devnum, 2101 - unsigned int *journal_ioprio, int is_remount) 2100 + substring_t *args, struct ext4_parsed_options *parsed_opts, 2101 + int is_remount) 2102 2102 { 2103 2103 struct ext4_sb_info *sbi = EXT4_SB(sb); 2104 2104 const struct mount_opts *m; ··· 2261 2249 "Cannot specify journal on remount"); 2262 2250 return -1; 2263 2251 } 2264 - *journal_devnum = arg; 2252 + parsed_opts->journal_devnum = arg; 2265 2253 } else if (token == Opt_journal_path) { 2266 2254 char *journal_path; 2267 2255 struct inode *journal_inode; ··· 2297 2285 return -1; 2298 2286 } 2299 2287 2300 - *journal_devnum = new_encode_dev(journal_inode->i_rdev); 2288 + parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); 2301 2289 path_put(&path); 2302 2290 kfree(journal_path); 2303 2291 } else if (token == Opt_journal_ioprio) { ··· 2306 2294 " (must be 0-7)"); 2307 2295 return -1; 2308 2296 } 2309 - *journal_ioprio = 2297 + parsed_opts->journal_ioprio = 2310 2298 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); 2311 2299 } else if (token == Opt_test_dummy_encryption) { 2312 2300 return ext4_set_test_dummy_encryption(sb, opt, &args[0], ··· 2396 2384 sbi->s_mount_opt |= 
m->mount_opt; 2397 2385 } else if (token == Opt_data_err_ignore) { 2398 2386 sbi->s_mount_opt &= ~m->mount_opt; 2387 + } else if (token == Opt_mb_optimize_scan) { 2388 + if (arg != 0 && arg != 1) { 2389 + ext4_msg(sb, KERN_WARNING, 2390 + "mb_optimize_scan should be set to 0 or 1."); 2391 + return -1; 2392 + } 2393 + parsed_opts->mb_optimize_scan = arg; 2399 2394 } else { 2400 2395 if (!args->from) 2401 2396 arg = 1; ··· 2430 2411 } 2431 2412 2432 2413 static int parse_options(char *options, struct super_block *sb, 2433 - unsigned long *journal_devnum, 2434 - unsigned int *journal_ioprio, 2414 + struct ext4_parsed_options *ret_opts, 2435 2415 int is_remount) 2436 2416 { 2437 2417 struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); ··· 2450 2432 */ 2451 2433 args[0].to = args[0].from = NULL; 2452 2434 token = match_token(p, tokens, args); 2453 - if (handle_mount_opt(sb, p, token, args, journal_devnum, 2454 - journal_ioprio, is_remount) < 0) 2435 + if (handle_mount_opt(sb, p, token, args, ret_opts, 2436 + is_remount) < 0) 2455 2437 return 0; 2456 2438 } 2457 2439 #ifdef CONFIG_QUOTA ··· 3041 3023 sb->s_flags &= ~SB_RDONLY; 3042 3024 } 3043 3025 #ifdef CONFIG_QUOTA 3044 - /* Needed for iput() to work correctly and not trash data */ 3045 - sb->s_flags |= SB_ACTIVE; 3046 - 3047 3026 /* 3048 3027 * Turn on quotas which were not enabled for read-only mounts if 3049 3028 * filesystem has quota feature, so that they are updated correctly. 
··· 3706 3691 3707 3692 elr->lr_super = sb; 3708 3693 elr->lr_first_not_zeroed = start; 3709 - if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) 3710 - elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; 3711 - else { 3694 + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { 3712 3695 elr->lr_mode = EXT4_LI_MODE_ITABLE; 3713 3696 elr->lr_next_group = start; 3697 + } else { 3698 + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; 3714 3699 } 3715 3700 3716 3701 /* ··· 3741 3726 goto out; 3742 3727 } 3743 3728 3744 - if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && 3729 + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && 3745 3730 (first_not_zeroed == ngroups || sb_rdonly(sb) || 3746 3731 !test_opt(sb, INIT_INODE_TABLE))) 3747 3732 goto out; ··· 4030 4015 ext4_fsblk_t sb_block = get_sb_block(&data); 4031 4016 ext4_fsblk_t logical_sb_block; 4032 4017 unsigned long offset = 0; 4033 - unsigned long journal_devnum = 0; 4034 4018 unsigned long def_mount_opts; 4035 4019 struct inode *root; 4036 4020 const char *descr; ··· 4040 4026 int needs_recovery, has_huge_files; 4041 4027 __u64 blocks_count; 4042 4028 int err = 0; 4043 - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4044 4029 ext4_group_t first_not_zeroed; 4030 + struct ext4_parsed_options parsed_opts; 4031 + 4032 + /* Set defaults for the variables that will be set during parsing */ 4033 + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4034 + parsed_opts.journal_devnum = 0; 4035 + parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; 4045 4036 4046 4037 if ((data && !orig_data) || !sbi) 4047 4038 goto out_free_base; ··· 4292 4273 GFP_KERNEL); 4293 4274 if (!s_mount_opts) 4294 4275 goto failed_mount; 4295 - if (!parse_options(s_mount_opts, sb, &journal_devnum, 4296 - &journal_ioprio, 0)) { 4276 + if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { 4297 4277 ext4_msg(sb, KERN_WARNING, 4298 4278 "failed to parse options in superblock: %s", 4299 4279 s_mount_opts); ··· 4300 4282 kfree(s_mount_opts); 4301 4283 } 4302 4284 
sbi->s_def_mount_opt = sbi->s_mount_opt; 4303 - if (!parse_options((char *) data, sb, &journal_devnum, 4304 - &journal_ioprio, 0)) 4285 + if (!parse_options((char *) data, sb, &parsed_opts, 0)) 4305 4286 goto failed_mount; 4306 4287 4307 4288 #ifdef CONFIG_UNICODE ··· 4308 4291 const struct ext4_sb_encodings *encoding_info; 4309 4292 struct unicode_map *encoding; 4310 4293 __u16 encoding_flags; 4311 - 4312 - if (ext4_has_feature_encrypt(sb)) { 4313 - ext4_msg(sb, KERN_ERR, 4314 - "Can't mount with encoding and encryption"); 4315 - goto failed_mount; 4316 - } 4317 4294 4318 4295 if (ext4_sb_read_encoding(es, &encoding_info, 4319 4296 &encoding_flags)) { ··· 4785 4774 * root first: it may be modified in the journal! 4786 4775 */ 4787 4776 if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { 4788 - err = ext4_load_journal(sb, es, journal_devnum); 4777 + err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); 4789 4778 if (err) 4790 4779 goto failed_mount3a; 4791 4780 } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ··· 4885 4874 goto failed_mount_wq; 4886 4875 } 4887 4876 4888 - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4877 + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 4889 4878 4890 4879 sbi->s_journal->j_submit_inode_data_buffers = 4891 4880 ext4_journal_submit_inode_data_buffers; ··· 4991 4980 ext4_fc_replay_cleanup(sb); 4992 4981 4993 4982 ext4_ext_init(sb); 4983 + 4984 + /* 4985 + * Enable optimize_scan if number of groups is > threshold. This can be 4986 + * turned off by passing "mb_optimize_scan=0". This can also be 4987 + * turned on forcefully by passing "mb_optimize_scan=1". 
4988 + */ 4989 + if (parsed_opts.mb_optimize_scan == 1) 4990 + set_opt2(sb, MB_OPTIMIZE_SCAN); 4991 + else if (parsed_opts.mb_optimize_scan == 0) 4992 + clear_opt2(sb, MB_OPTIMIZE_SCAN); 4993 + else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) 4994 + set_opt2(sb, MB_OPTIMIZE_SCAN); 4995 + 4994 4996 err = ext4_mb_init(sb); 4995 4997 if (err) { 4996 4998 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", ··· 5020 4996 ext4_journal_commit_callback; 5021 4997 5022 4998 block = ext4_count_free_clusters(sb); 5023 - ext4_free_blocks_count_set(sbi->s_es, 4999 + ext4_free_blocks_count_set(sbi->s_es, 5024 5000 EXT4_C2B(sbi, block)); 5025 5001 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5026 5002 GFP_KERNEL); ··· 5585 5561 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5586 5562 int error = 0; 5587 5563 5588 - if (!sbh || block_device_ejected(sb)) 5589 - return error; 5564 + if (!sbh) 5565 + return -EINVAL; 5566 + if (block_device_ejected(sb)) 5567 + return -ENODEV; 5590 5568 5591 5569 ext4_update_super(sb); 5592 5570 ··· 5839 5813 struct ext4_mount_options old_opts; 5840 5814 int enable_quota = 0; 5841 5815 ext4_group_t g; 5842 - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 5843 5816 int err = 0; 5844 5817 #ifdef CONFIG_QUOTA 5845 5818 int i, j; 5846 5819 char *to_free[EXT4_MAXQUOTAS]; 5847 5820 #endif 5848 5821 char *orig_data = kstrdup(data, GFP_KERNEL); 5822 + struct ext4_parsed_options parsed_opts; 5823 + 5824 + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 5825 + parsed_opts.journal_devnum = 0; 5849 5826 5850 5827 if (data && !orig_data) 5851 5828 return -ENOMEM; ··· 5879 5850 old_opts.s_qf_names[i] = NULL; 5880 5851 #endif 5881 5852 if (sbi->s_journal && sbi->s_journal->j_task->io_context) 5882 - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; 5853 + parsed_opts.journal_ioprio = 5854 + sbi->s_journal->j_task->io_context->ioprio; 5883 5855 5884 5856 /* 5885 5857 * Some options can be enabled by ext4 
and/or by VFS mount flag ··· 5890 5860 vfs_flags = SB_LAZYTIME | SB_I_VERSION; 5891 5861 sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); 5892 5862 5893 - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { 5863 + if (!parse_options(data, sb, &parsed_opts, 1)) { 5894 5864 err = -EINVAL; 5895 5865 goto restore_opts; 5896 5866 } ··· 5940 5910 5941 5911 if (sbi->s_journal) { 5942 5912 ext4_init_journal_params(sb, sbi->s_journal); 5943 - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 5913 + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 5944 5914 } 5945 5915 5946 5916 /* Flush outstanding errors before changing fs state */
+8
fs/ext4/sysfs.c
··· 215 215 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 216 216 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 217 217 EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); 218 + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); 218 219 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 219 220 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); 220 221 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); ··· 264 263 ATTR_LIST(mb_stream_req), 265 264 ATTR_LIST(mb_group_prealloc), 266 265 ATTR_LIST(mb_max_inode_prealloc), 266 + ATTR_LIST(mb_max_linear_groups), 267 267 ATTR_LIST(max_writeback_mb_bump), 268 268 ATTR_LIST(extent_max_zeroout_kb), 269 269 ATTR_LIST(trigger_fs_error), ··· 315 313 #endif 316 314 EXT4_ATTR_FEATURE(metadata_csum_seed); 317 315 EXT4_ATTR_FEATURE(fast_commit); 316 + EXT4_ATTR_FEATURE(encrypted_casefold); 318 317 319 318 static struct attribute *ext4_feat_attrs[] = { 320 319 ATTR_LIST(lazy_itable_init), ··· 333 330 #endif 334 331 ATTR_LIST(metadata_csum_seed), 335 332 ATTR_LIST(fast_commit), 333 + ATTR_LIST(encrypted_casefold), 336 334 NULL, 337 335 }; 338 336 ATTRIBUTE_GROUPS(ext4_feat); ··· 532 528 ext4_fc_info_show, sb); 533 529 proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, 534 530 &ext4_mb_seq_groups_ops, sb); 531 + proc_create_single_data("mb_stats", 0444, sbi->s_proc, 532 + ext4_seq_mb_stats_show, sb); 533 + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, 534 + &ext4_mb_seq_structs_summary_ops, sb); 535 535 } 536 536 return 0; 537 537 }
+2 -8
fs/ext4/verity.c
··· 45 45 size_t n = min_t(size_t, count, 46 46 PAGE_SIZE - offset_in_page(pos)); 47 47 struct page *page; 48 - void *addr; 49 48 50 49 page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, 51 50 NULL); 52 51 if (IS_ERR(page)) 53 52 return PTR_ERR(page); 54 53 55 - addr = kmap_atomic(page); 56 - memcpy(buf, addr + offset_in_page(pos), n); 57 - kunmap_atomic(addr); 54 + memcpy_from_page(buf, page, offset_in_page(pos), n); 58 55 59 56 put_page(page); 60 57 ··· 77 80 PAGE_SIZE - offset_in_page(pos)); 78 81 struct page *page; 79 82 void *fsdata; 80 - void *addr; 81 83 int res; 82 84 83 85 res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, ··· 84 88 if (res) 85 89 return res; 86 90 87 - addr = kmap_atomic(page); 88 - memcpy(addr + offset_in_page(pos), buf, n); 89 - kunmap_atomic(addr); 91 + memcpy_to_page(page, offset_in_page(pos), buf, n); 90 92 91 93 res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, 92 94 page, fsdata);
+1 -1
fs/ext4/xattr.c
··· 1617 1617 * If storing the value in an external inode is an option, 1618 1618 * reserve space for xattr entries/names in the external 1619 1619 * attribute block so that a long value does not occupy the 1620 - * whole space and prevent futher entries being added. 1620 + * whole space and prevent further entries being added. 1621 1621 */ 1622 1622 if (ext4_has_feature_ea_inode(inode->i_sb) && 1623 1623 new_size && is_block &&
+2 -3
fs/jbd2/recovery.c
··· 245 245 return 0; 246 246 247 247 while (next_fc_block <= journal->j_fc_last) { 248 - jbd_debug(3, "Fast commit replay: next block %ld", 248 + jbd_debug(3, "Fast commit replay: next block %ld\n", 249 249 next_fc_block); 250 250 err = jread(&bh, journal, next_fc_block); 251 251 if (err) { 252 - jbd_debug(3, "Fast commit replay: read error"); 252 + jbd_debug(3, "Fast commit replay: read error\n"); 253 253 break; 254 254 } 255 255 256 - jbd_debug(3, "Processing fast commit blk with seq %d"); 257 256 err = journal->j_fc_replay_callback(journal, bh, pass, 258 257 next_fc_block - journal->j_fc_first, 259 258 expected_commit_id);
+10 -5
fs/jbd2/transaction.c
··· 349 349 } 350 350 351 351 alloc_transaction: 352 - if (!journal->j_running_transaction) { 352 + /* 353 + * This check is racy but it is just an optimization of allocating new 354 + * transaction early if there are high chances we'll need it. If we 355 + * guess wrong, we'll retry or free unused transaction. 356 + */ 357 + if (!data_race(journal->j_running_transaction)) { 353 358 /* 354 359 * If __GFP_FS is not present, then we may be being called from 355 360 * inside the fs writeback layer, so we MUST NOT fail. ··· 1479 1474 * crucial to catch bugs so let's do a reliable check until the 1480 1475 * lockless handling is fully proven. 1481 1476 */ 1482 - if (jh->b_transaction != transaction && 1483 - jh->b_next_transaction != transaction) { 1477 + if (data_race(jh->b_transaction != transaction && 1478 + jh->b_next_transaction != transaction)) { 1484 1479 spin_lock(&jh->b_state_lock); 1485 1480 J_ASSERT_JH(jh, jh->b_transaction == transaction || 1486 1481 jh->b_next_transaction == transaction); ··· 1488 1483 } 1489 1484 if (jh->b_modified == 1) { 1490 1485 /* If it's in our transaction it must be in BJ_Metadata list. */ 1491 - if (jh->b_transaction == transaction && 1492 - jh->b_jlist != BJ_Metadata) { 1486 + if (data_race(jh->b_transaction == transaction && 1487 + jh->b_jlist != BJ_Metadata)) { 1493 1488 spin_lock(&jh->b_state_lock); 1494 1489 if (jh->b_transaction == transaction && 1495 1490 jh->b_jlist != BJ_Metadata)
+8
fs/stat.c
··· 86 86 /* SB_NOATIME means filesystem supplies dummy atime value */ 87 87 if (inode->i_sb->s_flags & SB_NOATIME) 88 88 stat->result_mask &= ~STATX_ATIME; 89 + 90 + /* 91 + * Note: If you add another clause to set an attribute flag, please 92 + * update attributes_mask below. 93 + */ 89 94 if (IS_AUTOMOUNT(inode)) 90 95 stat->attributes |= STATX_ATTR_AUTOMOUNT; 91 96 92 97 if (IS_DAX(inode)) 93 98 stat->attributes |= STATX_ATTR_DAX; 99 + 100 + stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | 101 + STATX_ATTR_DAX); 94 102 95 103 mnt_userns = mnt_user_ns(path->mnt); 96 104 if (inode->i_op->getattr)
+21 -12
include/linux/jbd2.h
··· 61 61 #define jbd_debug(n, fmt, a...) \ 62 62 __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) 63 63 #else 64 - #define jbd_debug(n, fmt, a...) /**/ 64 + #define jbd_debug(n, fmt, a...) no_printk(fmt, ##a) 65 65 #endif 66 66 67 67 extern void *jbd2_alloc(size_t size, gfp_t flags); ··· 594 594 */ 595 595 unsigned long t_log_start; 596 596 597 - /* Number of buffers on the t_buffers list [j_list_lock] */ 597 + /* 598 + * Number of buffers on the t_buffers list [j_list_lock, no locks 599 + * needed for jbd2 thread] 600 + */ 598 601 int t_nr_buffers; 599 602 600 603 /* 601 604 * Doubly-linked circular list of all buffers reserved but not yet 602 - * modified by this transaction [j_list_lock] 605 + * modified by this transaction [j_list_lock, no locks needed fo 606 + * jbd2 thread] 603 607 */ 604 608 struct journal_head *t_reserved_list; 605 609 606 610 /* 607 611 * Doubly-linked circular list of all metadata buffers owned by this 608 - * transaction [j_list_lock] 612 + * transaction [j_list_lock, no locks needed for jbd2 thread] 609 613 */ 610 614 struct journal_head *t_buffers; 611 615 ··· 633 629 struct journal_head *t_checkpoint_io_list; 634 630 635 631 /* 636 - * Doubly-linked circular list of metadata buffers being shadowed by log 637 - * IO. The IO buffers on the iobuf list and the shadow buffers on this 638 - * list match each other one for one at all times. [j_list_lock] 632 + * Doubly-linked circular list of metadata buffers being 633 + * shadowed by log IO. The IO buffers on the iobuf list and 634 + * the shadow buffers on this list match each other one for 635 + * one at all times. 
[j_list_lock, no locks needed for jbd2 636 + * thread] 639 637 */ 640 638 struct journal_head *t_shadow_list; 641 639 ··· 774 768 struct journal_s 775 769 { 776 770 /** 777 - * @j_flags: General journaling state flags [j_state_lock] 771 + * @j_flags: General journaling state flags [j_state_lock, 772 + * no lock for quick racy checks] 778 773 */ 779 774 unsigned long j_flags; 780 775 ··· 815 808 /** 816 809 * @j_barrier_count: 817 810 * 818 - * Number of processes waiting to create a barrier lock [j_state_lock] 811 + * Number of processes waiting to create a barrier lock [j_state_lock, 812 + * no lock for quick racy checks] 819 813 */ 820 814 int j_barrier_count; 821 815 ··· 829 821 * @j_running_transaction: 830 822 * 831 823 * Transactions: The current running transaction... 832 - * [j_state_lock] [caller holding open handle] 824 + * [j_state_lock, no lock for quick racy checks] [caller holding 825 + * open handle] 833 826 */ 834 827 transaction_t *j_running_transaction; 835 828 ··· 1042 1033 * @j_commit_sequence: 1043 1034 * 1044 1035 * Sequence number of the most recently committed transaction 1045 - * [j_state_lock]. 1036 + * [j_state_lock, no lock for quick racy checks] 1046 1037 */ 1047 1038 tid_t j_commit_sequence; 1048 1039 ··· 1050 1041 * @j_commit_request: 1051 1042 * 1052 1043 * Sequence number of the most recent transaction wanting commit 1053 - * [j_state_lock] 1044 + * [j_state_lock, no lock for quick racy checks] 1054 1045 */ 1055 1046 tid_t j_commit_request; 1056 1047
-176
include/trace/events/ext4.h
··· 1358 1358 __entry->group, __entry->prefetch) 1359 1359 ); 1360 1360 1361 - TRACE_EVENT(ext4_direct_IO_enter, 1362 - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), 1363 - 1364 - TP_ARGS(inode, offset, len, rw), 1365 - 1366 - TP_STRUCT__entry( 1367 - __field( dev_t, dev ) 1368 - __field( ino_t, ino ) 1369 - __field( loff_t, pos ) 1370 - __field( unsigned long, len ) 1371 - __field( int, rw ) 1372 - ), 1373 - 1374 - TP_fast_assign( 1375 - __entry->dev = inode->i_sb->s_dev; 1376 - __entry->ino = inode->i_ino; 1377 - __entry->pos = offset; 1378 - __entry->len = len; 1379 - __entry->rw = rw; 1380 - ), 1381 - 1382 - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", 1383 - MAJOR(__entry->dev), MINOR(__entry->dev), 1384 - (unsigned long) __entry->ino, 1385 - __entry->pos, __entry->len, __entry->rw) 1386 - ); 1387 - 1388 - TRACE_EVENT(ext4_direct_IO_exit, 1389 - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, 1390 - int rw, int ret), 1391 - 1392 - TP_ARGS(inode, offset, len, rw, ret), 1393 - 1394 - TP_STRUCT__entry( 1395 - __field( dev_t, dev ) 1396 - __field( ino_t, ino ) 1397 - __field( loff_t, pos ) 1398 - __field( unsigned long, len ) 1399 - __field( int, rw ) 1400 - __field( int, ret ) 1401 - ), 1402 - 1403 - TP_fast_assign( 1404 - __entry->dev = inode->i_sb->s_dev; 1405 - __entry->ino = inode->i_ino; 1406 - __entry->pos = offset; 1407 - __entry->len = len; 1408 - __entry->rw = rw; 1409 - __entry->ret = ret; 1410 - ), 1411 - 1412 - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", 1413 - MAJOR(__entry->dev), MINOR(__entry->dev), 1414 - (unsigned long) __entry->ino, 1415 - __entry->pos, __entry->len, 1416 - __entry->rw, __entry->ret) 1417 - ); 1418 - 1419 1361 DECLARE_EVENT_CLASS(ext4__fallocate_mode, 1420 1362 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1421 1363 ··· 1902 1960 MAJOR(__entry->dev), MINOR(__entry->dev), 1903 1961 __entry->lblk, (unsigned long long) __entry->pblk, 1904 
1962 __entry->len, show_mflags(__entry->flags), __entry->ret) 1905 - ); 1906 - 1907 - TRACE_EVENT(ext4_ext_put_in_cache, 1908 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, 1909 - ext4_fsblk_t start), 1910 - 1911 - TP_ARGS(inode, lblk, len, start), 1912 - 1913 - TP_STRUCT__entry( 1914 - __field( dev_t, dev ) 1915 - __field( ino_t, ino ) 1916 - __field( ext4_lblk_t, lblk ) 1917 - __field( unsigned int, len ) 1918 - __field( ext4_fsblk_t, start ) 1919 - ), 1920 - 1921 - TP_fast_assign( 1922 - __entry->dev = inode->i_sb->s_dev; 1923 - __entry->ino = inode->i_ino; 1924 - __entry->lblk = lblk; 1925 - __entry->len = len; 1926 - __entry->start = start; 1927 - ), 1928 - 1929 - TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", 1930 - MAJOR(__entry->dev), MINOR(__entry->dev), 1931 - (unsigned long) __entry->ino, 1932 - (unsigned) __entry->lblk, 1933 - __entry->len, 1934 - (unsigned long long) __entry->start) 1935 - ); 1936 - 1937 - TRACE_EVENT(ext4_ext_in_cache, 1938 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), 1939 - 1940 - TP_ARGS(inode, lblk, ret), 1941 - 1942 - TP_STRUCT__entry( 1943 - __field( dev_t, dev ) 1944 - __field( ino_t, ino ) 1945 - __field( ext4_lblk_t, lblk ) 1946 - __field( int, ret ) 1947 - ), 1948 - 1949 - TP_fast_assign( 1950 - __entry->dev = inode->i_sb->s_dev; 1951 - __entry->ino = inode->i_ino; 1952 - __entry->lblk = lblk; 1953 - __entry->ret = ret; 1954 - ), 1955 - 1956 - TP_printk("dev %d,%d ino %lu lblk %u ret %d", 1957 - MAJOR(__entry->dev), MINOR(__entry->dev), 1958 - (unsigned long) __entry->ino, 1959 - (unsigned) __entry->lblk, 1960 - __entry->ret) 1961 - 1962 - ); 1963 - 1964 - TRACE_EVENT(ext4_find_delalloc_range, 1965 - TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, 1966 - int reverse, int found, ext4_lblk_t found_blk), 1967 - 1968 - TP_ARGS(inode, from, to, reverse, found, found_blk), 1969 - 1970 - TP_STRUCT__entry( 1971 - __field( dev_t, dev ) 1972 - __field( ino_t, ino ) 1973 - 
__field( ext4_lblk_t, from ) 1974 - __field( ext4_lblk_t, to ) 1975 - __field( int, reverse ) 1976 - __field( int, found ) 1977 - __field( ext4_lblk_t, found_blk ) 1978 - ), 1979 - 1980 - TP_fast_assign( 1981 - __entry->dev = inode->i_sb->s_dev; 1982 - __entry->ino = inode->i_ino; 1983 - __entry->from = from; 1984 - __entry->to = to; 1985 - __entry->reverse = reverse; 1986 - __entry->found = found; 1987 - __entry->found_blk = found_blk; 1988 - ), 1989 - 1990 - TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " 1991 - "(blk = %u)", 1992 - MAJOR(__entry->dev), MINOR(__entry->dev), 1993 - (unsigned long) __entry->ino, 1994 - (unsigned) __entry->from, (unsigned) __entry->to, 1995 - __entry->reverse, __entry->found, 1996 - (unsigned) __entry->found_blk) 1997 - ); 1998 - 1999 - TRACE_EVENT(ext4_get_reserved_cluster_alloc, 2000 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), 2001 - 2002 - TP_ARGS(inode, lblk, len), 2003 - 2004 - TP_STRUCT__entry( 2005 - __field( dev_t, dev ) 2006 - __field( ino_t, ino ) 2007 - __field( ext4_lblk_t, lblk ) 2008 - __field( unsigned int, len ) 2009 - ), 2010 - 2011 - TP_fast_assign( 2012 - __entry->dev = inode->i_sb->s_dev; 2013 - __entry->ino = inode->i_ino; 2014 - __entry->lblk = lblk; 2015 - __entry->len = len; 2016 - ), 2017 - 2018 - TP_printk("dev %d,%d ino %lu lblk %u len %u", 2019 - MAJOR(__entry->dev), MINOR(__entry->dev), 2020 - (unsigned long) __entry->ino, 2021 - (unsigned) __entry->lblk, 2022 - __entry->len) 2023 1963 ); 2024 1964 2025 1965 TRACE_EVENT(ext4_ext_show_extent,