Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"This merge request includes the dax-4.0-iomap-pmd branch which is
needed for both ext4 and xfs dax changes to use iomap for DAX. It also
includes the fscrypt branch which is needed for ubifs encryption work
as well as ext4 encryption and fscrypt cleanups.

Lots of cleanups and bug fixes, especially making sure ext4 is robust
against maliciously corrupted file systems --- especially maliciously
corrupted xattr blocks and a maliciously corrupted superblock. Also
fix ext4 support for 64k block sizes so it works well on ppcle. Fixed
mbcache so we don't miss some common xattr blocks that can be merged"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (86 commits)
dax: Fix sleep in atomic contex in grab_mapping_entry()
fscrypt: Rename FS_WRITE_PATH_FL to FS_CTX_HAS_BOUNCE_BUFFER_FL
fscrypt: Delay bounce page pool allocation until needed
fscrypt: Cleanup page locking requirements for fscrypt_{decrypt,encrypt}_page()
fscrypt: Cleanup fscrypt_{decrypt,encrypt}_page()
fscrypt: Never allocate fscrypt_ctx on in-place encryption
fscrypt: Use correct index in decrypt path.
fscrypt: move the policy flags and encryption mode definitions to uapi header
fscrypt: move non-public structures and constants to fscrypt_private.h
fscrypt: unexport fscrypt_initialize()
fscrypt: rename get_crypt_info() to fscrypt_get_crypt_info()
fscrypto: move ioctl processing more fully into common code
fscrypto: remove unneeded Kconfig dependencies
MAINTAINERS: fscrypto: recommend linux-fsdevel for fscrypto patches
ext4: do not perform data journaling when data is encrypted
ext4: return -ENOMEM instead of success
ext4: reject inodes with negative size
ext4: remove another test in ext4_alloc_file_blocks()
Documentation: fix description of ext4's block_validity mount option
ext4: fix checks for data=ordered and journal_async_commit options
...

+1501 -1353
+11 -11
Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@
 Filesystem support consists of
 - adding support to mark inodes as being DAX by setting the S_DAX flag in
   i_flags
-- implementing the direct_IO address space operation, and calling
-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
+  when inode has S_DAX flag set
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
-  include handlers for fault, pmd_fault and page_mkwrite (which should
-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
-  appropriate get_block() callback)
-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
-- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
+  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
+  iomap operations.
+- calling iomap_zero_range() passing appropriate iomap operations instead of
+  block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
-The get_block() callback passed to the DAX functions may return
-uninitialised extents. If it does, it must ensure that simultaneous
-calls to get_block() (for example by a page-fault racing with a read()
-or a write()) work correctly.
+The iomap handlers for allocating blocks must make sure that allocated blocks
+are zeroed out and converted to written extents before being returned to avoid
+exposure of uninitialized data through mmap.
 
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
+6 -7
Documentation/filesystems/ext4.txt
@@ -351,14 +351,13 @@
                         interoperability with older kernels which only
                         store and expect 16-bit values.
 
-block_validity          This options allows to enables/disables the in-kernel
+block_validity(*)       These options enable or disable the in-kernel
 noblock_validity        facility for tracking filesystem metadata blocks
-                        within internal data structures. This allows multi-
-                        block allocator and other routines to quickly locate
-                        extents which might overlap with filesystem metadata
-                        blocks. This option is intended for debugging
-                        purposes and since it negatively affects the
-                        performance, it is off by default.
+                        within internal data structures. This allows multi-
+                        block allocator and other routines to notice
+                        bugs or corrupted allocation bitmaps which cause
+                        blocks to be allocated which overlap with
+                        filesystem metadata blocks.
 
 dioread_lock            Controls whether or not ext4 should use the DIO read
 dioread_nolock          locking. If the dioread_nolock option is specified
+1
MAINTAINERS
@@ -5240,6 +5240,7 @@
 FS-CRYPTO: FILE SYSTEM LEVEL ENCRYPTION SUPPORT
 M:	Theodore Y. Ts'o <tytso@mit.edu>
 M:	Jaegeuk Kim <jaegeuk@kernel.org>
+L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/crypto/
 F:	include/linux/fscrypto.h
-1
fs/Kconfig
@@ -55,7 +55,6 @@
 	depends on FS_DAX
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
-	depends on BROKEN
 
 endif # BLOCK
 
-2
fs/crypto/Kconfig
@@ -8,9 +8,7 @@
 	select CRYPTO_XTS
 	select CRYPTO_CTS
 	select CRYPTO_CTR
-	select CRYPTO_SHA256
 	select KEYS
-	select ENCRYPTED_KEYS
 	help
 	  Enable encryption of files and directories. This
 	  feature is similar to ecryptfs, but it is more memory
+85 -38
fs/crypto/crypto.c
··· 27 27 #include <linux/bio.h> 28 28 #include <linux/dcache.h> 29 29 #include <linux/namei.h> 30 - #include <linux/fscrypto.h> 30 + #include "fscrypt_private.h" 31 31 32 32 static unsigned int num_prealloc_crypto_pages = 32; 33 33 static unsigned int num_prealloc_crypto_ctxs = 128; ··· 63 63 { 64 64 unsigned long flags; 65 65 66 - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { 66 + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { 67 67 mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); 68 68 ctx->w.bounce_page = NULL; 69 69 } ··· 88 88 * Return: An allocated and initialized encryption context on success; error 89 89 * value or NULL otherwise. 90 90 */ 91 - struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) 91 + struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) 92 92 { 93 93 struct fscrypt_ctx *ctx = NULL; 94 94 struct fscrypt_info *ci = inode->i_crypt_info; ··· 121 121 } else { 122 122 ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; 123 123 } 124 - ctx->flags &= ~FS_WRITE_PATH_FL; 124 + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; 125 125 return ctx; 126 126 } 127 127 EXPORT_SYMBOL(fscrypt_get_ctx); ··· 146 146 FS_ENCRYPT, 147 147 } fscrypt_direction_t; 148 148 149 - static int do_page_crypto(struct inode *inode, 150 - fscrypt_direction_t rw, pgoff_t index, 149 + static int do_page_crypto(const struct inode *inode, 150 + fscrypt_direction_t rw, u64 lblk_num, 151 151 struct page *src_page, struct page *dest_page, 152 + unsigned int len, unsigned int offs, 152 153 gfp_t gfp_flags) 153 154 { 154 155 struct { ··· 162 161 struct fscrypt_info *ci = inode->i_crypt_info; 163 162 struct crypto_skcipher *tfm = ci->ci_ctfm; 164 163 int res = 0; 164 + 165 + BUG_ON(len == 0); 165 166 166 167 req = skcipher_request_alloc(tfm, gfp_flags); 167 168 if (!req) { ··· 178 175 page_crypt_complete, &ecr); 179 176 180 177 BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); 181 - 
xts_tweak.index = cpu_to_le64(index); 178 + xts_tweak.index = cpu_to_le64(lblk_num); 182 179 memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); 183 180 184 181 sg_init_table(&dst, 1); 185 - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); 182 + sg_set_page(&dst, dest_page, len, offs); 186 183 sg_init_table(&src, 1); 187 - sg_set_page(&src, src_page, PAGE_SIZE, 0); 188 - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); 184 + sg_set_page(&src, src_page, len, offs); 185 + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); 189 186 if (rw == FS_DECRYPT) 190 187 res = crypto_skcipher_decrypt(req); 191 188 else ··· 210 207 ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); 211 208 if (ctx->w.bounce_page == NULL) 212 209 return ERR_PTR(-ENOMEM); 213 - ctx->flags |= FS_WRITE_PATH_FL; 210 + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; 214 211 return ctx->w.bounce_page; 215 212 } 216 213 217 214 /** 218 215 * fscypt_encrypt_page() - Encrypts a page 219 - * @inode: The inode for which the encryption should take place 220 - * @plaintext_page: The page to encrypt. Must be locked. 221 - * @gfp_flags: The gfp flag for memory allocation 216 + * @inode: The inode for which the encryption should take place 217 + * @page: The page to encrypt. Must be locked for bounce-page 218 + * encryption. 219 + * @len: Length of data to encrypt in @page and encrypted 220 + * data in returned page. 221 + * @offs: Offset of data within @page and returned 222 + * page holding encrypted data. 223 + * @lblk_num: Logical block number. This must be unique for multiple 224 + * calls with same inode, except when overwriting 225 + * previously written data. 226 + * @gfp_flags: The gfp flag for memory allocation 222 227 * 223 - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx 224 - * encryption context. 228 + * Encrypts @page using the ctx encryption context. 
Performs encryption 229 + * either in-place or into a newly allocated bounce page. 230 + * Called on the page write path. 225 231 * 226 - * Called on the page write path. The caller must call 232 + * Bounce page allocation is the default. 233 + * In this case, the contents of @page are encrypted and stored in an 234 + * allocated bounce page. @page has to be locked and the caller must call 227 235 * fscrypt_restore_control_page() on the returned ciphertext page to 228 236 * release the bounce buffer and the encryption context. 229 237 * 230 - * Return: An allocated page with the encrypted content on success. Else, an 238 + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in 239 + * fscrypt_operations. Here, the input-page is returned with its content 240 + * encrypted. 241 + * 242 + * Return: A page with the encrypted content on success. Else, an 231 243 * error value or NULL. 232 244 */ 233 - struct page *fscrypt_encrypt_page(struct inode *inode, 234 - struct page *plaintext_page, gfp_t gfp_flags) 245 + struct page *fscrypt_encrypt_page(const struct inode *inode, 246 + struct page *page, 247 + unsigned int len, 248 + unsigned int offs, 249 + u64 lblk_num, gfp_t gfp_flags) 250 + 235 251 { 236 252 struct fscrypt_ctx *ctx; 237 - struct page *ciphertext_page = NULL; 253 + struct page *ciphertext_page = page; 238 254 int err; 239 255 240 - BUG_ON(!PageLocked(plaintext_page)); 256 + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); 257 + 258 + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { 259 + /* with inplace-encryption we just encrypt the page */ 260 + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, 261 + page, ciphertext_page, 262 + len, offs, gfp_flags); 263 + if (err) 264 + return ERR_PTR(err); 265 + 266 + return ciphertext_page; 267 + } 268 + 269 + BUG_ON(!PageLocked(page)); 241 270 242 271 ctx = fscrypt_get_ctx(inode, gfp_flags); 243 272 if (IS_ERR(ctx)) ··· 280 245 if (IS_ERR(ciphertext_page)) 281 246 goto errout; 282 247 283 - 
ctx->w.control_page = plaintext_page; 284 - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, 285 - plaintext_page, ciphertext_page, 286 - gfp_flags); 248 + ctx->w.control_page = page; 249 + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, 250 + page, ciphertext_page, 251 + len, offs, gfp_flags); 287 252 if (err) { 288 253 ciphertext_page = ERR_PTR(err); 289 254 goto errout; ··· 300 265 EXPORT_SYMBOL(fscrypt_encrypt_page); 301 266 302 267 /** 303 - * f2crypt_decrypt_page() - Decrypts a page in-place 304 - * @page: The page to decrypt. Must be locked. 268 + * fscrypt_decrypt_page() - Decrypts a page in-place 269 + * @inode: The corresponding inode for the page to decrypt. 270 + * @page: The page to decrypt. Must be locked in case 271 + * it is a writeback page (FS_CFLG_OWN_PAGES unset). 272 + * @len: Number of bytes in @page to be decrypted. 273 + * @offs: Start of data in @page. 274 + * @lblk_num: Logical block number. 305 275 * 306 276 * Decrypts page in-place using the ctx encryption context. 307 277 * ··· 314 274 * 315 275 * Return: Zero on success, non-zero otherwise. 
316 276 */ 317 - int fscrypt_decrypt_page(struct page *page) 277 + int fscrypt_decrypt_page(const struct inode *inode, struct page *page, 278 + unsigned int len, unsigned int offs, u64 lblk_num) 318 279 { 319 - BUG_ON(!PageLocked(page)); 280 + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) 281 + BUG_ON(!PageLocked(page)); 320 282 321 - return do_page_crypto(page->mapping->host, 322 - FS_DECRYPT, page->index, page, page, GFP_NOFS); 283 + return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, 284 + offs, GFP_NOFS); 323 285 } 324 286 EXPORT_SYMBOL(fscrypt_decrypt_page); 325 287 326 - int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, 288 + int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, 327 289 sector_t pblk, unsigned int len) 328 290 { 329 291 struct fscrypt_ctx *ctx; ··· 348 306 while (len--) { 349 307 err = do_page_crypto(inode, FS_ENCRYPT, lblk, 350 308 ZERO_PAGE(0), ciphertext_page, 351 - GFP_NOFS); 309 + PAGE_SIZE, 0, GFP_NOFS); 352 310 if (err) 353 311 goto errout; 354 312 ··· 456 414 457 415 bio_for_each_segment_all(bv, bio, i) { 458 416 struct page *page = bv->bv_page; 459 - int ret = fscrypt_decrypt_page(page); 417 + int ret = fscrypt_decrypt_page(page->mapping->host, page, 418 + PAGE_SIZE, 0, page->index); 460 419 461 420 if (ret) { 462 421 WARN_ON_ONCE(1); ··· 525 482 526 483 /** 527 484 * fscrypt_initialize() - allocate major buffers for fs encryption. 485 + * @cop_flags: fscrypt operations flags 528 486 * 529 487 * We only call this when we start accessing encrypted files, since it 530 488 * results in memory getting allocated that wouldn't otherwise be used. 531 489 * 532 490 * Return: Zero on success, non-zero otherwise. 
533 491 */ 534 - int fscrypt_initialize(void) 492 + int fscrypt_initialize(unsigned int cop_flags) 535 493 { 536 494 int i, res = -ENOMEM; 537 495 538 - if (fscrypt_bounce_page_pool) 496 + /* 497 + * No need to allocate a bounce page pool if there already is one or 498 + * this FS won't use it. 499 + */ 500 + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) 539 501 return 0; 540 502 541 503 mutex_lock(&fscrypt_init_mutex); ··· 569 521 mutex_unlock(&fscrypt_init_mutex); 570 522 return res; 571 523 } 572 - EXPORT_SYMBOL(fscrypt_initialize); 573 524 574 525 /** 575 526 * fscrypt_init() - Set up for fs encryption.
+4 -4
fs/crypto/fname.c
@@ -12,7 +12,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/ratelimit.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
 
 /**
  * fname_crypt_complete() - completion callback for filename crypto
@@ -209,7 +209,7 @@
 	return cp - dst;
 }
 
-u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen)
 {
 	int padding = 32;
 	struct fscrypt_info *ci = inode->i_crypt_info;
@@ -227,7 +227,7 @@
  * Allocates an output buffer that is sufficient for the crypto operation
  * specified by the context and the direction.
  */
-int fscrypt_fname_alloc_buffer(struct inode *inode,
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
 				u32 ilen, struct fscrypt_str *crypto_str)
 {
 	unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
@@ -350,7 +350,7 @@
 		fname->disk_name.len = iname->len;
 		return 0;
 	}
-	ret = get_crypt_info(dir);
+	ret = fscrypt_get_crypt_info(dir);
 	if (ret && ret != -EOPNOTSUPP)
 		return ret;
 
+93
fs/crypto/fscrypt_private.h
@@ -0,0 +1,93 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#include <linux/fscrypto.h>
+
+#define FS_FNAME_CRYPTO_DIGEST_SIZE	32
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE		16
+#define FS_AES_128_ECB_KEY_SIZE		16
+#define FS_AES_256_GCM_KEY_SIZE		32
+#define FS_AES_256_CBC_KEY_SIZE		32
+#define FS_AES_256_CTS_KEY_SIZE		32
+#define FS_AES_256_XTS_KEY_SIZE		64
+#define FS_MAX_KEY_SIZE			64
+
+#define FS_KEY_DESC_PREFIX		"fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE		8
+
+#define FS_KEY_DERIVATION_NONCE_SIZE	16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Flags
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+	u8 format;
+	u8 contents_encryption_mode;
+	u8 filenames_encryption_mode;
+	u8 flags;
+	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;

+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1	1
+
+/* This is passed in from userspace into the kernel keyring */
+struct fscrypt_key {
+	u32 mode;
+	u8 raw[FS_MAX_KEY_SIZE];
+	u32 size;
+} __packed;
+
+/*
+ * A pointer to this structure is stored in the file system's in-core
+ * representation of an inode.
+ */
+struct fscrypt_info {
+	u8 ci_data_mode;
+	u8 ci_filename_mode;
+	u8 ci_flags;
+	struct crypto_skcipher *ci_ctfm;
+	struct key *ci_keyring_key;
+	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
+
+struct fscrypt_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+	struct fscrypt_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+
+/* crypto.c */
+int fscrypt_initialize(unsigned int cop_flags);
+
+/* keyinfo.c */
+extern int fscrypt_get_crypt_info(struct inode *);
+
+#endif /* _FSCRYPT_PRIVATE_H */
+4 -4
fs/crypto/keyinfo.c
··· 10 10 11 11 #include <keys/user-type.h> 12 12 #include <linux/scatterlist.h> 13 - #include <linux/fscrypto.h> 13 + #include "fscrypt_private.h" 14 14 15 15 static void derive_crypt_complete(struct crypto_async_request *req, int rc) 16 16 { ··· 178 178 kmem_cache_free(fscrypt_info_cachep, ci); 179 179 } 180 180 181 - int get_crypt_info(struct inode *inode) 181 + int fscrypt_get_crypt_info(struct inode *inode) 182 182 { 183 183 struct fscrypt_info *crypt_info; 184 184 struct fscrypt_context ctx; ··· 188 188 u8 *raw_key = NULL; 189 189 int res; 190 190 191 - res = fscrypt_initialize(); 191 + res = fscrypt_initialize(inode->i_sb->s_cop->flags); 192 192 if (res) 193 193 return res; 194 194 ··· 327 327 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | 328 328 (1 << KEY_FLAG_REVOKED) | 329 329 (1 << KEY_FLAG_DEAD))))) 330 - return get_crypt_info(inode); 330 + return fscrypt_get_crypt_info(inode); 331 331 return 0; 332 332 } 333 333 EXPORT_SYMBOL(fscrypt_get_encryption_info);
+22 -14
fs/crypto/policy.c
@@ -10,8 +10,8 @@
 
 #include <linux/random.h>
 #include <linux/string.h>
-#include <linux/fscrypto.h>
 #include <linux/mount.h>
+#include "fscrypt_private.h"
 
 static int inode_has_encryption_context(struct inode *inode)
 {
@@ -93,16 +93,19 @@
 	return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
 }
 
-int fscrypt_process_policy(struct file *filp,
-				const struct fscrypt_policy *policy)
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 {
+	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 	int ret;
+
+	if (copy_from_user(&policy, arg, sizeof(policy)))
+		return -EFAULT;
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (policy->version != 0)
+	if (policy.version != 0)
 		return -EINVAL;
 
 	ret = mnt_want_write_file(filp);
@@ -123,9 +120,9 @@
 			ret = -ENOTEMPTY;
 		else
 			ret = create_encryption_context_from_policy(inode,
-								policy);
+								&policy);
 	} else if (!is_encryption_context_consistent_with_policy(inode,
-								policy)) {
+								&policy)) {
 		printk(KERN_WARNING
 		       "%s: Policy inconsistent with encryption context\n",
 		       __func__);
@@ -137,11 +134,13 @@
 	mnt_drop_write_file(filp);
 	return ret;
 }
-EXPORT_SYMBOL(fscrypt_process_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
 
-int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
 {
+	struct inode *inode = file_inode(filp);
 	struct fscrypt_context ctx;
+	struct fscrypt_policy policy;
 	int res;
 
 	if (!inode->i_sb->s_cop->get_context ||
@@ -156,15 +151,18 @@
 	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
 		return -EINVAL;
 
-	policy->version = 0;
-	policy->contents_encryption_mode = ctx.contents_encryption_mode;
-	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy->flags = ctx.flags;
-	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+	policy.version = 0;
+	policy.contents_encryption_mode = ctx.contents_encryption_mode;
+	policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+	policy.flags = ctx.flags;
+	memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
 				FS_KEY_DESCRIPTOR_SIZE);
+
+	if (copy_to_user(arg, &policy, sizeof(policy)))
+		return -EFAULT;
 	return 0;
 }
-EXPORT_SYMBOL(fscrypt_get_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
 
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 {
+508 -724
fs/dax.c
··· 34 34 #include <linux/iomap.h> 35 35 #include "internal.h" 36 36 37 - /* 38 - * We use lowest available bit in exceptional entry for locking, other two 39 - * bits to determine entry type. In total 3 special bits. 40 - */ 41 - #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) 42 - #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) 43 - #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) 44 - #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) 45 - #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) 46 - #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) 47 - #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ 48 - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ 49 - RADIX_TREE_EXCEPTIONAL_ENTRY)) 50 - 51 37 /* We choose 4096 entries - same as per-zone page wait tables */ 52 38 #define DAX_WAIT_TABLE_BITS 12 53 39 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 54 40 55 - wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 41 + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 56 42 57 43 static int __init init_dax_wait_table(void) 58 44 { ··· 49 63 return 0; 50 64 } 51 65 fs_initcall(init_dax_wait_table); 52 - 53 - static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 54 - pgoff_t index) 55 - { 56 - unsigned long hash = hash_long((unsigned long)mapping ^ index, 57 - DAX_WAIT_TABLE_BITS); 58 - return wait_table + hash; 59 - } 60 66 61 67 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) 62 68 { ··· 76 98 blk_queue_exit(bdev->bd_queue); 77 99 } 78 100 101 + static int dax_is_pmd_entry(void *entry) 102 + { 103 + return (unsigned long)entry & RADIX_DAX_PMD; 104 + } 105 + 106 + static int dax_is_pte_entry(void *entry) 107 + { 108 + return !((unsigned long)entry & RADIX_DAX_PMD); 109 + } 110 + 111 + static int dax_is_zero_entry(void *entry) 112 + { 113 + return (unsigned 
long)entry & RADIX_DAX_HZP; 114 + } 115 + 116 + static int dax_is_empty_entry(void *entry) 117 + { 118 + return (unsigned long)entry & RADIX_DAX_EMPTY; 119 + } 120 + 79 121 struct page *read_dax_sector(struct block_device *bdev, sector_t n) 80 122 { 81 123 struct page *page = alloc_pages(GFP_KERNEL, 0); ··· 116 118 return page; 117 119 } 118 120 119 - static bool buffer_written(struct buffer_head *bh) 120 - { 121 - return buffer_mapped(bh) && !buffer_unwritten(bh); 122 - } 123 - 124 - /* 125 - * When ext4 encounters a hole, it returns without modifying the buffer_head 126 - * which means that we can't trust b_size. To cope with this, we set b_state 127 - * to 0 before calling get_block and, if any bit is set, we know we can trust 128 - * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is 129 - * and would save us time calling get_block repeatedly. 130 - */ 131 - static bool buffer_size_valid(struct buffer_head *bh) 132 - { 133 - return bh->b_state != 0; 134 - } 135 - 136 - 137 - static sector_t to_sector(const struct buffer_head *bh, 138 - const struct inode *inode) 139 - { 140 - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); 141 - 142 - return sector; 143 - } 144 - 145 - static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, 146 - loff_t start, loff_t end, get_block_t get_block, 147 - struct buffer_head *bh) 148 - { 149 - loff_t pos = start, max = start, bh_max = start; 150 - bool hole = false; 151 - struct block_device *bdev = NULL; 152 - int rw = iov_iter_rw(iter), rc; 153 - long map_len = 0; 154 - struct blk_dax_ctl dax = { 155 - .addr = ERR_PTR(-EIO), 156 - }; 157 - unsigned blkbits = inode->i_blkbits; 158 - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) 159 - >> blkbits; 160 - 161 - if (rw == READ) 162 - end = min(end, i_size_read(inode)); 163 - 164 - while (pos < end) { 165 - size_t len; 166 - if (pos == max) { 167 - long page = pos >> PAGE_SHIFT; 168 - sector_t block = page << (PAGE_SHIFT - 
blkbits); 169 - unsigned first = pos - (block << blkbits); 170 - long size; 171 - 172 - if (pos == bh_max) { 173 - bh->b_size = PAGE_ALIGN(end - pos); 174 - bh->b_state = 0; 175 - rc = get_block(inode, block, bh, rw == WRITE); 176 - if (rc) 177 - break; 178 - if (!buffer_size_valid(bh)) 179 - bh->b_size = 1 << blkbits; 180 - bh_max = pos - first + bh->b_size; 181 - bdev = bh->b_bdev; 182 - /* 183 - * We allow uninitialized buffers for writes 184 - * beyond EOF as those cannot race with faults 185 - */ 186 - WARN_ON_ONCE( 187 - (buffer_new(bh) && block < file_blks) || 188 - (rw == WRITE && buffer_unwritten(bh))); 189 - } else { 190 - unsigned done = bh->b_size - 191 - (bh_max - (pos - first)); 192 - bh->b_blocknr += done >> blkbits; 193 - bh->b_size -= done; 194 - } 195 - 196 - hole = rw == READ && !buffer_written(bh); 197 - if (hole) { 198 - size = bh->b_size - first; 199 - } else { 200 - dax_unmap_atomic(bdev, &dax); 201 - dax.sector = to_sector(bh, inode); 202 - dax.size = bh->b_size; 203 - map_len = dax_map_atomic(bdev, &dax); 204 - if (map_len < 0) { 205 - rc = map_len; 206 - break; 207 - } 208 - dax.addr += first; 209 - size = map_len - first; 210 - } 211 - /* 212 - * pos + size is one past the last offset for IO, 213 - * so pos + size can overflow loff_t at extreme offsets. 214 - * Cast to u64 to catch this and get the true minimum. 215 - */ 216 - max = min_t(u64, pos + size, end); 217 - } 218 - 219 - if (iov_iter_rw(iter) == WRITE) { 220 - len = copy_from_iter_pmem(dax.addr, max - pos, iter); 221 - } else if (!hole) 222 - len = copy_to_iter((void __force *) dax.addr, max - pos, 223 - iter); 224 - else 225 - len = iov_iter_zero(max - pos, iter); 226 - 227 - if (!len) { 228 - rc = -EFAULT; 229 - break; 230 - } 231 - 232 - pos += len; 233 - if (!IS_ERR(dax.addr)) 234 - dax.addr += len; 235 - } 236 - 237 - dax_unmap_atomic(bdev, &dax); 238 - 239 - return (pos == start) ? 
rc : pos - start; 240 - } 241 - 242 - /** 243 - * dax_do_io - Perform I/O to a DAX file 244 - * @iocb: The control block for this I/O 245 - * @inode: The file which the I/O is directed at 246 - * @iter: The addresses to do I/O from or to 247 - * @get_block: The filesystem method used to translate file offsets to blocks 248 - * @end_io: A filesystem callback for I/O completion 249 - * @flags: See below 250 - * 251 - * This function uses the same locking scheme as do_blockdev_direct_IO: 252 - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the 253 - * caller for writes. For reads, we take and release the i_mutex ourselves. 254 - * If DIO_LOCKING is not set, the filesystem takes care of its own locking. 255 - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O 256 - * is in progress. 257 - */ 258 - ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, 259 - struct iov_iter *iter, get_block_t get_block, 260 - dio_iodone_t end_io, int flags) 261 - { 262 - struct buffer_head bh; 263 - ssize_t retval = -EINVAL; 264 - loff_t pos = iocb->ki_pos; 265 - loff_t end = pos + iov_iter_count(iter); 266 - 267 - memset(&bh, 0, sizeof(bh)); 268 - bh.b_bdev = inode->i_sb->s_bdev; 269 - 270 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) 271 - inode_lock(inode); 272 - 273 - /* Protects against truncate */ 274 - if (!(flags & DIO_SKIP_DIO_COUNT)) 275 - inode_dio_begin(inode); 276 - 277 - retval = dax_io(inode, iter, pos, end, get_block, &bh); 278 - 279 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) 280 - inode_unlock(inode); 281 - 282 - if (end_io) { 283 - int err; 284 - 285 - err = end_io(iocb, pos, retval, bh.b_private); 286 - if (err) 287 - retval = err; 288 - } 289 - 290 - if (!(flags & DIO_SKIP_DIO_COUNT)) 291 - inode_dio_end(inode); 292 - return retval; 293 - } 294 - EXPORT_SYMBOL_GPL(dax_do_io); 295 - 296 121 /* 297 122 * DAX radix tree locking 298 123 */ 299 124 struct exceptional_entry_key { 300 125 struct 
address_space *mapping; 301 - unsigned long index; 126 + pgoff_t entry_start; 302 127 }; 303 128 304 129 struct wait_exceptional_entry_queue { 305 130 wait_queue_t wait; 306 131 struct exceptional_entry_key key; 307 132 }; 133 + 134 + static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 135 + pgoff_t index, void *entry, struct exceptional_entry_key *key) 136 + { 137 + unsigned long hash; 138 + 139 + /* 140 + * If 'entry' is a PMD, align the 'index' that we use for the wait 141 + * queue to the start of that PMD. This ensures that all offsets in 142 + * the range covered by the PMD map to the same bit lock. 143 + */ 144 + if (dax_is_pmd_entry(entry)) 145 + index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 146 + 147 + key->mapping = mapping; 148 + key->entry_start = index; 149 + 150 + hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); 151 + return wait_table + hash; 152 + } 308 153 309 154 static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, 310 155 int sync, void *keyp) ··· 157 316 container_of(wait, struct wait_exceptional_entry_queue, wait); 158 317 159 318 if (key->mapping != ewait->key.mapping || 160 - key->index != ewait->key.index) 319 + key->entry_start != ewait->key.entry_start) 161 320 return 0; 162 321 return autoremove_wake_function(wait, mode, sync, NULL); 163 322 } ··· 213 372 static void *get_unlocked_mapping_entry(struct address_space *mapping, 214 373 pgoff_t index, void ***slotp) 215 374 { 216 - void *ret, **slot; 375 + void *entry, **slot; 217 376 struct wait_exceptional_entry_queue ewait; 218 - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); 377 + wait_queue_head_t *wq; 219 378 220 379 init_wait(&ewait.wait); 221 380 ewait.wait.func = wake_exceptional_entry_func; 222 - ewait.key.mapping = mapping; 223 - ewait.key.index = index; 224 381 225 382 for (;;) { 226 - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, 383 + entry = 
__radix_tree_lookup(&mapping->page_tree, index, NULL, 227 384 &slot); 228 - if (!ret || !radix_tree_exceptional_entry(ret) || 385 + if (!entry || !radix_tree_exceptional_entry(entry) || 229 386 !slot_locked(mapping, slot)) { 230 387 if (slotp) 231 388 *slotp = slot; 232 - return ret; 389 + return entry; 233 390 } 391 + 392 + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); 234 393 prepare_to_wait_exclusive(wq, &ewait.wait, 235 394 TASK_UNINTERRUPTIBLE); 236 395 spin_unlock_irq(&mapping->tree_lock); ··· 238 397 finish_wait(wq, &ewait.wait); 239 398 spin_lock_irq(&mapping->tree_lock); 240 399 } 241 - } 242 - 243 - /* 244 - * Find radix tree entry at given index. If it points to a page, return with 245 - * the page locked. If it points to the exceptional entry, return with the 246 - * radix tree entry locked. If the radix tree doesn't contain given index, 247 - * create empty exceptional entry for the index and return with it locked. 248 - * 249 - * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 250 - * persistent memory the benefit is doubtful. We can add that later if we can 251 - * show it helps. 252 - */ 253 - static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) 254 - { 255 - void *ret, **slot; 256 - 257 - restart: 258 - spin_lock_irq(&mapping->tree_lock); 259 - ret = get_unlocked_mapping_entry(mapping, index, &slot); 260 - /* No entry for given index? Make sure radix tree is big enough. 
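The `dax_entry_waitqueue()` helper added above picks a wait queue bucket by hashing the mapping pointer XORed with the (possibly PMD-aligned) index. A minimal userspace sketch of that bucket choice, assuming x86-64 shift values and approximating the kernel's `hash_long()` with a multiplicative hash (the table size here is illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21                 /* x86-64 values, assumed */
#define DAX_WAIT_TABLE_BITS 6         /* illustrative table size */

/* Model of dax_entry_waitqueue()'s bucket choice: for a PMD entry the
 * index is aligned down to the PMD start, so every offset inside the
 * 2MiB range hashes to the same wait queue.  hash_long() is
 * approximated with a 64-bit multiplicative hash. */
static unsigned long wait_bucket(uintptr_t mapping, unsigned long index,
				 bool is_pmd)
{
	if (is_pmd)
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
	return (((unsigned long)mapping ^ index) * 0x9e3779b97f4a7c15UL)
			>> (64 - DAX_WAIT_TABLE_BITS);
}
```

The alignment step is what makes the PMD range behave as one lock: a waiter on offset 700 and a waiter on offset 512 of the same 2MiB region land on the same queue.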
*/ 261 - if (!ret) { 262 - int err; 263 - 264 - spin_unlock_irq(&mapping->tree_lock); 265 - err = radix_tree_preload( 266 - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 267 - if (err) 268 - return ERR_PTR(err); 269 - ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 270 - RADIX_DAX_ENTRY_LOCK); 271 - spin_lock_irq(&mapping->tree_lock); 272 - err = radix_tree_insert(&mapping->page_tree, index, ret); 273 - radix_tree_preload_end(); 274 - if (err) { 275 - spin_unlock_irq(&mapping->tree_lock); 276 - /* Someone already created the entry? */ 277 - if (err == -EEXIST) 278 - goto restart; 279 - return ERR_PTR(err); 280 - } 281 - /* Good, we have inserted empty locked entry into the tree. */ 282 - mapping->nrexceptional++; 283 - spin_unlock_irq(&mapping->tree_lock); 284 - return ret; 285 - } 286 - /* Normal page in radix tree? */ 287 - if (!radix_tree_exceptional_entry(ret)) { 288 - struct page *page = ret; 289 - 290 - get_page(page); 291 - spin_unlock_irq(&mapping->tree_lock); 292 - lock_page(page); 293 - /* Page got truncated? Retry... */ 294 - if (unlikely(page->mapping != mapping)) { 295 - unlock_page(page); 296 - put_page(page); 297 - goto restart; 298 - } 299 - return page; 300 - } 301 - ret = lock_slot(mapping, slot); 302 - spin_unlock_irq(&mapping->tree_lock); 303 - return ret; 304 - } 305 - 306 - void dax_wake_mapping_entry_waiter(struct address_space *mapping, 307 - pgoff_t index, bool wake_all) 308 - { 309 - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); 310 - 311 - /* 312 - * Checking for locked entry and prepare_to_wait_exclusive() happens 313 - * under mapping->tree_lock, ditto for entry handling in our callers. 314 - * So at this point all tasks that could have seen our entry locked 315 - * must be in the waitqueue and the following check will see them. 316 - */ 317 - if (waitqueue_active(wq)) { 318 - struct exceptional_entry_key key; 319 - 320 - key.mapping = mapping; 321 - key.index = index; 322 - __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); 323 - } 324 - } 325 - 326 - void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) 327 - { 328 - void *ret, **slot; 329 - 330 - spin_lock_irq(&mapping->tree_lock); 331 - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 332 - if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || 333 - !slot_locked(mapping, slot))) { 334 - spin_unlock_irq(&mapping->tree_lock); 335 - return; 336 - } 337 - unlock_slot(mapping, slot); 338 - spin_unlock_irq(&mapping->tree_lock); 339 - dax_wake_mapping_entry_waiter(mapping, index, false); 340 400 } 341 401 342 402 static void put_locked_mapping_entry(struct address_space *mapping, ··· 262 520 return; 263 521 264 522 /* We have to wake up next waiter for the radix tree entry lock */ 265 - dax_wake_mapping_entry_waiter(mapping, index, false); 523 + dax_wake_mapping_entry_waiter(mapping, index, entry, false); 524 + } 525 + 526 + /* 527 + * Find radix tree entry at given index. If it points to a page, return with 528 + * the page locked. If it points to the exceptional entry, return with the 529 + * radix tree entry locked. If the radix tree doesn't contain given index, 530 + * create empty exceptional entry for the index and return with it locked. 531 + * 532 + * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 533 + * either return that locked entry or will return an error. This error will 534 + * happen if there are any 4k entries (either zero pages or DAX entries) 535 + * within the 2MiB range that we are requesting. 536 + * 537 + * We always favor 4k entries over 2MiB entries. There isn't a flow where we 538 + * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 539 + * insertion will fail if it finds any 4k entries already in the tree, and a 540 + * 4k insertion will cause an existing 2MiB entry to be unmapped and 541 + * downgraded to 4k entries. This happens for both 2MiB huge zero pages as 542 + * well as 2MiB empty entries. 
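The comment block above describes a small decision table: a PMD-sized request fails with -EEXIST when any 4k entry is present, while a PTE-sized request downgrades an existing zero-page or empty PMD entry. A sketch of that policy as a pure function (the enum names are invented for illustration; the kernel works directly on radix tree entry bits):

```c
#include <assert.h>
#include <stdbool.h>

enum entry_kind { NO_ENTRY, PTE_ENTRY, PMD_ZERO, PMD_EMPTY, PMD_MAPPED };
enum grab_action { INSERT_NEW, REUSE, DOWNGRADE, FAIL_EEXIST };

/* want_pmd: the caller passed RADIX_DAX_PMD as size_flag. */
static enum grab_action grab_policy(enum entry_kind have, bool want_pmd)
{
	if (have == NO_ENTRY)
		return INSERT_NEW;
	if (want_pmd) {
		/* any 4k entry in the range blocks a 2MiB insertion */
		if (have == PTE_ENTRY)
			return FAIL_EEXIST;
		return REUSE;		/* an existing PMD entry is fine */
	}
	/* PTE request: zero-page and empty PMDs are split, real
	 * (block-backed) PMD entries are kept and simply dirtied. */
	if (have == PMD_ZERO || have == PMD_EMPTY)
		return DOWNGRADE;
	return REUSE;
}
```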
543 + * 544 + * The exception to this downgrade path is for 2MiB DAX PMD entries that have 545 + * real storage backing them. We will leave these real 2MiB DAX entries in 546 + * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. 547 + * 548 + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 549 + * persistent memory the benefit is doubtful. We can add that later if we can 550 + * show it helps. 551 + */ 552 + static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, 553 + unsigned long size_flag) 554 + { 555 + bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ 556 + void *entry, **slot; 557 + 558 + restart: 559 + spin_lock_irq(&mapping->tree_lock); 560 + entry = get_unlocked_mapping_entry(mapping, index, &slot); 561 + 562 + if (entry) { 563 + if (size_flag & RADIX_DAX_PMD) { 564 + if (!radix_tree_exceptional_entry(entry) || 565 + dax_is_pte_entry(entry)) { 566 + put_unlocked_mapping_entry(mapping, index, 567 + entry); 568 + entry = ERR_PTR(-EEXIST); 569 + goto out_unlock; 570 + } 571 + } else { /* trying to grab a PTE entry */ 572 + if (radix_tree_exceptional_entry(entry) && 573 + dax_is_pmd_entry(entry) && 574 + (dax_is_zero_entry(entry) || 575 + dax_is_empty_entry(entry))) { 576 + pmd_downgrade = true; 577 + } 578 + } 579 + } 580 + 581 + /* No entry for given index? Make sure radix tree is big enough. */ 582 + if (!entry || pmd_downgrade) { 583 + int err; 584 + 585 + if (pmd_downgrade) { 586 + /* 587 + * Make sure 'entry' remains valid while we drop 588 + * mapping->tree_lock. 589 + */ 590 + entry = lock_slot(mapping, slot); 591 + } 592 + 593 + spin_unlock_irq(&mapping->tree_lock); 594 + /* 595 + * Besides huge zero pages the only other thing that gets 596 + * downgraded are empty entries which don't need to be 597 + * unmapped. 
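The downgrade path above unmaps the whole 2MiB range with `unmap_mapping_range(mapping, (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0)`. The start-of-range arithmetic can be checked in isolation (x86-64 shift values assumed):

```c
#include <assert.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21                      /* x86-64 values, assumed */
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PMD_MASK   (~(PMD_SIZE - 1))

/* Byte offset of the 2MiB region covering page 'index', as passed to
 * unmap_mapping_range() in the PMD downgrade path. */
static unsigned long pmd_region_start(unsigned long index)
{
	return (index << PAGE_SHIFT) & PMD_MASK;
}
```

With 4k pages a PMD spans 512 page indices, so indices 0..511 map to region 0 and index 512 starts the next region.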
598 + */ 599 + if (pmd_downgrade && dax_is_zero_entry(entry)) 600 + unmap_mapping_range(mapping, 601 + (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 602 + 603 + err = radix_tree_preload( 604 + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 605 + if (err) { 606 + if (pmd_downgrade) 607 + put_locked_mapping_entry(mapping, index, entry); 608 + return ERR_PTR(err); 609 + } 610 + spin_lock_irq(&mapping->tree_lock); 611 + 612 + if (pmd_downgrade) { 613 + radix_tree_delete(&mapping->page_tree, index); 614 + mapping->nrexceptional--; 615 + dax_wake_mapping_entry_waiter(mapping, index, entry, 616 + true); 617 + } 618 + 619 + entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); 620 + 621 + err = __radix_tree_insert(&mapping->page_tree, index, 622 + dax_radix_order(entry), entry); 623 + radix_tree_preload_end(); 624 + if (err) { 625 + spin_unlock_irq(&mapping->tree_lock); 626 + /* 627 + * Someone already created the entry? This is a 628 + * normal failure when inserting PMDs in a range 629 + * that already contains PTEs. In that case we want 630 + * to return -EEXIST immediately. 631 + */ 632 + if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) 633 + goto restart; 634 + /* 635 + * Our insertion of a DAX PMD entry failed, most 636 + * likely because it collided with a PTE sized entry 637 + * at a different index in the PMD range. We haven't 638 + * inserted anything into the radix tree and have no 639 + * waiters to wake. 640 + */ 641 + return ERR_PTR(err); 642 + } 643 + /* Good, we have inserted empty locked entry into the tree. */ 644 + mapping->nrexceptional++; 645 + spin_unlock_irq(&mapping->tree_lock); 646 + return entry; 647 + } 648 + /* Normal page in radix tree? */ 649 + if (!radix_tree_exceptional_entry(entry)) { 650 + struct page *page = entry; 651 + 652 + get_page(page); 653 + spin_unlock_irq(&mapping->tree_lock); 654 + lock_page(page); 655 + /* Page got truncated? Retry... 
*/ 656 + if (unlikely(page->mapping != mapping)) { 657 + unlock_page(page); 658 + put_page(page); 659 + goto restart; 660 + } 661 + return page; 662 + } 663 + entry = lock_slot(mapping, slot); 664 + out_unlock: 665 + spin_unlock_irq(&mapping->tree_lock); 666 + return entry; 667 + } 668 + 669 + /* 670 + * We do not necessarily hold the mapping->tree_lock when we call this 671 + * function so it is possible that 'entry' is no longer a valid item in the 672 + * radix tree. This is okay because all we really need to do is to find the 673 + * correct waitqueue where tasks might be waiting for that old 'entry' and 674 + * wake them. 675 + */ 676 + void dax_wake_mapping_entry_waiter(struct address_space *mapping, 677 + pgoff_t index, void *entry, bool wake_all) 678 + { 679 + struct exceptional_entry_key key; 680 + wait_queue_head_t *wq; 681 + 682 + wq = dax_entry_waitqueue(mapping, index, entry, &key); 683 + 684 + /* 685 + * Checking for locked entry and prepare_to_wait_exclusive() happens 686 + * under mapping->tree_lock, ditto for entry handling in our callers. 687 + * So at this point all tasks that could have seen our entry locked 688 + * must be in the waitqueue and the following check will see them. 689 + */ 690 + if (waitqueue_active(wq)) 691 + __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); 692 + } 693 + 694 + void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) 695 + { 696 + void *entry, **slot; 697 + 698 + spin_lock_irq(&mapping->tree_lock); 699 + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 700 + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || 701 + !slot_locked(mapping, slot))) { 702 + spin_unlock_irq(&mapping->tree_lock); 703 + return; 704 + } 705 + unlock_slot(mapping, slot); 706 + spin_unlock_irq(&mapping->tree_lock); 707 + dax_wake_mapping_entry_waiter(mapping, index, entry, false); 266 708 } 267 709 268 710 /* ··· 473 547 radix_tree_delete(&mapping->page_tree, index); 474 548 mapping->nrexceptional--; 475 549 spin_unlock_irq(&mapping->tree_lock); 476 - dax_wake_mapping_entry_waiter(mapping, index, true); 550 + dax_wake_mapping_entry_waiter(mapping, index, entry, true); 477 551 478 552 return 1; 479 553 } ··· 526 600 return 0; 527 601 } 528 602 529 - #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) 530 - 603 + /* 604 + * By this point grab_mapping_entry() has ensured that we have a locked entry 605 + * of the appropriate size so we don't have to worry about downgrading PMDs to 606 + * PTEs. If we happen to be trying to insert a PTE and there is a PMD 607 + * already in the tree, we will skip the insertion and just dirty the PMD as 608 + * appropriate. 
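`dax_unlock_mapping_entry()` above works by clearing a lock bit carried in the radix tree slot word itself, not by taking a separate lock object. A hedged userspace model of that idea (the `ENTRY_LOCK` low bit is hypothetical; the kernel uses `RADIX_DAX_ENTRY_LOCK` inside the entry value, manipulated under `mapping->tree_lock`):

```c
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>

/* Hypothetical model: the low bit of the slot word is the lock bit. */
#define ENTRY_LOCK 1UL

static bool slot_locked(uintptr_t slot)
{
	return slot & ENTRY_LOCK;
}

static uintptr_t lock_slot(uintptr_t slot)
{
	return slot | ENTRY_LOCK;	/* caller holds the tree lock */
}

static uintptr_t unlock_slot(uintptr_t slot)
{
	return slot & ~ENTRY_LOCK;	/* waiters are woken separately */
}
```

Because the lock lives in the entry, unlocking must be followed by an explicit wake of the entry's wait queue, which is exactly what `dax_wake_mapping_entry_waiter()` does.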
609 + */ 531 610 static void *dax_insert_mapping_entry(struct address_space *mapping, 532 611 struct vm_fault *vmf, 533 - void *entry, sector_t sector) 612 + void *entry, sector_t sector, 613 + unsigned long flags) 534 614 { 535 615 struct radix_tree_root *page_tree = &mapping->page_tree; 536 616 int error = 0; ··· 559 627 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); 560 628 if (error) 561 629 return ERR_PTR(error); 630 + } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { 631 + /* replacing huge zero page with PMD block mapping */ 632 + unmap_mapping_range(mapping, 633 + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 562 634 } 563 635 564 636 spin_lock_irq(&mapping->tree_lock); 565 - new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | 566 - RADIX_DAX_ENTRY_LOCK); 637 + new_entry = dax_radix_locked_entry(sector, flags); 638 + 567 639 if (hole_fill) { 568 640 __delete_from_page_cache(entry, NULL); 569 641 /* Drop pagecache reference */ 570 642 put_page(entry); 571 - error = radix_tree_insert(page_tree, index, new_entry); 643 + error = __radix_tree_insert(page_tree, index, 644 + dax_radix_order(new_entry), new_entry); 572 645 if (error) { 573 646 new_entry = ERR_PTR(error); 574 647 goto unlock; 575 648 } 576 649 mapping->nrexceptional++; 577 - } else { 650 + } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 651 + /* 652 + * Only swap our new entry into the radix tree if the current 653 + * entry is a zero page or an empty entry. If a normal PTE or 654 + * PMD entry is already in the tree, we leave it alone. This 655 + * means that if we are trying to insert a PTE and the 656 + * existing entry is a PMD, we will just leave the PMD in the 657 + * tree and dirty it if necessary. 
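`dax_insert_mapping_entry()` now builds its entry with `dax_radix_locked_entry(sector, flags)`, packing the sector and the size/zero-page flags into one word. A sketch of that packing with invented bit positions (the kernel's actual layout also includes the radix tree exceptional-entry bit and differs from this):

```c
#include <assert.h>

/* Hypothetical packing modeled on dax_radix_locked_entry(): low bits
 * carry lock and type flags, the sector lives in the high bits. */
#define DAX_LOCK   (1UL << 0)
#define DAX_PMD_F  (1UL << 1)
#define DAX_HZP_F  (1UL << 2)
#define DAX_SHIFT  4

static unsigned long dax_locked_entry(unsigned long sector,
				      unsigned long flags)
{
	return (sector << DAX_SHIFT) | flags | DAX_LOCK;
}

static unsigned long dax_entry_sector(unsigned long entry)
{
	return entry >> DAX_SHIFT;	/* shift out the flag bits */
}
```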
658 + */ 578 659 struct radix_tree_node *node; 579 660 void **slot; 580 661 void *ret; ··· 619 674 struct address_space *mapping, pgoff_t index, void *entry) 620 675 { 621 676 struct radix_tree_root *page_tree = &mapping->page_tree; 622 - int type = RADIX_DAX_TYPE(entry); 623 677 struct radix_tree_node *node; 624 678 struct blk_dax_ctl dax; 625 679 void **slot; ··· 639 695 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 640 696 goto unlock; 641 697 642 - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { 698 + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 699 + dax_is_zero_entry(entry))) { 643 700 ret = -EIO; 644 701 goto unlock; 645 702 } 646 703 647 - dax.sector = RADIX_DAX_SECTOR(entry); 648 - dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); 704 + /* 705 + * Even if dax_writeback_mapping_range() was given a wbc->range_start 706 + * in the middle of a PMD, the 'index' we are given will be aligned to 707 + * the start index of the PMD, as will the sector we pull from 708 + * 'entry'. This allows us to flush for PMD_SIZE and not have to 709 + * worry about partial PMD writebacks. 
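The writeback change above replaces the old fixed `PAGE_SIZE`/`PMD_SIZE` choice with `dax.size = PAGE_SIZE << dax_radix_order(entry)`, so the flush size falls directly out of the entry's order. A trivial check of that arithmetic, assuming 4k pages and order 9 for a PMD:

```c
#include <assert.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_ORDER  9			/* 2MiB / 4KiB pages, assumed */

/* Flush size derived from the entry's order, as in the writeback
 * path: a PTE entry (order 0) flushes one page, a PMD entry flushes
 * the whole 2MiB region in one call. */
static unsigned long flush_size(unsigned int order)
{
	return PAGE_SIZE << order;
}
```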
710 + */ 711 + dax.sector = dax_radix_sector(entry); 712 + dax.size = PAGE_SIZE << dax_radix_order(entry); 649 713 spin_unlock_irq(&mapping->tree_lock); 650 714 651 715 /* ··· 692 740 struct block_device *bdev, struct writeback_control *wbc) 693 741 { 694 742 struct inode *inode = mapping->host; 695 - pgoff_t start_index, end_index, pmd_index; 743 + pgoff_t start_index, end_index; 696 744 pgoff_t indices[PAGEVEC_SIZE]; 697 745 struct pagevec pvec; 698 746 bool done = false; 699 747 int i, ret = 0; 700 - void *entry; 701 748 702 749 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 703 750 return -EIO; ··· 706 755 707 756 start_index = wbc->range_start >> PAGE_SHIFT; 708 757 end_index = wbc->range_end >> PAGE_SHIFT; 709 - pmd_index = DAX_PMD_INDEX(start_index); 710 - 711 - rcu_read_lock(); 712 - entry = radix_tree_lookup(&mapping->page_tree, pmd_index); 713 - rcu_read_unlock(); 714 - 715 - /* see if the start of our range is covered by a PMD entry */ 716 - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) 717 - start_index = pmd_index; 718 758 719 759 tag_pages_for_writeback(mapping, start_index, end_index); 720 760 ··· 750 808 return PTR_ERR(dax.addr); 751 809 dax_unmap_atomic(bdev, &dax); 752 810 753 - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); 811 + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); 754 812 if (IS_ERR(ret)) 755 813 return PTR_ERR(ret); 756 814 *entryp = ret; 757 815 758 816 return vm_insert_mixed(vma, vaddr, dax.pfn); 759 817 } 760 - 761 - /** 762 - * dax_fault - handle a page fault on a DAX file 763 - * @vma: The virtual memory area where the fault occurred 764 - * @vmf: The description of the fault 765 - * @get_block: The filesystem method used to translate file offsets to blocks 766 - * 767 - * When a page fault occurs, filesystems may call this helper in their 768 - * fault handler for DAX files. 
dax_fault() assumes the caller has done all 769 - * the necessary locking for the page fault to proceed successfully. 770 - */ 771 - int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 772 - get_block_t get_block) 773 - { 774 - struct file *file = vma->vm_file; 775 - struct address_space *mapping = file->f_mapping; 776 - struct inode *inode = mapping->host; 777 - void *entry; 778 - struct buffer_head bh; 779 - unsigned long vaddr = (unsigned long)vmf->virtual_address; 780 - unsigned blkbits = inode->i_blkbits; 781 - sector_t block; 782 - pgoff_t size; 783 - int error; 784 - int major = 0; 785 - 786 - /* 787 - * Check whether offset isn't beyond end of file now. Caller is supposed 788 - * to hold locks serializing us with truncate / punch hole so this is 789 - * a reliable test. 790 - */ 791 - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 792 - if (vmf->pgoff >= size) 793 - return VM_FAULT_SIGBUS; 794 - 795 - memset(&bh, 0, sizeof(bh)); 796 - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); 797 - bh.b_bdev = inode->i_sb->s_bdev; 798 - bh.b_size = PAGE_SIZE; 799 - 800 - entry = grab_mapping_entry(mapping, vmf->pgoff); 801 - if (IS_ERR(entry)) { 802 - error = PTR_ERR(entry); 803 - goto out; 804 - } 805 - 806 - error = get_block(inode, block, &bh, 0); 807 - if (!error && (bh.b_size < PAGE_SIZE)) 808 - error = -EIO; /* fs corruption? 
*/ 809 - if (error) 810 - goto unlock_entry; 811 - 812 - if (vmf->cow_page) { 813 - struct page *new_page = vmf->cow_page; 814 - if (buffer_written(&bh)) 815 - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), 816 - bh.b_size, new_page, vaddr); 817 - else 818 - clear_user_highpage(new_page, vaddr); 819 - if (error) 820 - goto unlock_entry; 821 - if (!radix_tree_exceptional_entry(entry)) { 822 - vmf->page = entry; 823 - return VM_FAULT_LOCKED; 824 - } 825 - vmf->entry = entry; 826 - return VM_FAULT_DAX_LOCKED; 827 - } 828 - 829 - if (!buffer_mapped(&bh)) { 830 - if (vmf->flags & FAULT_FLAG_WRITE) { 831 - error = get_block(inode, block, &bh, 1); 832 - count_vm_event(PGMAJFAULT); 833 - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 834 - major = VM_FAULT_MAJOR; 835 - if (!error && (bh.b_size < PAGE_SIZE)) 836 - error = -EIO; 837 - if (error) 838 - goto unlock_entry; 839 - } else { 840 - return dax_load_hole(mapping, entry, vmf); 841 - } 842 - } 843 - 844 - /* Filesystem should not return unwritten buffers to us! */ 845 - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); 846 - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), 847 - bh.b_size, &entry, vma, vmf); 848 - unlock_entry: 849 - put_locked_mapping_entry(mapping, vmf->pgoff, entry); 850 - out: 851 - if (error == -ENOMEM) 852 - return VM_FAULT_OOM | major; 853 - /* -EBUSY is fine, somebody else faulted on the same PTE */ 854 - if ((error < 0) && (error != -EBUSY)) 855 - return VM_FAULT_SIGBUS | major; 856 - return VM_FAULT_NOPAGE | major; 857 - } 858 - EXPORT_SYMBOL_GPL(dax_fault); 859 - 860 - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) 861 - /* 862 - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up 863 - * more often than one might expect in the below function. 
864 - */ 865 - #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 866 - 867 - static void __dax_dbg(struct buffer_head *bh, unsigned long address, 868 - const char *reason, const char *fn) 869 - { 870 - if (bh) { 871 - char bname[BDEVNAME_SIZE]; 872 - bdevname(bh->b_bdev, bname); 873 - pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " 874 - "length %zd fallback: %s\n", fn, current->comm, 875 - address, bname, bh->b_state, (u64)bh->b_blocknr, 876 - bh->b_size, reason); 877 - } else { 878 - pr_debug("%s: %s addr: %lx fallback: %s\n", fn, 879 - current->comm, address, reason); 880 - } 881 - } 882 - 883 - #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") 884 - 885 - /** 886 - * dax_pmd_fault - handle a PMD fault on a DAX file 887 - * @vma: The virtual memory area where the fault occurred 888 - * @vmf: The description of the fault 889 - * @get_block: The filesystem method used to translate file offsets to blocks 890 - * 891 - * When a page fault occurs, filesystems may call this helper in their 892 - * pmd_fault handler for DAX files. 
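The removed `dax_pmd_fault()` opens with a series of fallback checks: the PMD-aligned address must lie fully inside the VMA, and the faulting offset must not make the huge page extend past end-of-file. Those conditions can be modeled as one predicate (x86-64 constants assumed; sizes in pages):

```c
#include <assert.h>
#include <stdbool.h>

#define PAGE_SHIFT    12
#define PMD_SHIFT     21		   /* x86-64 values, assumed */
#define PMD_SIZE      (1UL << PMD_SHIFT)
#define PMD_MASK      (~(PMD_SIZE - 1))
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)

/* The fallback tests from the PMD fault path: the aligned 2MiB range
 * must lie inside [vm_start, vm_end) and the huge page covering
 * 'pgoff' must not run past the file size (in pages). */
static bool pmd_fault_ok(unsigned long address, unsigned long vm_start,
			 unsigned long vm_end, unsigned long pgoff,
			 unsigned long size_pages)
{
	unsigned long pmd_addr = address & PMD_MASK;

	if (pmd_addr < vm_start)
		return false;		/* vma start unaligned */
	if (pmd_addr + PMD_SIZE > vm_end)
		return false;		/* vma end unaligned */
	if ((pgoff | PG_PMD_COLOUR) >= size_pages)
		return false;		/* huge page would pass EOF */
	return true;
}
```

Any failed check means the fault falls back to PTEs rather than SIGBUS, which is why the original code returns `VM_FAULT_FALLBACK` from each branch.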
893 - */ 894 - int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 895 - pmd_t *pmd, unsigned int flags, get_block_t get_block) 896 - { 897 - struct file *file = vma->vm_file; 898 - struct address_space *mapping = file->f_mapping; 899 - struct inode *inode = mapping->host; 900 - struct buffer_head bh; 901 - unsigned blkbits = inode->i_blkbits; 902 - unsigned long pmd_addr = address & PMD_MASK; 903 - bool write = flags & FAULT_FLAG_WRITE; 904 - struct block_device *bdev; 905 - pgoff_t size, pgoff; 906 - sector_t block; 907 - int result = 0; 908 - bool alloc = false; 909 - 910 - /* dax pmd mappings require pfn_t_devmap() */ 911 - if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) 912 - return VM_FAULT_FALLBACK; 913 - 914 - /* Fall back to PTEs if we're going to COW */ 915 - if (write && !(vma->vm_flags & VM_SHARED)) { 916 - split_huge_pmd(vma, pmd, address); 917 - dax_pmd_dbg(NULL, address, "cow write"); 918 - return VM_FAULT_FALLBACK; 919 - } 920 - /* If the PMD would extend outside the VMA */ 921 - if (pmd_addr < vma->vm_start) { 922 - dax_pmd_dbg(NULL, address, "vma start unaligned"); 923 - return VM_FAULT_FALLBACK; 924 - } 925 - if ((pmd_addr + PMD_SIZE) > vma->vm_end) { 926 - dax_pmd_dbg(NULL, address, "vma end unaligned"); 927 - return VM_FAULT_FALLBACK; 928 - } 929 - 930 - pgoff = linear_page_index(vma, pmd_addr); 931 - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 932 - if (pgoff >= size) 933 - return VM_FAULT_SIGBUS; 934 - /* If the PMD would cover blocks out of the file */ 935 - if ((pgoff | PG_PMD_COLOUR) >= size) { 936 - dax_pmd_dbg(NULL, address, 937 - "offset + huge page size > file size"); 938 - return VM_FAULT_FALLBACK; 939 - } 940 - 941 - memset(&bh, 0, sizeof(bh)); 942 - bh.b_bdev = inode->i_sb->s_bdev; 943 - block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 944 - 945 - bh.b_size = PMD_SIZE; 946 - 947 - if (get_block(inode, block, &bh, 0) != 0) 948 - return VM_FAULT_SIGBUS; 949 - 950 - if (!buffer_mapped(&bh) && write) { 951 - if 
(get_block(inode, block, &bh, 1) != 0) 952 - return VM_FAULT_SIGBUS; 953 - alloc = true; 954 - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); 955 - } 956 - 957 - bdev = bh.b_bdev; 958 - 959 - /* 960 - * If the filesystem isn't willing to tell us the length of a hole, 961 - * just fall back to PTEs. Calling get_block 512 times in a loop 962 - * would be silly. 963 - */ 964 - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { 965 - dax_pmd_dbg(&bh, address, "allocated block too small"); 966 - return VM_FAULT_FALLBACK; 967 - } 968 - 969 - /* 970 - * If we allocated new storage, make sure no process has any 971 - * zero pages covering this hole 972 - */ 973 - if (alloc) { 974 - loff_t lstart = pgoff << PAGE_SHIFT; 975 - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ 976 - 977 - truncate_pagecache_range(inode, lstart, lend); 978 - } 979 - 980 - if (!write && !buffer_mapped(&bh)) { 981 - spinlock_t *ptl; 982 - pmd_t entry; 983 - struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); 984 - 985 - if (unlikely(!zero_page)) { 986 - dax_pmd_dbg(&bh, address, "no zero page"); 987 - goto fallback; 988 - } 989 - 990 - ptl = pmd_lock(vma->vm_mm, pmd); 991 - if (!pmd_none(*pmd)) { 992 - spin_unlock(ptl); 993 - dax_pmd_dbg(&bh, address, "pmd already present"); 994 - goto fallback; 995 - } 996 - 997 - dev_dbg(part_to_dev(bdev->bd_part), 998 - "%s: %s addr: %lx pfn: <zero> sect: %llx\n", 999 - __func__, current->comm, address, 1000 - (unsigned long long) to_sector(&bh, inode)); 1001 - 1002 - entry = mk_pmd(zero_page, vma->vm_page_prot); 1003 - entry = pmd_mkhuge(entry); 1004 - set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); 1005 - result = VM_FAULT_NOPAGE; 1006 - spin_unlock(ptl); 1007 - } else { 1008 - struct blk_dax_ctl dax = { 1009 - .sector = to_sector(&bh, inode), 1010 - .size = PMD_SIZE, 1011 - }; 1012 - long length = dax_map_atomic(bdev, &dax); 1013 - 1014 - if (length < 0) { 1015 - dax_pmd_dbg(&bh, address, "dax-error fallback"); 1016 - goto fallback; 
1017 - } 1018 - if (length < PMD_SIZE) { 1019 - dax_pmd_dbg(&bh, address, "dax-length too small"); 1020 - dax_unmap_atomic(bdev, &dax); 1021 - goto fallback; 1022 - } 1023 - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { 1024 - dax_pmd_dbg(&bh, address, "pfn unaligned"); 1025 - dax_unmap_atomic(bdev, &dax); 1026 - goto fallback; 1027 - } 1028 - 1029 - if (!pfn_t_devmap(dax.pfn)) { 1030 - dax_unmap_atomic(bdev, &dax); 1031 - dax_pmd_dbg(&bh, address, "pfn not in memmap"); 1032 - goto fallback; 1033 - } 1034 - dax_unmap_atomic(bdev, &dax); 1035 - 1036 - /* 1037 - * For PTE faults we insert a radix tree entry for reads, and 1038 - * leave it clean. Then on the first write we dirty the radix 1039 - * tree entry via the dax_pfn_mkwrite() path. This sequence 1040 - * allows the dax_pfn_mkwrite() call to be simpler and avoid a 1041 - * call into get_block() to translate the pgoff to a sector in 1042 - * order to be able to create a new radix tree entry. 1043 - * 1044 - * The PMD path doesn't have an equivalent to 1045 - * dax_pfn_mkwrite(), though, so for a read followed by a 1046 - * write we traverse all the way through dax_pmd_fault() 1047 - * twice. This means we can just skip inserting a radix tree 1048 - * entry completely on the initial read and just wait until 1049 - * the write to insert a dirty entry. 1050 - */ 1051 - if (write) { 1052 - /* 1053 - * We should insert radix-tree entry and dirty it here. 1054 - * For now this is broken... 
1055 - */ 1056 - } 1057 - 1058 - dev_dbg(part_to_dev(bdev->bd_part), 1059 - "%s: %s addr: %lx pfn: %lx sect: %llx\n", 1060 - __func__, current->comm, address, 1061 - pfn_t_to_pfn(dax.pfn), 1062 - (unsigned long long) dax.sector); 1063 - result |= vmf_insert_pfn_pmd(vma, address, pmd, 1064 - dax.pfn, write); 1065 - } 1066 - 1067 - out: 1068 - return result; 1069 - 1070 - fallback: 1071 - count_vm_event(THP_FAULT_FALLBACK); 1072 - result = VM_FAULT_FALLBACK; 1073 - goto out; 1074 - } 1075 - EXPORT_SYMBOL_GPL(dax_pmd_fault); 1076 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1077 818 1078 819 /** 1079 820 * dax_pfn_mkwrite - handle first write to DAX page ··· 818 1193 } 819 1194 EXPORT_SYMBOL_GPL(__dax_zero_page_range); 820 1195 821 - /** 822 - * dax_zero_page_range - zero a range within a page of a DAX file 823 - * @inode: The file being truncated 824 - * @from: The file offset that is being truncated to 825 - * @length: The number of bytes to zero 826 - * @get_block: The filesystem method used to translate file offsets to blocks 827 - * 828 - * This function can be called by a filesystem when it is zeroing part of a 829 - * page in a DAX file. This is intended for hole-punch operations. If 830 - * you are truncating a file, the helper function dax_truncate_page() may be 831 - * more convenient. 832 - */ 833 - int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, 834 - get_block_t get_block) 835 - { 836 - struct buffer_head bh; 837 - pgoff_t index = from >> PAGE_SHIFT; 838 - unsigned offset = from & (PAGE_SIZE-1); 839 - int err; 840 - 841 - /* Block boundary? 
Nothing to do */ 842 - if (!length) 843 - return 0; 844 - BUG_ON((offset + length) > PAGE_SIZE); 845 - 846 - memset(&bh, 0, sizeof(bh)); 847 - bh.b_bdev = inode->i_sb->s_bdev; 848 - bh.b_size = PAGE_SIZE; 849 - err = get_block(inode, index, &bh, 0); 850 - if (err < 0 || !buffer_written(&bh)) 851 - return err; 852 - 853 - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), 854 - offset, length); 855 - } 856 - EXPORT_SYMBOL_GPL(dax_zero_page_range); 857 - 858 - /** 859 - * dax_truncate_page - handle a partial page being truncated in a DAX file 860 - * @inode: The file being truncated 861 - * @from: The file offset that is being truncated to 862 - * @get_block: The filesystem method used to translate file offsets to blocks 863 - * 864 - * Similar to block_truncate_page(), this function can be called by a 865 - * filesystem when it is truncating a DAX file to handle the partial page. 866 - */ 867 - int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) 868 - { 869 - unsigned length = PAGE_ALIGN(from) - from; 870 - return dax_zero_page_range(inode, from, length, get_block); 871 - } 872 - EXPORT_SYMBOL_GPL(dax_truncate_page); 873 - 874 1196 #ifdef CONFIG_FS_IOMAP 1197 + static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 1198 + { 1199 + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); 1200 + } 1201 + 875 1202 static loff_t 876 - iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 1203 + dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 877 1204 struct iomap *iomap) 878 1205 { 879 1206 struct iov_iter *iter = data; ··· 849 1272 struct blk_dax_ctl dax = { 0 }; 850 1273 ssize_t map_len; 851 1274 852 - dax.sector = iomap->blkno + 853 - (((pos & PAGE_MASK) - iomap->offset) >> 9); 1275 + dax.sector = dax_iomap_sector(iomap, pos); 854 1276 dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; 855 1277 map_len = dax_map_atomic(iomap->bdev, &dax); 856 1278 if 
(map_len < 0) { ··· 881 1305 } 882 1306 883 1307 /** 884 - * iomap_dax_rw - Perform I/O to a DAX file 1308 + * dax_iomap_rw - Perform I/O to a DAX file 885 1309 * @iocb: The control block for this I/O 886 1310 * @iter: The addresses to do I/O from or to 887 1311 * @ops: iomap ops passed from the file system ··· 891 1315 * and evicting any page cache pages in the region under I/O. 892 1316 */ 893 1317 ssize_t 894 - iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, 1318 + dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 895 1319 struct iomap_ops *ops) 896 1320 { 897 1321 struct address_space *mapping = iocb->ki_filp->f_mapping; ··· 921 1345 922 1346 while (iov_iter_count(iter)) { 923 1347 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, 924 - iter, iomap_dax_actor); 1348 + iter, dax_iomap_actor); 925 1349 if (ret <= 0) 926 1350 break; 927 1351 pos += ret; ··· 931 1355 iocb->ki_pos += done; 932 1356 return done ? done : ret; 933 1357 } 934 - EXPORT_SYMBOL_GPL(iomap_dax_rw); 1358 + EXPORT_SYMBOL_GPL(dax_iomap_rw); 935 1359 936 1360 /** 937 - * iomap_dax_fault - handle a page fault on a DAX file 1361 + * dax_iomap_fault - handle a page fault on a DAX file 938 1362 * @vma: The virtual memory area where the fault occurred 939 1363 * @vmf: The description of the fault 940 1364 * @ops: iomap ops passed from the file system ··· 943 1367 * or mkwrite handler for DAX files. Assumes the caller has done all the 944 1368 * necessary locking for the page fault to proceed successfully. 
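The new `dax_iomap_sector()` helper factors out the sector arithmetic that previously appeared inline in both the I/O and fault paths: take the page-aligned position, subtract the extent's starting offset, convert bytes to 512-byte sectors, and add the extent's starting block. A standalone version of the same computation (4k pages assumed):

```c
#include <assert.h>

#define PAGE_MASK (~((1UL << 12) - 1))	/* 4k pages, assumed */

/* Mirror of dax_iomap_sector(): byte offset of 'pos' inside the
 * extent, page-aligned, converted to 512-byte sectors and added to
 * the extent's starting block number. */
static unsigned long long iomap_sector(unsigned long long blkno,
				       unsigned long long ext_offset,
				       unsigned long long pos)
{
	return blkno + (((pos & PAGE_MASK) - ext_offset) >> 9);
}
```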
945 1369 */ 946 - int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 1370 + int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 947 1371 struct iomap_ops *ops) 948 1372 { 949 1373 struct address_space *mapping = vma->vm_file->f_mapping; ··· 952 1376 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 953 1377 sector_t sector; 954 1378 struct iomap iomap = { 0 }; 955 - unsigned flags = 0; 1379 + unsigned flags = IOMAP_FAULT; 956 1380 int error, major = 0; 1381 + int locked_status = 0; 957 1382 void *entry; 958 1383 959 1384 /* ··· 965 1388 if (pos >= i_size_read(inode)) 966 1389 return VM_FAULT_SIGBUS; 967 1390 968 - entry = grab_mapping_entry(mapping, vmf->pgoff); 1391 + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 969 1392 if (IS_ERR(entry)) { 970 1393 error = PTR_ERR(entry); 971 1394 goto out; ··· 984 1407 goto unlock_entry; 985 1408 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { 986 1409 error = -EIO; /* fs corruption? */ 987 - goto unlock_entry; 1410 + goto finish_iomap; 988 1411 } 989 1412 990 - sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); 1413 + sector = dax_iomap_sector(&iomap, pos); 991 1414 992 1415 if (vmf->cow_page) { 993 1416 switch (iomap.type) { ··· 1006 1429 } 1007 1430 1008 1431 if (error) 1009 - goto unlock_entry; 1432 + goto finish_iomap; 1010 1433 if (!radix_tree_exceptional_entry(entry)) { 1011 1434 vmf->page = entry; 1012 - return VM_FAULT_LOCKED; 1435 + locked_status = VM_FAULT_LOCKED; 1436 + } else { 1437 + vmf->entry = entry; 1438 + locked_status = VM_FAULT_DAX_LOCKED; 1013 1439 } 1014 - vmf->entry = entry; 1015 - return VM_FAULT_DAX_LOCKED; 1440 + goto finish_iomap; 1016 1441 } 1017 1442 1018 1443 switch (iomap.type) { ··· 1029 1450 break; 1030 1451 case IOMAP_UNWRITTEN: 1031 1452 case IOMAP_HOLE: 1032 - if (!(vmf->flags & FAULT_FLAG_WRITE)) 1033 - return dax_load_hole(mapping, entry, vmf); 1453 + if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1454 + locked_status = 
dax_load_hole(mapping, entry, vmf); 1455 + break; 1456 + } 1034 1457 /*FALLTHRU*/ 1035 1458 default: 1036 1459 WARN_ON_ONCE(1); ··· 1040 1459 break; 1041 1460 } 1042 1461 1462 + finish_iomap: 1463 + if (ops->iomap_end) { 1464 + if (error) { 1465 + /* keep previous error */ 1466 + ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, 1467 + &iomap); 1468 + } else { 1469 + error = ops->iomap_end(inode, pos, PAGE_SIZE, 1470 + PAGE_SIZE, flags, &iomap); 1471 + } 1472 + } 1043 1473 unlock_entry: 1044 - put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1474 + if (!locked_status || error) 1475 + put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1045 1476 out: 1046 1477 if (error == -ENOMEM) 1047 1478 return VM_FAULT_OOM | major; 1048 1479 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1049 1480 if (error < 0 && error != -EBUSY) 1050 1481 return VM_FAULT_SIGBUS | major; 1482 + if (locked_status) { 1483 + WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ 1484 + return locked_status; 1485 + } 1051 1486 return VM_FAULT_NOPAGE | major; 1052 1487 } 1053 - EXPORT_SYMBOL_GPL(iomap_dax_fault); 1488 + EXPORT_SYMBOL_GPL(dax_iomap_fault); 1489 + 1490 + #ifdef CONFIG_FS_DAX_PMD 1491 + /* 1492 + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up 1493 + * more often than one might expect in the below functions. 
1494 + */ 1495 + #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 1496 + 1497 + static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, 1498 + struct vm_fault *vmf, unsigned long address, 1499 + struct iomap *iomap, loff_t pos, bool write, void **entryp) 1500 + { 1501 + struct address_space *mapping = vma->vm_file->f_mapping; 1502 + struct block_device *bdev = iomap->bdev; 1503 + struct blk_dax_ctl dax = { 1504 + .sector = dax_iomap_sector(iomap, pos), 1505 + .size = PMD_SIZE, 1506 + }; 1507 + long length = dax_map_atomic(bdev, &dax); 1508 + void *ret; 1509 + 1510 + if (length < 0) /* dax_map_atomic() failed */ 1511 + return VM_FAULT_FALLBACK; 1512 + if (length < PMD_SIZE) 1513 + goto unmap_fallback; 1514 + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) 1515 + goto unmap_fallback; 1516 + if (!pfn_t_devmap(dax.pfn)) 1517 + goto unmap_fallback; 1518 + 1519 + dax_unmap_atomic(bdev, &dax); 1520 + 1521 + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, 1522 + RADIX_DAX_PMD); 1523 + if (IS_ERR(ret)) 1524 + return VM_FAULT_FALLBACK; 1525 + *entryp = ret; 1526 + 1527 + return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); 1528 + 1529 + unmap_fallback: 1530 + dax_unmap_atomic(bdev, &dax); 1531 + return VM_FAULT_FALLBACK; 1532 + } 1533 + 1534 + static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, 1535 + struct vm_fault *vmf, unsigned long address, 1536 + struct iomap *iomap, void **entryp) 1537 + { 1538 + struct address_space *mapping = vma->vm_file->f_mapping; 1539 + unsigned long pmd_addr = address & PMD_MASK; 1540 + struct page *zero_page; 1541 + spinlock_t *ptl; 1542 + pmd_t pmd_entry; 1543 + void *ret; 1544 + 1545 + zero_page = mm_get_huge_zero_page(vma->vm_mm); 1546 + 1547 + if (unlikely(!zero_page)) 1548 + return VM_FAULT_FALLBACK; 1549 + 1550 + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1551 + RADIX_DAX_PMD | RADIX_DAX_HZP); 1552 + if (IS_ERR(ret)) 1553 + return VM_FAULT_FALLBACK; 1554 + *entryp 
= ret; 1555 + 1556 + ptl = pmd_lock(vma->vm_mm, pmd); 1557 + if (!pmd_none(*pmd)) { 1558 + spin_unlock(ptl); 1559 + return VM_FAULT_FALLBACK; 1560 + } 1561 + 1562 + pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); 1563 + pmd_entry = pmd_mkhuge(pmd_entry); 1564 + set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); 1565 + spin_unlock(ptl); 1566 + return VM_FAULT_NOPAGE; 1567 + } 1568 + 1569 + int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, 1570 + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) 1571 + { 1572 + struct address_space *mapping = vma->vm_file->f_mapping; 1573 + unsigned long pmd_addr = address & PMD_MASK; 1574 + bool write = flags & FAULT_FLAG_WRITE; 1575 + unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1576 + struct inode *inode = mapping->host; 1577 + int result = VM_FAULT_FALLBACK; 1578 + struct iomap iomap = { 0 }; 1579 + pgoff_t max_pgoff, pgoff; 1580 + struct vm_fault vmf; 1581 + void *entry; 1582 + loff_t pos; 1583 + int error; 1584 + 1585 + /* Fall back to PTEs if we're going to COW */ 1586 + if (write && !(vma->vm_flags & VM_SHARED)) 1587 + goto fallback; 1588 + 1589 + /* If the PMD would extend outside the VMA */ 1590 + if (pmd_addr < vma->vm_start) 1591 + goto fallback; 1592 + if ((pmd_addr + PMD_SIZE) > vma->vm_end) 1593 + goto fallback; 1594 + 1595 + /* 1596 + * Check whether offset isn't beyond end of file now. Caller is 1597 + * supposed to hold locks serializing us with truncate / punch hole so 1598 + * this is a reliable test. 1599 + */ 1600 + pgoff = linear_page_index(vma, pmd_addr); 1601 + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; 1602 + 1603 + if (pgoff > max_pgoff) 1604 + return VM_FAULT_SIGBUS; 1605 + 1606 + /* If the PMD would extend beyond the file size */ 1607 + if ((pgoff | PG_PMD_COLOUR) > max_pgoff) 1608 + goto fallback; 1609 + 1610 + /* 1611 + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1612 + * PMD or a HZP entry. 
If it can't (because a 4k page is already in 1613 + * the tree, for instance), it will return -EEXIST and we just fall 1614 + * back to 4k entries. 1615 + */ 1616 + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1617 + if (IS_ERR(entry)) 1618 + goto fallback; 1619 + 1620 + /* 1621 + * Note that we don't use iomap_apply here. We aren't doing I/O, only 1622 + * setting up a mapping, so really we're using iomap_begin() as a way 1623 + * to look up our filesystem block. 1624 + */ 1625 + pos = (loff_t)pgoff << PAGE_SHIFT; 1626 + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1627 + if (error) 1628 + goto unlock_entry; 1629 + if (iomap.offset + iomap.length < pos + PMD_SIZE) 1630 + goto finish_iomap; 1631 + 1632 + vmf.pgoff = pgoff; 1633 + vmf.flags = flags; 1634 + vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; 1635 + 1636 + switch (iomap.type) { 1637 + case IOMAP_MAPPED: 1638 + result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, 1639 + &iomap, pos, write, &entry); 1640 + break; 1641 + case IOMAP_UNWRITTEN: 1642 + case IOMAP_HOLE: 1643 + if (WARN_ON_ONCE(write)) 1644 + goto finish_iomap; 1645 + result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, 1646 + &entry); 1647 + break; 1648 + default: 1649 + WARN_ON_ONCE(1); 1650 + break; 1651 + } 1652 + 1653 + finish_iomap: 1654 + if (ops->iomap_end) { 1655 + if (result == VM_FAULT_FALLBACK) { 1656 + ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags, 1657 + &iomap); 1658 + } else { 1659 + error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, 1660 + iomap_flags, &iomap); 1661 + if (error) 1662 + result = VM_FAULT_FALLBACK; 1663 + } 1664 + } 1665 + unlock_entry: 1666 + put_locked_mapping_entry(mapping, pgoff, entry); 1667 + fallback: 1668 + if (result == VM_FAULT_FALLBACK) { 1669 + split_huge_pmd(vma, pmd, address); 1670 + count_vm_event(THP_FAULT_FALLBACK); 1671 + } 1672 + return result; 1673 + } 1674 + EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); 1675 + #endif /* CONFIG_FS_DAX_PMD 
*/ 1054 1676 #endif /* CONFIG_FS_IOMAP */
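The new PMD fault path above keys several of its fallback decisions off `PG_PMD_COLOUR`: a pfn can back a huge mapping only if its low "colour" bits are zero, and the fault falls back to PTEs when a PMD-sized run of page offsets would extend past the last page of the file. A minimal userspace sketch of that arithmetic, assuming the common 4 KiB page / 2 MiB PMD configuration (the constant values are illustrative, not taken from kernel headers):

```c
#include <assert.h>
#include <stdbool.h>

/* Assumed x86-64-style configuration: 4 KiB pages, 2 MiB PMD pages. */
#define PAGE_SHIFT 12
#define PMD_SIZE   (2UL * 1024 * 1024)

/* The 'colour' (low bits) of a page offset within a PMD, as defined in
 * the patch: ((PMD_SIZE >> PAGE_SHIFT) - 1). */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)

/* dax_pmd_insert_mapping() falls back unless the pfn starts on a
 * PMD-sized boundary, i.e. its colour bits are all zero. */
static bool pfn_pmd_aligned(unsigned long pfn)
{
	return (pfn & PG_PMD_COLOUR) == 0;
}

/* dax_iomap_pmd_fault() falls back when the PMD-sized range starting at
 * pgoff would reach beyond the file: (pgoff | PG_PMD_COLOUR) > max_pgoff. */
static bool pmd_fits_in_file(unsigned long pgoff, unsigned long max_pgoff)
{
	return (pgoff | PG_PMD_COLOUR) <= max_pgoff;
}
```

With 4 KiB pages a PMD covers 512 pages, so the colour mask is 511 and only pfns that are multiples of 512 qualify.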
+9 -26
fs/ext2/file.c
··· 38 38 return 0; /* skip atime */ 39 39 40 40 inode_lock_shared(inode); 41 - ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); 41 + ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops); 42 42 inode_unlock_shared(inode); 43 43 44 44 file_accessed(iocb->ki_filp); ··· 62 62 if (ret) 63 63 goto out_unlock; 64 64 65 - ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); 65 + ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); 66 66 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 67 67 i_size_write(inode, iocb->ki_pos); 68 68 mark_inode_dirty(inode); ··· 99 99 } 100 100 down_read(&ei->dax_sem); 101 101 102 - ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); 102 + ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); 103 103 104 104 up_read(&ei->dax_sem); 105 105 if (vmf->flags & FAULT_FLAG_WRITE) 106 - sb_end_pagefault(inode->i_sb); 107 - return ret; 108 - } 109 - 110 - static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, 111 - pmd_t *pmd, unsigned int flags) 112 - { 113 - struct inode *inode = file_inode(vma->vm_file); 114 - struct ext2_inode_info *ei = EXT2_I(inode); 115 - int ret; 116 - 117 - if (flags & FAULT_FLAG_WRITE) { 118 - sb_start_pagefault(inode->i_sb); 119 - file_update_time(vma->vm_file); 120 - } 121 - down_read(&ei->dax_sem); 122 - 123 - ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); 124 - 125 - up_read(&ei->dax_sem); 126 - if (flags & FAULT_FLAG_WRITE) 127 106 sb_end_pagefault(inode->i_sb); 128 107 return ret; 129 108 } ··· 133 154 134 155 static const struct vm_operations_struct ext2_dax_vm_ops = { 135 156 .fault = ext2_dax_fault, 136 - .pmd_fault = ext2_dax_pmd_fault, 157 + /* 158 + * .pmd_fault is not supported for DAX because allocation in ext2 159 + * cannot be reliably aligned to huge page sizes and so pmd faults 160 + * will always fail and fail back to regular faults. 
161 + */ 137 162 .page_mkwrite = ext2_dax_fault, 138 163 .pfn_mkwrite = ext2_dax_pfn_mkwrite, 139 164 }; ··· 149 166 150 167 file_accessed(file); 151 168 vma->vm_ops = &ext2_dax_vm_ops; 152 - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; 169 + vma->vm_flags |= VM_MIXEDMAP; 153 170 return 0; 154 171 } 155 172 #else
+8 -3
fs/ext2/inode.c
··· 850 850 .iomap_begin = ext2_iomap_begin, 851 851 .iomap_end = ext2_iomap_end, 852 852 }; 853 + #else 854 + /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ 855 + struct iomap_ops ext2_iomap_ops; 853 856 #endif /* CONFIG_FS_DAX */ 854 857 855 858 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ··· 1296 1293 1297 1294 inode_dio_wait(inode); 1298 1295 1299 - if (IS_DAX(inode)) 1300 - error = dax_truncate_page(inode, newsize, ext2_get_block); 1301 - else if (test_opt(inode->i_sb, NOBH)) 1296 + if (IS_DAX(inode)) { 1297 + error = iomap_zero_range(inode, newsize, 1298 + PAGE_ALIGN(newsize) - newsize, NULL, 1299 + &ext2_iomap_ops); 1300 + } else if (test_opt(inode->i_sb, NOBH)) 1302 1301 error = nobh_truncate_page(inode->i_mapping, 1303 1302 newsize, ext2_get_block); 1304 1303 else
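The ext2 truncate hunk above switches from `dax_truncate_page()` to `iomap_zero_range()`, zeroing the tail of the last page with length `PAGE_ALIGN(newsize) - newsize`. A small sketch of that length computation, assuming the usual 4 KiB page size (the macros mirror the kernel's but are defined locally here):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative stand-ins for the kernel's PAGE_SIZE / PAGE_ALIGN. */
#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Number of bytes to zero between the new EOF and the end of its page,
 * as passed to iomap_zero_range() in the patch. */
static uint64_t tail_zero_len(uint64_t newsize)
{
	return PAGE_ALIGN(newsize) - newsize;
}
```

A page-aligned size needs no zeroing; any other size zeroes out to the next page boundary.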
+1
fs/ext4/Kconfig
··· 37 37 select CRC16 38 38 select CRYPTO 39 39 select CRYPTO_CRC32C 40 + select FS_IOMAP if FS_DAX 40 41 help 41 42 This is the next generation of the ext3 filesystem. 42 43
+1 -1
fs/ext4/acl.c
··· 196 196 error = posix_acl_update_mode(inode, &inode->i_mode, &acl); 197 197 if (error) 198 198 return error; 199 - inode->i_ctime = ext4_current_time(inode); 199 + inode->i_ctime = current_time(inode); 200 200 ext4_mark_inode_dirty(handle, inode); 201 201 } 202 202 break;
+7 -24
fs/ext4/ext4.h
··· 397 397 #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 398 398 399 399 #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ 400 - #define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ 400 + #define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ 401 401 402 + /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ 402 403 #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ 403 404 EXT4_IMMUTABLE_FL | \ 404 405 EXT4_APPEND_FL | \ ··· 1534 1533 return container_of(inode, struct ext4_inode_info, vfs_inode); 1535 1534 } 1536 1535 1537 - static inline struct timespec ext4_current_time(struct inode *inode) 1538 - { 1539 - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 1540 - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 1541 - } 1542 - 1543 1536 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) 1544 1537 { 1545 1538 return ino == EXT4_ROOT_INO || ··· 2272 2277 struct ext4_group_desc *gdp); 2273 2278 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 2274 2279 2275 - static inline int ext4_sb_has_crypto(struct super_block *sb) 2276 - { 2277 - return ext4_has_feature_encrypt(sb); 2278 - } 2279 - 2280 2280 static inline bool ext4_encrypted_inode(struct inode *inode) 2281 2281 { 2282 2282 return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); ··· 2329 2339 #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page 2330 2340 #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page 2331 2341 #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range 2332 - #define fscrypt_process_policy fscrypt_notsupp_process_policy 2333 - #define fscrypt_get_policy fscrypt_notsupp_get_policy 2342 + #define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy 2343 + #define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy 2334 2344 #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context 2335 2345 #define fscrypt_inherit_context 
fscrypt_notsupp_inherit_context 2336 2346 #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info ··· 2448 2458 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2449 2459 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 2450 2460 struct buffer_head *bh_result, int create); 2451 - int ext4_dax_get_block(struct inode *inode, sector_t iblock, 2452 - struct buffer_head *bh_result, int create); 2453 2461 int ext4_get_block(struct inode *inode, sector_t iblock, 2454 2462 struct buffer_head *bh_result, int create); 2455 2463 int ext4_dio_get_block(struct inode *inode, sector_t iblock, ··· 2480 2492 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 2481 2493 extern int ext4_inode_attach_jinode(struct inode *inode); 2482 2494 extern int ext4_can_truncate(struct inode *inode); 2483 - extern void ext4_truncate(struct inode *); 2495 + extern int ext4_truncate(struct inode *); 2484 2496 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); 2485 2497 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 2486 2498 extern void ext4_set_inode_flags(struct inode *); ··· 3117 3129 extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); 3118 3130 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3119 3131 struct ext4_map_blocks *map, int flags); 3120 - extern void ext4_ext_truncate(handle_t *, struct inode *); 3132 + extern int ext4_ext_truncate(handle_t *, struct inode *); 3121 3133 extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 3122 3134 ext4_lblk_t end); 3123 3135 extern void ext4_ext_init(struct super_block *); ··· 3253 3265 } 3254 3266 } 3255 3267 3256 - static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) 3257 - { 3258 - int blksize = 1 << inode->i_blkbits; 3259 - 3260 - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); 3261 - } 3268 + extern struct 
iomap_ops ext4_iomap_ops; 3262 3269 3263 3270 #endif /* __KERNEL__ */ 3264 3271
+8 -6
fs/ext4/ext4_jbd2.h
··· 414 414 return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ 415 415 /* We do not support data journalling with delayed allocation */ 416 416 if (!S_ISREG(inode->i_mode) || 417 - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 417 + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 418 + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && 419 + !test_opt(inode->i_sb, DELALLOC))) { 420 + /* We do not support data journalling for encrypted data */ 421 + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) 422 + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ 418 423 return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ 419 - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && 420 - !test_opt(inode->i_sb, DELALLOC)) 421 - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ 424 + } 422 425 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 423 426 return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ 424 427 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 425 428 return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ 426 - else 427 - BUG(); 429 + BUG(); 428 430 } 429 431 430 432 static inline int ext4_should_journal_data(struct inode *inode)
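The `ext4_inode_journal_mode()` change above folds the two journal-data conditions together so that one extra check can demote encrypted regular files to ordered mode ("do not perform data journaling when data is encrypted" from the merge log). A simplified model of the resulting decision logic; the enum and boolean parameters are illustrative stand-ins for the kernel's mount options and inode flags:

```c
#include <assert.h>
#include <stdbool.h>

enum journal_mode { ORDERED, WRITEBACK, JOURNAL };

/* Sketch of ext4_inode_journal_mode() after this patch. Only the
 * journal-data branch is modeled in detail; other mount-option cases
 * collapse to ORDERED here for brevity. */
static enum journal_mode inode_journal_mode(bool is_reg, bool encrypted,
					    bool mount_journal_data,
					    bool inode_journal_flag,
					    bool delalloc)
{
	if (!is_reg || mount_journal_data ||
	    (inode_journal_flag && !delalloc)) {
		/* Data journalling is not supported for encrypted data,
		 * so encrypted regular files fall back to ordered mode. */
		if (is_reg && encrypted)
			return ORDERED;
		return JOURNAL;
	}
	return ORDERED;
}
```

Note that the demotion applies only to regular files: directories and other inodes on a `data=journal` mount still get full data journalling.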
+13 -14
fs/ext4/extents.c
··· 4631 4631 return err ? err : allocated; 4632 4632 } 4633 4633 4634 - void ext4_ext_truncate(handle_t *handle, struct inode *inode) 4634 + int ext4_ext_truncate(handle_t *handle, struct inode *inode) 4635 4635 { 4636 4636 struct super_block *sb = inode->i_sb; 4637 4637 ext4_lblk_t last_block; ··· 4645 4645 4646 4646 /* we have to know where to truncate from in crash case */ 4647 4647 EXT4_I(inode)->i_disksize = inode->i_size; 4648 - ext4_mark_inode_dirty(handle, inode); 4648 + err = ext4_mark_inode_dirty(handle, inode); 4649 + if (err) 4650 + return err; 4649 4651 4650 4652 last_block = (inode->i_size + sb->s_blocksize - 1) 4651 4653 >> EXT4_BLOCK_SIZE_BITS(sb); ··· 4659 4657 congestion_wait(BLK_RW_ASYNC, HZ/50); 4660 4658 goto retry; 4661 4659 } 4662 - if (err) { 4663 - ext4_std_error(inode->i_sb, err); 4664 - return; 4665 - } 4666 - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4667 - ext4_std_error(inode->i_sb, err); 4660 + if (err) 4661 + return err; 4662 + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4668 4663 } 4669 4664 4670 4665 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ··· 4700 4701 /* 4701 4702 * Recalculate credits when extent tree depth changes. 
4702 4703 */ 4703 - if (depth >= 0 && depth != ext_depth(inode)) { 4704 + if (depth != ext_depth(inode)) { 4704 4705 credits = ext4_chunk_trans_blocks(inode, len); 4705 4706 depth = ext_depth(inode); 4706 4707 } ··· 4724 4725 map.m_lblk += ret; 4725 4726 map.m_len = len = len - ret; 4726 4727 epos = (loff_t)map.m_lblk << inode->i_blkbits; 4727 - inode->i_ctime = ext4_current_time(inode); 4728 + inode->i_ctime = current_time(inode); 4728 4729 if (new_size) { 4729 4730 if (epos > new_size) 4730 4731 epos = new_size; ··· 4852 4853 } 4853 4854 /* Now release the pages and zero block aligned part of pages */ 4854 4855 truncate_pagecache_range(inode, start, end - 1); 4855 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4856 + inode->i_mtime = inode->i_ctime = current_time(inode); 4856 4857 4857 4858 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4858 4859 flags, mode); ··· 4877 4878 goto out_dio; 4878 4879 } 4879 4880 4880 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4881 + inode->i_mtime = inode->i_ctime = current_time(inode); 4881 4882 if (new_size) { 4882 4883 ext4_update_inode_size(inode, new_size); 4883 4884 } else { ··· 5567 5568 up_write(&EXT4_I(inode)->i_data_sem); 5568 5569 if (IS_SYNC(inode)) 5569 5570 ext4_handle_sync(handle); 5570 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 5571 + inode->i_mtime = inode->i_ctime = current_time(inode); 5571 5572 ext4_mark_inode_dirty(handle, inode); 5572 5573 5573 5574 out_stop: ··· 5677 5678 /* Expand file to avoid data loss if there is error while shifting */ 5678 5679 inode->i_size += len; 5679 5680 EXT4_I(inode)->i_disksize += len; 5680 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 5681 + inode->i_mtime = inode->i_ctime = current_time(inode); 5681 5682 ret = ext4_mark_inode_dirty(handle, inode); 5682 5683 if (ret) 5683 5684 goto out_stop;
+132 -52
fs/ext4/file.c
··· 31 31 #include "xattr.h" 32 32 #include "acl.h" 33 33 34 + #ifdef CONFIG_FS_DAX 35 + static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) 36 + { 37 + struct inode *inode = file_inode(iocb->ki_filp); 38 + ssize_t ret; 39 + 40 + inode_lock_shared(inode); 41 + /* 42 + * Recheck under inode lock - at this point we are sure it cannot 43 + * change anymore 44 + */ 45 + if (!IS_DAX(inode)) { 46 + inode_unlock_shared(inode); 47 + /* Fallback to buffered IO in case we cannot support DAX */ 48 + return generic_file_read_iter(iocb, to); 49 + } 50 + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); 51 + inode_unlock_shared(inode); 52 + 53 + file_accessed(iocb->ki_filp); 54 + return ret; 55 + } 56 + #endif 57 + 58 + static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 59 + { 60 + if (!iov_iter_count(to)) 61 + return 0; /* skip atime */ 62 + 63 + #ifdef CONFIG_FS_DAX 64 + if (IS_DAX(file_inode(iocb->ki_filp))) 65 + return ext4_dax_read_iter(iocb, to); 66 + #endif 67 + return generic_file_read_iter(iocb, to); 68 + } 69 + 34 70 /* 35 71 * Called when an inode is released. Note that this is different 36 72 * from ext4_file_open: open gets called at every open, but release ··· 124 88 return 0; 125 89 } 126 90 91 + /* Is IO overwriting allocated and initialized blocks? */ 92 + static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) 93 + { 94 + struct ext4_map_blocks map; 95 + unsigned int blkbits = inode->i_blkbits; 96 + int err, blklen; 97 + 98 + if (pos + len > i_size_read(inode)) 99 + return false; 100 + 101 + map.m_lblk = pos >> blkbits; 102 + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); 103 + blklen = map.m_len; 104 + 105 + err = ext4_map_blocks(NULL, inode, &map, 0); 106 + /* 107 + * 'err==len' means that all of the blocks have been preallocated, 108 + * regardless of whether they have been initialized or not. To exclude 109 + * unwritten extents, we need to check m_flags. 
110 + */ 111 + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); 112 + } 113 + 114 + static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) 115 + { 116 + struct inode *inode = file_inode(iocb->ki_filp); 117 + ssize_t ret; 118 + 119 + ret = generic_write_checks(iocb, from); 120 + if (ret <= 0) 121 + return ret; 122 + /* 123 + * If we have encountered a bitmap-format file, the size limit 124 + * is smaller than s_maxbytes, which is for extent-mapped files. 125 + */ 126 + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 127 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 128 + 129 + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) 130 + return -EFBIG; 131 + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); 132 + } 133 + return iov_iter_count(from); 134 + } 135 + 136 + #ifdef CONFIG_FS_DAX 137 + static ssize_t 138 + ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) 139 + { 140 + struct inode *inode = file_inode(iocb->ki_filp); 141 + ssize_t ret; 142 + bool overwrite = false; 143 + 144 + inode_lock(inode); 145 + ret = ext4_write_checks(iocb, from); 146 + if (ret <= 0) 147 + goto out; 148 + ret = file_remove_privs(iocb->ki_filp); 149 + if (ret) 150 + goto out; 151 + ret = file_update_time(iocb->ki_filp); 152 + if (ret) 153 + goto out; 154 + 155 + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { 156 + overwrite = true; 157 + downgrade_write(&inode->i_rwsem); 158 + } 159 + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); 160 + out: 161 + if (!overwrite) 162 + inode_unlock(inode); 163 + else 164 + inode_unlock_shared(inode); 165 + if (ret > 0) 166 + ret = generic_write_sync(iocb, ret); 167 + return ret; 168 + } 169 + #endif 170 + 127 171 static ssize_t 128 172 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 129 173 { ··· 213 97 int overwrite = 0; 214 98 ssize_t ret; 215 99 100 + #ifdef CONFIG_FS_DAX 101 + if (IS_DAX(inode)) 102 + return ext4_dax_write_iter(iocb, from); 103 + 
#endif 104 + 216 105 inode_lock(inode); 217 - ret = generic_write_checks(iocb, from); 106 + ret = ext4_write_checks(iocb, from); 218 107 if (ret <= 0) 219 108 goto out; 220 109 ··· 235 114 ext4_unwritten_wait(inode); 236 115 } 237 116 238 - /* 239 - * If we have encountered a bitmap-format file, the size limit 240 - * is smaller than s_maxbytes, which is for extent-mapped files. 241 - */ 242 - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 243 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 244 - 245 - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { 246 - ret = -EFBIG; 247 - goto out; 248 - } 249 - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); 250 - } 251 - 252 117 iocb->private = &overwrite; 253 - if (o_direct) { 254 - size_t length = iov_iter_count(from); 255 - loff_t pos = iocb->ki_pos; 256 - 257 - /* check whether we do a DIO overwrite or not */ 258 - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && 259 - pos + length <= i_size_read(inode)) { 260 - struct ext4_map_blocks map; 261 - unsigned int blkbits = inode->i_blkbits; 262 - int err, len; 263 - 264 - map.m_lblk = pos >> blkbits; 265 - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); 266 - len = map.m_len; 267 - 268 - err = ext4_map_blocks(NULL, inode, &map, 0); 269 - /* 270 - * 'err==len' means that all of blocks has 271 - * been preallocated no matter they are 272 - * initialized or not. For excluding 273 - * unwritten extents, we need to check 274 - * m_flags. There are two conditions that 275 - * indicate for initialized extents. 1) If we 276 - * hit extent cache, EXT4_MAP_MAPPED flag is 277 - * returned; 2) If we do a real lookup, 278 - * non-flags are returned. So we should check 279 - * these two conditions. 
280 - */ 281 - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) 282 - overwrite = 1; 283 - } 284 - } 118 + /* Check whether we do a DIO overwrite or not */ 119 + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && 120 + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) 121 + overwrite = 1; 285 122 286 123 ret = __generic_file_write_iter(iocb, from); 287 124 inode_unlock(inode); ··· 275 196 if (IS_ERR(handle)) 276 197 result = VM_FAULT_SIGBUS; 277 198 else 278 - result = dax_fault(vma, vmf, ext4_dax_get_block); 199 + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); 279 200 280 201 if (write) { 281 202 if (!IS_ERR(handle)) ··· 309 230 310 231 if (IS_ERR(handle)) 311 232 result = VM_FAULT_SIGBUS; 312 - else 313 - result = dax_pmd_fault(vma, addr, pmd, flags, 314 - ext4_dax_get_block); 233 + else { 234 + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, 235 + &ext4_iomap_ops); 236 + } 315 237 316 238 if (write) { 317 239 if (!IS_ERR(handle)) ··· 767 687 768 688 const struct file_operations ext4_file_operations = { 769 689 .llseek = ext4_llseek, 770 - .read_iter = generic_file_read_iter, 690 + .read_iter = ext4_file_read_iter, 771 691 .write_iter = ext4_file_write_iter, 772 692 .unlocked_ioctl = ext4_ioctl, 773 693 #ifdef CONFIG_COMPAT
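The refactor above hoists the open-coded DIO overwrite check into `ext4_overwrite_io()`: an I/O counts as a pure overwrite only if it stays inside `i_size` and every block in the range comes back both allocated and initialized. A sketch of that decision, with the `ext4_map_blocks()` lookup stubbed out as a plain struct (the struct and its fields are hypothetical stand-ins, not kernel types):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Stubbed result of the block-mapping lookup that the kernel performs
 * with ext4_map_blocks(). */
struct map_result {
	int  blocks_found;  /* how many blocks the lookup returned */
	bool mapped;        /* EXT4_MAP_MAPPED equivalent: initialized, not unwritten */
};

/* Sketch of ext4_overwrite_io(): false if the write extends the file,
 * otherwise true only when all requested blocks are already mapped.
 * 'err==blklen' alone would also accept unwritten (preallocated)
 * extents, which is why the mapped flag must be checked too. */
static bool overwrite_io(int64_t pos, int64_t len, int64_t i_size,
			 int blklen, struct map_result res)
{
	if (pos + len > i_size)
		return false;
	return res.blocks_found == blklen && res.mapped;
}
```

The DAX write path uses this to downgrade `i_rwsem` to shared mode for overwrites, letting overwriting writers run concurrently.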
+2 -3
fs/ext4/ialloc.c
··· 1039 1039 /* This is the optimal IO size (for stat), not the fs block size */ 1040 1040 inode->i_blocks = 0; 1041 1041 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = 1042 - ext4_current_time(inode); 1042 + current_time(inode); 1043 1043 1044 1044 memset(ei->i_data, 0, sizeof(ei->i_data)); 1045 1045 ei->i_dir_start_lookup = 0; ··· 1115 1115 } 1116 1116 1117 1117 if (encrypt) { 1118 - /* give pointer to avoid set_context with journal ops. */ 1119 - err = fscrypt_inherit_context(dir, inode, &encrypt, true); 1118 + err = fscrypt_inherit_context(dir, inode, handle, true); 1120 1119 if (err) 1121 1120 goto fail_free_drop; 1122 1121 }
+15 -3
fs/ext4/inline.c
··· 299 299 EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; 300 300 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); 301 301 ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); 302 + /* 303 + * Propagate changes to inode->i_flags as well - e.g. S_DAX may 304 + * get cleared 305 + */ 306 + ext4_set_inode_flags(inode); 302 307 get_bh(is.iloc.bh); 303 308 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 304 309 ··· 341 336 342 337 len -= EXT4_MIN_INLINE_DATA_SIZE; 343 338 value = kzalloc(len, GFP_NOFS); 344 - if (!value) 339 + if (!value) { 340 + error = -ENOMEM; 345 341 goto out; 342 + } 346 343 347 344 error = ext4_xattr_ibody_get(inode, i.name_index, i.name, 348 345 value, len); ··· 449 442 } 450 443 } 451 444 ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); 445 + /* 446 + * Propagate changes to inode->i_flags as well - e.g. S_DAX may 447 + * get set. 448 + */ 449 + ext4_set_inode_flags(inode); 452 450 453 451 get_bh(is.iloc.bh); 454 452 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); ··· 1040 1028 * happen is that the times are slightly out of date 1041 1029 * and/or different from the directory change time. 1042 1030 */ 1043 - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); 1031 + dir->i_mtime = dir->i_ctime = current_time(dir); 1044 1032 ext4_update_dx_flag(dir); 1045 1033 dir->i_version++; 1046 1034 ext4_mark_inode_dirty(handle, dir); ··· 1983 1971 if (inode->i_nlink) 1984 1972 ext4_orphan_del(handle, inode); 1985 1973 1986 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 1974 + inode->i_mtime = inode->i_ctime = current_time(inode); 1987 1975 ext4_mark_inode_dirty(handle, inode); 1988 1976 if (IS_SYNC(inode)) 1989 1977 ext4_handle_sync(handle);
+244 -107
fs/ext4/inode.c
··· 37 37 #include <linux/printk.h> 38 38 #include <linux/slab.h> 39 39 #include <linux/bitops.h> 40 + #include <linux/iomap.h> 40 41 41 42 #include "ext4_jbd2.h" 42 43 #include "xattr.h" ··· 72 71 csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, 73 72 csum_size); 74 73 offset += csum_size; 75 - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, 76 - EXT4_INODE_SIZE(inode->i_sb) - 77 - offset); 78 74 } 75 + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, 76 + EXT4_INODE_SIZE(inode->i_sb) - offset); 79 77 } 80 78 81 79 return csum; ··· 261 261 "couldn't mark inode dirty (err %d)", err); 262 262 goto stop_handle; 263 263 } 264 - if (inode->i_blocks) 265 - ext4_truncate(inode); 264 + if (inode->i_blocks) { 265 + err = ext4_truncate(inode); 266 + if (err) { 267 + ext4_error(inode->i_sb, 268 + "couldn't truncate inode %lu (err %d)", 269 + inode->i_ino, err); 270 + goto stop_handle; 271 + } 272 + } 266 273 267 274 /* 268 275 * ext4_ext_truncate() doesn't reserve any slop when it ··· 774 767 ext4_update_bh_state(bh, map.m_flags); 775 768 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 776 769 ret = 0; 770 + } else if (ret == 0) { 771 + /* hole case, need to fill in bh->b_size */ 772 + bh->b_size = inode->i_sb->s_blocksize * map.m_len; 777 773 } 778 774 return ret; 779 775 } ··· 1176 1166 if (unlikely(err)) 1177 1167 page_zero_new_buffers(page, from, to); 1178 1168 else if (decrypt) 1179 - err = fscrypt_decrypt_page(page); 1169 + err = fscrypt_decrypt_page(page->mapping->host, page, 1170 + PAGE_SIZE, 0, page->index); 1180 1171 return err; 1181 1172 } 1182 1173 #endif ··· 2902 2891 2903 2892 index = pos >> PAGE_SHIFT; 2904 2893 2905 - if (ext4_nonda_switch(inode->i_sb)) { 2894 + if (ext4_nonda_switch(inode->i_sb) || 2895 + S_ISLNK(inode->i_mode)) { 2906 2896 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2907 2897 return ext4_write_begin(file, mapping, pos, 2908 2898 len, flags, pagep, fsdata); ··· 3280 3268 } 3281 3269 3282 3270 #ifdef CONFIG_FS_DAX 3283 - /* 3284 - 
* Get block function for DAX IO and mmap faults. It takes care of converting 3285 - * unwritten extents to written ones and initializes new / converted blocks 3286 - * to zeros. 3287 - */ 3288 - int ext4_dax_get_block(struct inode *inode, sector_t iblock, 3289 - struct buffer_head *bh_result, int create) 3271 + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3272 + unsigned flags, struct iomap *iomap) 3290 3273 { 3274 + unsigned int blkbits = inode->i_blkbits; 3275 + unsigned long first_block = offset >> blkbits; 3276 + unsigned long last_block = (offset + length - 1) >> blkbits; 3277 + struct ext4_map_blocks map; 3291 3278 int ret; 3292 3279 3293 - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); 3294 - if (!create) 3295 - return _ext4_get_block(inode, iblock, bh_result, 0); 3280 + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) 3281 + return -ERANGE; 3296 3282 3297 - ret = ext4_get_block_trans(inode, iblock, bh_result, 3298 - EXT4_GET_BLOCKS_PRE_IO | 3299 - EXT4_GET_BLOCKS_CREATE_ZERO); 3300 - if (ret < 0) 3301 - return ret; 3283 + map.m_lblk = first_block; 3284 + map.m_len = last_block - first_block + 1; 3302 3285 3303 - if (buffer_unwritten(bh_result)) { 3286 + if (!(flags & IOMAP_WRITE)) { 3287 + ret = ext4_map_blocks(NULL, inode, &map, 0); 3288 + } else { 3289 + int dio_credits; 3290 + handle_t *handle; 3291 + int retries = 0; 3292 + 3293 + /* Trim mapping request to maximum we can map at once for DIO */ 3294 + if (map.m_len > DIO_MAX_BLOCKS) 3295 + map.m_len = DIO_MAX_BLOCKS; 3296 + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); 3297 + retry: 3304 3298 /* 3305 - * We are protected by i_mmap_sem or i_mutex so we know block 3306 - * cannot go away from under us even though we dropped 3307 - * i_data_sem. Convert extent to written and write zeros there. 
3299 + * Either we allocate blocks and then we don't get unwritten 3300 + * extent so we have reserved enough credits, or the blocks 3301 + * are already allocated and unwritten and in that case 3302 + * extent conversion fits in the credits as well. 3308 3303 */ 3309 - ret = ext4_get_block_trans(inode, iblock, bh_result, 3310 - EXT4_GET_BLOCKS_CONVERT | 3311 - EXT4_GET_BLOCKS_CREATE_ZERO); 3312 - if (ret < 0) 3304 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 3305 + dio_credits); 3306 + if (IS_ERR(handle)) 3307 + return PTR_ERR(handle); 3308 + 3309 + ret = ext4_map_blocks(handle, inode, &map, 3310 + EXT4_GET_BLOCKS_CREATE_ZERO); 3311 + if (ret < 0) { 3312 + ext4_journal_stop(handle); 3313 + if (ret == -ENOSPC && 3314 + ext4_should_retry_alloc(inode->i_sb, &retries)) 3315 + goto retry; 3313 3316 return ret; 3317 + } 3318 + 3319 + /* 3320 + * If we added blocks beyond i_size, we need to make sure they 3321 + * will get truncated if we crash before updating i_size in 3322 + * ext4_iomap_end(). For faults we don't need to do that (and 3323 + * even cannot because for orphan list operations inode_lock is 3324 + * required) - if we happen to instantiate block beyond i_size, 3325 + * it is because we race with truncate which has already added 3326 + * the inode to the orphan list. 
3327 + */ 3328 + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > 3329 + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { 3330 + int err; 3331 + 3332 + err = ext4_orphan_add(handle, inode); 3333 + if (err < 0) { 3334 + ext4_journal_stop(handle); 3335 + return err; 3336 + } 3337 + } 3338 + ext4_journal_stop(handle); 3339 + } 3340 + 3341 + iomap->flags = 0; 3342 + iomap->bdev = inode->i_sb->s_bdev; 3343 + iomap->offset = first_block << blkbits; 3344 + 3345 + if (ret == 0) { 3346 + iomap->type = IOMAP_HOLE; 3347 + iomap->blkno = IOMAP_NULL_BLOCK; 3348 + iomap->length = (u64)map.m_len << blkbits; 3349 + } else { 3350 + if (map.m_flags & EXT4_MAP_MAPPED) { 3351 + iomap->type = IOMAP_MAPPED; 3352 + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { 3353 + iomap->type = IOMAP_UNWRITTEN; 3354 + } else { 3355 + WARN_ON_ONCE(1); 3356 + return -EIO; 3357 + } 3358 + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); 3359 + iomap->length = (u64)map.m_len << blkbits; 3360 + } 3361 + 3362 + if (map.m_flags & EXT4_MAP_NEW) 3363 + iomap->flags |= IOMAP_F_NEW; 3364 + return 0; 3365 + } 3366 + 3367 + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, 3368 + ssize_t written, unsigned flags, struct iomap *iomap) 3369 + { 3370 + int ret = 0; 3371 + handle_t *handle; 3372 + int blkbits = inode->i_blkbits; 3373 + bool truncate = false; 3374 + 3375 + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) 3376 + return 0; 3377 + 3378 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 3379 + if (IS_ERR(handle)) { 3380 + ret = PTR_ERR(handle); 3381 + goto orphan_del; 3382 + } 3383 + if (ext4_update_inode_size(inode, offset + written)) 3384 + ext4_mark_inode_dirty(handle, inode); 3385 + /* 3386 + * We may need to truncate allocated but not written blocks beyond EOF. 
3387 + */ 3388 + if (iomap->offset + iomap->length > 3389 + ALIGN(inode->i_size, 1 << blkbits)) { 3390 + ext4_lblk_t written_blk, end_blk; 3391 + 3392 + written_blk = (offset + written) >> blkbits; 3393 + end_blk = (offset + length) >> blkbits; 3394 + if (written_blk < end_blk && ext4_can_truncate(inode)) 3395 + truncate = true; 3314 3396 } 3315 3397 /* 3316 - * At least for now we have to clear BH_New so that DAX code 3317 - * doesn't attempt to zero blocks again in a racy way. 3398 + * Remove inode from orphan list if we were extending a inode and 3399 + * everything went fine. 3318 3400 */ 3319 - clear_buffer_new(bh_result); 3320 - return 0; 3401 + if (!truncate && inode->i_nlink && 3402 + !list_empty(&EXT4_I(inode)->i_orphan)) 3403 + ext4_orphan_del(handle, inode); 3404 + ext4_journal_stop(handle); 3405 + if (truncate) { 3406 + ext4_truncate_failed_write(inode); 3407 + orphan_del: 3408 + /* 3409 + * If truncate failed early the inode might still be on the 3410 + * orphan list; we need to make sure the inode is removed from 3411 + * the orphan list in that case. 3412 + */ 3413 + if (inode->i_nlink) 3414 + ext4_orphan_del(NULL, inode); 3415 + } 3416 + return ret; 3321 3417 } 3322 - #else 3323 - /* Just define empty function, it will never get called. */ 3324 - int ext4_dax_get_block(struct inode *inode, sector_t iblock, 3325 - struct buffer_head *bh_result, int create) 3326 - { 3327 - BUG(); 3328 - return 0; 3329 - } 3418 + 3419 + struct iomap_ops ext4_iomap_ops = { 3420 + .iomap_begin = ext4_iomap_begin, 3421 + .iomap_end = ext4_iomap_end, 3422 + }; 3423 + 3330 3424 #endif 3331 3425 3332 3426 static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ··· 3554 3436 iocb->private = NULL; 3555 3437 if (overwrite) 3556 3438 get_block_func = ext4_dio_get_block_overwrite; 3557 - else if (IS_DAX(inode)) { 3558 - /* 3559 - * We can avoid zeroing for aligned DAX writes beyond EOF. 
Other 3560 - * writes need zeroing either because they can race with page 3561 - * faults or because they use partial blocks. 3562 - */ 3563 - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && 3564 - ext4_aligned_io(inode, offset, count)) 3565 - get_block_func = ext4_dio_get_block; 3566 - else 3567 - get_block_func = ext4_dax_get_block; 3568 - dio_flags = DIO_LOCKING; 3569 - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || 3439 + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || 3570 3440 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { 3571 3441 get_block_func = ext4_dio_get_block; 3572 3442 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; ··· 3568 3462 #ifdef CONFIG_EXT4_FS_ENCRYPTION 3569 3463 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3570 3464 #endif 3571 - if (IS_DAX(inode)) { 3572 - ret = dax_do_io(iocb, inode, iter, get_block_func, 3573 - ext4_end_io_dio, dio_flags); 3574 - } else 3575 - ret = __blockdev_direct_IO(iocb, inode, 3576 - inode->i_sb->s_bdev, iter, 3577 - get_block_func, 3578 - ext4_end_io_dio, NULL, dio_flags); 3465 + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, 3466 + get_block_func, ext4_end_io_dio, NULL, 3467 + dio_flags); 3579 3468 3580 3469 if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3581 3470 EXT4_STATE_DIO_UNWRITTEN)) { ··· 3639 3538 { 3640 3539 struct address_space *mapping = iocb->ki_filp->f_mapping; 3641 3540 struct inode *inode = mapping->host; 3541 + size_t count = iov_iter_count(iter); 3642 3542 ssize_t ret; 3643 3543 3644 3544 /* ··· 3648 3546 * we are protected against page writeback as well. 
3649 3547 */ 3650 3548 inode_lock_shared(inode); 3651 - if (IS_DAX(inode)) { 3652 - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); 3653 - } else { 3654 - size_t count = iov_iter_count(iter); 3655 - 3656 - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, 3657 - iocb->ki_pos + count); 3658 - if (ret) 3659 - goto out_unlock; 3660 - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, 3661 - iter, ext4_dio_get_block, 3662 - NULL, NULL, 0); 3663 - } 3549 + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, 3550 + iocb->ki_pos + count); 3551 + if (ret) 3552 + goto out_unlock; 3553 + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, 3554 + iter, ext4_dio_get_block, NULL, NULL, 0); 3664 3555 out_unlock: 3665 3556 inode_unlock_shared(inode); 3666 3557 return ret; ··· 3680 3585 3681 3586 /* Let buffer I/O handle the inline data case. */ 3682 3587 if (ext4_has_inline_data(inode)) 3588 + return 0; 3589 + 3590 + /* DAX uses iomap path now */ 3591 + if (WARN_ON_ONCE(IS_DAX(inode))) 3683 3592 return 0; 3684 3593 3685 3594 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); ··· 3714 3615 return __set_page_dirty_nobuffers(page); 3715 3616 } 3716 3617 3618 + static int ext4_set_page_dirty(struct page *page) 3619 + { 3620 + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); 3621 + WARN_ON_ONCE(!page_has_buffers(page)); 3622 + return __set_page_dirty_buffers(page); 3623 + } 3624 + 3717 3625 static const struct address_space_operations ext4_aops = { 3718 3626 .readpage = ext4_readpage, 3719 3627 .readpages = ext4_readpages, ··· 3728 3622 .writepages = ext4_writepages, 3729 3623 .write_begin = ext4_write_begin, 3730 3624 .write_end = ext4_write_end, 3625 + .set_page_dirty = ext4_set_page_dirty, 3731 3626 .bmap = ext4_bmap, 3732 3627 .invalidatepage = ext4_invalidatepage, 3733 3628 .releasepage = ext4_releasepage, ··· 3761 3654 .writepages = ext4_writepages, 3762 3655 .write_begin = ext4_da_write_begin, 3763 3656 
.write_end = ext4_da_write_end, 3657 + .set_page_dirty = ext4_set_page_dirty, 3764 3658 .bmap = ext4_bmap, 3765 3659 .invalidatepage = ext4_da_invalidatepage, 3766 3660 .releasepage = ext4_releasepage, ··· 3851 3743 /* We expect the key to be set. */ 3852 3744 BUG_ON(!fscrypt_has_encryption_key(inode)); 3853 3745 BUG_ON(blocksize != PAGE_SIZE); 3854 - WARN_ON_ONCE(fscrypt_decrypt_page(page)); 3746 + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, 3747 + page, PAGE_SIZE, 0, page->index)); 3855 3748 } 3856 3749 } 3857 3750 if (ext4_should_journal_data(inode)) { ··· 3901 3792 if (length > max || length < 0) 3902 3793 length = max; 3903 3794 3904 - if (IS_DAX(inode)) 3905 - return dax_zero_page_range(inode, from, length, ext4_get_block); 3795 + if (IS_DAX(inode)) { 3796 + return iomap_zero_range(inode, from, length, NULL, 3797 + &ext4_iomap_ops); 3798 + } 3906 3799 return __ext4_block_zero_page_range(handle, mapping, from, length); 3907 3800 } 3908 3801 ··· 4137 4026 if (IS_SYNC(inode)) 4138 4027 ext4_handle_sync(handle); 4139 4028 4140 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4029 + inode->i_mtime = inode->i_ctime = current_time(inode); 4141 4030 ext4_mark_inode_dirty(handle, inode); 4142 4031 out_stop: 4143 4032 ext4_journal_stop(handle); ··· 4202 4091 * that's fine - as long as they are linked from the inode, the post-crash 4203 4092 * ext4_truncate() run will find them and release them. 
4204 4093 */ 4205 - void ext4_truncate(struct inode *inode) 4094 + int ext4_truncate(struct inode *inode) 4206 4095 { 4207 4096 struct ext4_inode_info *ei = EXT4_I(inode); 4208 4097 unsigned int credits; 4098 + int err = 0; 4209 4099 handle_t *handle; 4210 4100 struct address_space *mapping = inode->i_mapping; 4211 4101 ··· 4220 4108 trace_ext4_truncate_enter(inode); 4221 4109 4222 4110 if (!ext4_can_truncate(inode)) 4223 - return; 4111 + return 0; 4224 4112 4225 4113 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4226 4114 ··· 4232 4120 4233 4121 ext4_inline_data_truncate(inode, &has_inline); 4234 4122 if (has_inline) 4235 - return; 4123 + return 0; 4236 4124 } 4237 4125 4238 4126 /* If we zero-out tail of the page, we have to create jinode for jbd2 */ 4239 4127 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { 4240 4128 if (ext4_inode_attach_jinode(inode) < 0) 4241 - return; 4129 + return 0; 4242 4130 } 4243 4131 4244 4132 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ··· 4247 4135 credits = ext4_blocks_for_truncate(inode); 4248 4136 4249 4137 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4250 - if (IS_ERR(handle)) { 4251 - ext4_std_error(inode->i_sb, PTR_ERR(handle)); 4252 - return; 4253 - } 4138 + if (IS_ERR(handle)) 4139 + return PTR_ERR(handle); 4254 4140 4255 4141 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) 4256 4142 ext4_block_truncate_page(handle, mapping, inode->i_size); ··· 4262 4152 * Implication: the file must always be in a sane, consistent 4263 4153 * truncatable state while each transaction commits. 
4264 4154 */ 4265 - if (ext4_orphan_add(handle, inode)) 4155 + err = ext4_orphan_add(handle, inode); 4156 + if (err) 4266 4157 goto out_stop; 4267 4158 4268 4159 down_write(&EXT4_I(inode)->i_data_sem); ··· 4271 4160 ext4_discard_preallocations(inode); 4272 4161 4273 4162 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4274 - ext4_ext_truncate(handle, inode); 4163 + err = ext4_ext_truncate(handle, inode); 4275 4164 else 4276 4165 ext4_ind_truncate(handle, inode); 4277 4166 4278 4167 up_write(&ei->i_data_sem); 4168 + if (err) 4169 + goto out_stop; 4279 4170 4280 4171 if (IS_SYNC(inode)) 4281 4172 ext4_handle_sync(handle); ··· 4293 4180 if (inode->i_nlink) 4294 4181 ext4_orphan_del(handle, inode); 4295 4182 4296 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4183 + inode->i_mtime = inode->i_ctime = current_time(inode); 4297 4184 ext4_mark_inode_dirty(handle, inode); 4298 4185 ext4_journal_stop(handle); 4299 4186 4300 4187 trace_ext4_truncate_exit(inode); 4188 + return err; 4301 4189 } 4302 4190 4303 4191 /* ··· 4466 4352 new_fl |= S_NOATIME; 4467 4353 if (flags & EXT4_DIRSYNC_FL) 4468 4354 new_fl |= S_DIRSYNC; 4469 - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) 4355 + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && 4356 + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && 4357 + !ext4_encrypted_inode(inode)) 4470 4358 new_fl |= S_DAX; 4471 4359 inode_set_flags(inode, new_fl, 4472 4360 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); ··· 4527 4411 { 4528 4412 __le32 *magic = (void *)raw_inode + 4529 4413 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; 4530 - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4414 + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= 4415 + EXT4_INODE_SIZE(inode->i_sb) && 4416 + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4531 4417 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4532 4418 ext4_find_inline_data_nolock(inode); 4533 4419 } else ··· 4552 4434 
struct inode *inode; 4553 4435 journal_t *journal = EXT4_SB(sb)->s_journal; 4554 4436 long ret; 4437 + loff_t size; 4555 4438 int block; 4556 4439 uid_t i_uid; 4557 4440 gid_t i_gid; ··· 4575 4456 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4576 4457 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4577 4458 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4578 - EXT4_INODE_SIZE(inode->i_sb)) { 4579 - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", 4580 - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, 4581 - EXT4_INODE_SIZE(inode->i_sb)); 4459 + EXT4_INODE_SIZE(inode->i_sb) || 4460 + (ei->i_extra_isize & 3)) { 4461 + EXT4_ERROR_INODE(inode, 4462 + "bad extra_isize %u (inode size %u)", 4463 + ei->i_extra_isize, 4464 + EXT4_INODE_SIZE(inode->i_sb)); 4582 4465 ret = -EFSCORRUPTED; 4583 4466 goto bad_inode; 4584 4467 } ··· 4655 4534 ei->i_file_acl |= 4656 4535 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4657 4536 inode->i_size = ext4_isize(raw_inode); 4537 + if ((size = i_size_read(inode)) < 0) { 4538 + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); 4539 + ret = -EFSCORRUPTED; 4540 + goto bad_inode; 4541 + } 4658 4542 ei->i_disksize = inode->i_size; 4659 4543 #ifdef CONFIG_QUOTA 4660 4544 ei->i_reserved_quota = 0; ··· 4703 4577 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4704 4578 if (ei->i_extra_isize == 0) { 4705 4579 /* The extra space is currently unused. Use it. */ 4580 + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); 4706 4581 ei->i_extra_isize = sizeof(struct ext4_inode) - 4707 4582 EXT4_GOOD_OLD_INODE_SIZE; 4708 4583 } else { ··· 5281 5154 * update c/mtime in shrink case below 5282 5155 */ 5283 5156 if (!shrink) { 5284 - inode->i_mtime = ext4_current_time(inode); 5157 + inode->i_mtime = current_time(inode); 5285 5158 inode->i_ctime = inode->i_mtime; 5286 5159 } 5287 5160 down_write(&EXT4_I(inode)->i_data_sem); ··· 5326 5199 * in data=journal mode to make pages freeable. 
5327 5200 */ 5328 5201 truncate_pagecache(inode, inode->i_size); 5329 - if (shrink) 5330 - ext4_truncate(inode); 5202 + if (shrink) { 5203 + rc = ext4_truncate(inode); 5204 + if (rc) 5205 + error = rc; 5206 + } 5331 5207 up_write(&EXT4_I(inode)->i_mmap_sem); 5332 5208 } 5333 5209 5334 - if (!rc) { 5210 + if (!error) { 5335 5211 setattr_copy(inode, attr); 5336 5212 mark_inode_dirty(inode); 5337 5213 } ··· 5346 5216 if (orphan && inode->i_nlink) 5347 5217 ext4_orphan_del(NULL, inode); 5348 5218 5349 - if (!rc && (ia_valid & ATTR_MODE)) 5219 + if (!error && (ia_valid & ATTR_MODE)) 5350 5220 rc = posix_acl_chmod(inode, inode->i_mode); 5351 5221 5352 5222 err_out: ··· 5585 5455 err = ext4_reserve_inode_write(handle, inode, &iloc); 5586 5456 if (err) 5587 5457 return err; 5588 - if (ext4_handle_valid(handle) && 5589 - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5458 + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5590 5459 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5591 5460 /* 5592 - * We need extra buffer credits since we may write into EA block 5461 + * In nojournal mode, we can immediately attempt to expand 5462 + * the inode. When journaled, we first need to obtain extra 5463 + * buffer credits since we may write into the EA block 5593 5464 * with this same handle. If journal_extend fails, then it will 5594 5465 * only result in a minor loss of functionality for that inode. 5595 5466 * If this is felt to be critical, then e2fsck should be run to 5596 5467 * force a large enough s_min_extra_isize. 
5597 5468 */ 5598 - if ((jbd2_journal_extend(handle, 5599 - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5469 + if (!ext4_handle_valid(handle) || 5470 + jbd2_journal_extend(handle, 5471 + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { 5600 5472 ret = ext4_expand_extra_isize(inode, 5601 5473 sbi->s_want_extra_isize, 5602 5474 iloc, handle); ··· 5752 5620 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5753 5621 } 5754 5622 ext4_set_aops(inode); 5623 + /* 5624 + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. 5625 + * E.g. S_DAX may get cleared / set. 5626 + */ 5627 + ext4_set_inode_flags(inode); 5755 5628 5756 5629 jbd2_journal_unlock_updates(journal); 5757 5630 percpu_up_write(&sbi->s_journal_flag_rwsem);
+44 -38
fs/ext4/ioctl.c
··· 153 153 154 154 swap_inode_data(inode, inode_bl); 155 155 156 - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); 156 + inode->i_ctime = inode_bl->i_ctime = current_time(inode); 157 157 158 158 spin_lock(&sbi->s_next_gen_lock); 159 159 inode->i_generation = sbi->s_next_generation++; ··· 191 191 return err; 192 192 } 193 193 194 + #ifdef CONFIG_EXT4_FS_ENCRYPTION 194 195 static int uuid_is_zero(__u8 u[16]) 195 196 { 196 197 int i; ··· 201 200 return 0; 202 201 return 1; 203 202 } 203 + #endif 204 204 205 205 static int ext4_ioctl_setflags(struct inode *inode, 206 206 unsigned int flags) ··· 250 248 err = -EOPNOTSUPP; 251 249 goto flags_out; 252 250 } 253 - } else if (oldflags & EXT4_EOFBLOCKS_FL) 254 - ext4_truncate(inode); 251 + } else if (oldflags & EXT4_EOFBLOCKS_FL) { 252 + err = ext4_truncate(inode); 253 + if (err) 254 + goto flags_out; 255 + } 255 256 256 257 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 257 258 if (IS_ERR(handle)) { ··· 270 265 for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { 271 266 if (!(mask & EXT4_FL_USER_MODIFIABLE)) 272 267 continue; 268 + /* These flags get special treatment later */ 269 + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) 270 + continue; 273 271 if (mask & flags) 274 272 ext4_set_inode_flag(inode, i); 275 273 else ··· 280 272 } 281 273 282 274 ext4_set_inode_flags(inode); 283 - inode->i_ctime = ext4_current_time(inode); 275 + inode->i_ctime = current_time(inode); 284 276 285 277 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 286 278 flags_err: ··· 376 368 } 377 369 378 370 EXT4_I(inode)->i_projid = kprojid; 379 - inode->i_ctime = ext4_current_time(inode); 371 + inode->i_ctime = current_time(inode); 380 372 out_dirty: 381 373 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 382 374 if (!err) ··· 416 408 xflags |= FS_XFLAG_PROJINHERIT; 417 409 return xflags; 418 410 } 411 + 412 + #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ 413 + FS_XFLAG_APPEND | 
FS_XFLAG_NODUMP | \ 414 + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) 419 415 420 416 /* Transfer xflags flags to internal */ 421 417 static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) ··· 465 453 if (get_user(flags, (int __user *) arg)) 466 454 return -EFAULT; 467 455 456 + if (flags & ~EXT4_FL_USER_VISIBLE) 457 + return -EOPNOTSUPP; 458 + /* 459 + * chattr(1) grabs flags via GETFLAGS, modifies the result and 460 + * passes that to SETFLAGS. So we cannot easily make SETFLAGS 461 + * more restrictive than just silently masking off visible but 462 + * not settable flags as we always did. 463 + */ 464 + flags &= EXT4_FL_USER_MODIFIABLE; 465 + if (ext4_mask_flags(inode->i_mode, flags) != flags) 466 + return -EOPNOTSUPP; 467 + 468 468 err = mnt_want_write_file(filp); 469 469 if (err) 470 470 return err; 471 - 472 - flags = ext4_mask_flags(inode->i_mode, flags); 473 471 474 472 inode_lock(inode); 475 473 err = ext4_ioctl_setflags(inode, flags); ··· 522 500 } 523 501 err = ext4_reserve_inode_write(handle, inode, &iloc); 524 502 if (err == 0) { 525 - inode->i_ctime = ext4_current_time(inode); 503 + inode->i_ctime = current_time(inode); 526 504 inode->i_generation = generation; 527 505 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 528 506 } ··· 787 765 } 788 766 case EXT4_IOC_PRECACHE_EXTENTS: 789 767 return ext4_ext_precache(inode); 790 - case EXT4_IOC_SET_ENCRYPTION_POLICY: { 791 - #ifdef CONFIG_EXT4_FS_ENCRYPTION 792 - struct fscrypt_policy policy; 793 768 769 + case EXT4_IOC_SET_ENCRYPTION_POLICY: 794 770 if (!ext4_has_feature_encrypt(sb)) 795 771 return -EOPNOTSUPP; 772 + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); 796 773 797 - if (copy_from_user(&policy, 798 - (struct fscrypt_policy __user *)arg, 799 - sizeof(policy))) 800 - return -EFAULT; 801 - return fscrypt_process_policy(filp, &policy); 802 - #else 803 - return -EOPNOTSUPP; 804 - #endif 805 - } 806 774 case EXT4_IOC_GET_ENCRYPTION_PWSALT: { 775 + #ifdef 
CONFIG_EXT4_FS_ENCRYPTION 807 776 int err, err2; 808 777 struct ext4_sb_info *sbi = EXT4_SB(sb); 809 778 handle_t *handle; 810 779 811 - if (!ext4_sb_has_crypto(sb)) 780 + if (!ext4_has_feature_encrypt(sb)) 812 781 return -EOPNOTSUPP; 813 782 if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { 814 783 err = mnt_want_write_file(filp); ··· 829 816 sbi->s_es->s_encrypt_pw_salt, 16)) 830 817 return -EFAULT; 831 818 return 0; 832 - } 833 - case EXT4_IOC_GET_ENCRYPTION_POLICY: { 834 - #ifdef CONFIG_EXT4_FS_ENCRYPTION 835 - struct fscrypt_policy policy; 836 - int err = 0; 837 - 838 - if (!ext4_encrypted_inode(inode)) 839 - return -ENOENT; 840 - err = fscrypt_get_policy(inode, &policy); 841 - if (err) 842 - return err; 843 - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) 844 - return -EFAULT; 845 - return 0; 846 819 #else 847 820 return -EOPNOTSUPP; 848 821 #endif 849 822 } 823 + case EXT4_IOC_GET_ENCRYPTION_POLICY: 824 + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); 825 + 850 826 case EXT4_IOC_FSGETXATTR: 851 827 { 852 828 struct fsxattr fa; ··· 867 865 if (!inode_owner_or_capable(inode)) 868 866 return -EACCES; 869 867 868 + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) 869 + return -EOPNOTSUPP; 870 + 871 + flags = ext4_xflags_to_iflags(fa.fsx_xflags); 872 + if (ext4_mask_flags(inode->i_mode, flags) != flags) 873 + return -EOPNOTSUPP; 874 + 870 875 err = mnt_want_write_file(filp); 871 876 if (err) 872 877 return err; 873 - 874 - flags = ext4_xflags_to_iflags(fa.fsx_xflags); 875 - flags = ext4_mask_flags(inode->i_mode, flags); 876 878 877 879 inode_lock(inode); 878 880 flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
+2 -2
fs/ext4/mballoc.c
··· 669 669 ext4_grpblk_t min; 670 670 ext4_grpblk_t max; 671 671 ext4_grpblk_t chunk; 672 - unsigned short border; 672 + unsigned int border; 673 673 674 674 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 675 675 ··· 2287 2287 struct ext4_group_info *grinfo; 2288 2288 struct sg { 2289 2289 struct ext4_group_info info; 2290 - ext4_grpblk_t counters[16]; 2290 + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; 2291 2291 } sg; 2292 2292 2293 2293 group--;
+13 -11
fs/ext4/namei.c
··· 1941 1941 * happen is that the times are slightly out of date 1942 1942 * and/or different from the directory change time. 1943 1943 */ 1944 - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); 1944 + dir->i_mtime = dir->i_ctime = current_time(dir); 1945 1945 ext4_update_dx_flag(dir); 1946 1946 dir->i_version++; 1947 1947 ext4_mark_inode_dirty(handle, dir); ··· 2987 2987 * recovery. */ 2988 2988 inode->i_size = 0; 2989 2989 ext4_orphan_add(handle, inode); 2990 - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); 2990 + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2991 2991 ext4_mark_inode_dirty(handle, inode); 2992 2992 ext4_dec_count(handle, dir); 2993 2993 ext4_update_dx_flag(dir); ··· 3050 3050 retval = ext4_delete_entry(handle, dir, de, bh); 3051 3051 if (retval) 3052 3052 goto end_unlink; 3053 - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); 3053 + dir->i_ctime = dir->i_mtime = current_time(dir); 3054 3054 ext4_update_dx_flag(dir); 3055 3055 ext4_mark_inode_dirty(handle, dir); 3056 3056 drop_nlink(inode); 3057 3057 if (!inode->i_nlink) 3058 3058 ext4_orphan_add(handle, inode); 3059 - inode->i_ctime = ext4_current_time(inode); 3059 + inode->i_ctime = current_time(inode); 3060 3060 ext4_mark_inode_dirty(handle, inode); 3061 3061 3062 3062 end_unlink: ··· 3254 3254 if (IS_DIRSYNC(dir)) 3255 3255 ext4_handle_sync(handle); 3256 3256 3257 - inode->i_ctime = ext4_current_time(inode); 3257 + inode->i_ctime = current_time(inode); 3258 3258 ext4_inc_count(handle, inode); 3259 3259 ihold(inode); 3260 3260 ··· 3381 3381 ent->de->file_type = file_type; 3382 3382 ent->dir->i_version++; 3383 3383 ent->dir->i_ctime = ent->dir->i_mtime = 3384 - ext4_current_time(ent->dir); 3384 + current_time(ent->dir); 3385 3385 ext4_mark_inode_dirty(handle, ent->dir); 3386 3386 BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); 3387 3387 if (!ent->inlined) { ··· 3651 3651 * Like most other Unix systems, set the ctime for 
inodes on a 3652 3652 * rename. 3653 3653 */ 3654 - old.inode->i_ctime = ext4_current_time(old.inode); 3654 + old.inode->i_ctime = current_time(old.inode); 3655 3655 ext4_mark_inode_dirty(handle, old.inode); 3656 3656 3657 3657 if (!whiteout) { ··· 3663 3663 3664 3664 if (new.inode) { 3665 3665 ext4_dec_count(handle, new.inode); 3666 - new.inode->i_ctime = ext4_current_time(new.inode); 3666 + new.inode->i_ctime = current_time(new.inode); 3667 3667 } 3668 - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); 3668 + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); 3669 3669 ext4_update_dx_flag(old.dir); 3670 3670 if (old.dir_bh) { 3671 3671 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); ··· 3723 3723 }; 3724 3724 u8 new_file_type; 3725 3725 int retval; 3726 + struct timespec ctime; 3726 3727 3727 3728 if ((ext4_encrypted_inode(old_dir) || 3728 3729 ext4_encrypted_inode(new_dir)) && ··· 3824 3823 * Like most other Unix systems, set the ctime for inodes on a 3825 3824 * rename. 3826 3825 */ 3827 - old.inode->i_ctime = ext4_current_time(old.inode); 3828 - new.inode->i_ctime = ext4_current_time(new.inode); 3826 + ctime = current_time(old.inode); 3827 + old.inode->i_ctime = ctime; 3828 + new.inode->i_ctime = ctime; 3829 3829 ext4_mark_inode_dirty(handle, old.inode); 3830 3830 ext4_mark_inode_dirty(handle, new.inode); 3831 3831
+2 -1
fs/ext4/page-io.c
··· 470 470 gfp_t gfp_flags = GFP_NOFS; 471 471 472 472 retry_encrypt: 473 - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); 473 + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, 474 + page->index, gfp_flags); 474 475 if (IS_ERR(data_page)) { 475 476 ret = PTR_ERR(data_page); 476 477 if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
+103 -49
fs/ext4/super.c
··· 863 863 percpu_counter_destroy(&sbi->s_dirs_counter); 864 864 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 865 865 percpu_free_rwsem(&sbi->s_journal_flag_rwsem); 866 - brelse(sbi->s_sbh); 867 866 #ifdef CONFIG_QUOTA 868 867 for (i = 0; i < EXT4_MAXQUOTAS; i++) 869 868 kfree(sbi->s_qf_names[i]); ··· 894 895 } 895 896 if (sbi->s_mmp_tsk) 896 897 kthread_stop(sbi->s_mmp_tsk); 898 + brelse(sbi->s_sbh); 897 899 sb->s_fs_info = NULL; 898 900 /* 899 901 * Now that we are completely done shutting down the ··· 1114 1114 static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, 1115 1115 void *fs_data) 1116 1116 { 1117 - handle_t *handle; 1118 - int res, res2; 1117 + handle_t *handle = fs_data; 1118 + int res, res2, retries = 0; 1119 1119 1120 - /* fs_data is null when internally used. */ 1121 - if (fs_data) { 1122 - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, 1123 - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, 1124 - len, 0); 1120 + /* 1121 + * If a journal handle was specified, then the encryption context is 1122 + * being set on a new inode via inheritance and is part of a larger 1123 + * transaction to create the inode. Otherwise the encryption context is 1124 + * being set on an existing inode in its own transaction. Only in the 1125 + * latter case should the "retry on ENOSPC" logic be used. 1126 + */ 1127 + 1128 + if (handle) { 1129 + res = ext4_xattr_set_handle(handle, inode, 1130 + EXT4_XATTR_INDEX_ENCRYPTION, 1131 + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, 1132 + ctx, len, 0); 1125 1133 if (!res) { 1126 1134 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); 1127 1135 ext4_clear_inode_state(inode, 1128 1136 EXT4_STATE_MAY_INLINE_DATA); 1137 + /* 1138 + * Update inode->i_flags - e.g. 
S_DAX may get disabled
+	 */
+	ext4_set_inode_flags(inode);
 	}
 	return res;
 }

+retry:
 	handle = ext4_journal_start(inode, EXT4_HT_MISC,
 			ext4_jbd2_credits_xattr(inode));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);

-	res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-			EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
-			len, 0);
+	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				    ctx, len, 0);
 	if (!res) {
 		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+		/* Update inode->i_flags - e.g. S_DAX may get disabled */
+		ext4_set_inode_flags(inode);
 		res = ext4_mark_inode_dirty(handle, inode);
 		if (res)
 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
 	}
 	res2 = ext4_journal_stop(handle);
+
+	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 	if (!res)
 		res = res2;
 	return res;
···
 			return 0;
 		}
 	}
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
-	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
-			 "in data=ordered mode");
-		return 0;
-	}
 	return 1;
 }
···
 			  struct ext4_super_block *es)
 {
 	unsigned int s_flags = sb->s_flags;
-	int nr_orphans = 0, nr_truncates = 0;
+	int ret, nr_orphans = 0, nr_truncates = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
···
 				inode->i_ino, inode->i_size);
 			inode_lock(inode);
 			truncate_inode_pages(inode->i_mapping, inode->i_size);
-			ext4_truncate(inode);
+			ret = ext4_truncate(inode);
+			if (ret)
+				ext4_std_error(inode->i_sb, ret);
 			inode_unlock(inode);
 			nr_truncates++;
 		} else {
···
 			ext4_set_bit(s++, buf);
 			count++;
 		}
-		for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
-			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
-			count++;
+		j = ext4_bg_num_gdb(sb, grp);
+		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
+			ext4_error(sb, "Invalid number of block group "
+				   "descriptor blocks: %d", j);
+			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
 		}
+		count += j;
+		for (; j > 0; j--)
+			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
 	}
 	if (!count)
 		return 0;
···
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
-	struct ext4_sb_info *sbi;
+	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	ext4_fsblk_t block;
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
···
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	ext4_group_t first_not_zeroed;

-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		goto out_free_orig;
+	if ((data && !orig_data) || !sbi)
+		goto out_free_base;

 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
-	if (!sbi->s_blockgroup_lock) {
-		kfree(sbi);
-		goto out_free_orig;
-	}
+	if (!sbi->s_blockgroup_lock)
+		goto out_free_base;
+
 	sb->s_fs_info = sbi;
 	sbi->s_sb = sb;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
···
 	 */
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;

-	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-			   &journal_devnum, &journal_ioprio, 0)) {
-		ext4_msg(sb, KERN_WARNING,
-			 "failed to parse options in superblock: %s",
-			 sbi->s_es->s_mount_opts);
+	if (sbi->s_es->s_mount_opts[0]) {
+		char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+					      sizeof(sbi->s_es->s_mount_opts),
+					      GFP_KERNEL);
+		if (!s_mount_opts)
+			goto failed_mount;
+		if (!parse_options(s_mount_opts, sb, &journal_devnum,
+				   &journal_ioprio, 0)) {
+			ext4_msg(sb, KERN_WARNING,
+				 "failed to parse options in superblock: %s",
+				 s_mount_opts);
+		}
+		kfree(s_mount_opts);
 	}
 	sbi->s_def_mount_opt = sbi->s_mount_opt;
 	if (!parse_options((char *) data, sb, &journal_devnum,
···
 			ext4_msg(sb, KERN_ERR, "can't mount with "
 				 "both data=journal and dax");
 			goto failed_mount;
+		}
+		if (ext4_has_feature_encrypt(sb)) {
+			ext4_msg(sb, KERN_WARNING,
+				 "encrypted files will use data=ordered "
+				 "instead of data journaling mode");
 		}
 		if (test_opt(sb, DELALLOC))
 			clear_opt(sb, DELALLOC);
···

 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
-		goto cantfind_ext4;

 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
 		goto cantfind_ext4;
+	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+	    sbi->s_inodes_per_group > blocksize * 8) {
+		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+			 sbi->s_blocks_per_group);
+		goto failed_mount;
+	}
 	sbi->s_itb_per_group = sbi->s_inodes_per_group /
 					sbi->s_inodes_per_block;
 	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
···
 	}
 	sbi->s_cluster_ratio = clustersize / blocksize;

-	if (sbi->s_inodes_per_group > blocksize * 8) {
-		ext4_msg(sb, KERN_ERR,
-			 "#inodes per group too big: %lu",
-			 sbi->s_inodes_per_group);
-		goto failed_mount;
-	}
-
 	/* Do we have standard group size of clustersize * 8 blocks ? */
 	if (sbi->s_blocks_per_group == clustersize << 3)
 		set_opt2(sb, STD_GROUP_SIZE);
···
 		       (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
+	if (ext4_has_feature_meta_bg(sb)) {
+		if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+			ext4_msg(sb, KERN_WARNING,
+				 "first meta block group too large: %u "
+				 "(group descriptor block count %u)",
+				 le32_to_cpu(es->s_first_meta_bg), db_count);
+			goto failed_mount;
+		}
+	}
 	sbi->s_group_desc = ext4_kvmalloc(db_count *
 					  sizeof(struct buffer_head *),
 					  GFP_KERNEL);
···
 	default:
 		break;
 	}
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		ext4_msg(sb, KERN_ERR, "can't mount with "
+			 "journal_async_commit in data=ordered mode");
+		goto failed_mount_wq;
+	}
+
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
···

 	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
 		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-			 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+			 "Opts: %.*s%s%s", descr,
+			 (int) sizeof(sbi->s_es->s_mount_opts),
+			 sbi->s_es->s_mount_opts,
 			 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);

 	if (es->s_error_count)
···
 out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
+out_free_base:
 	kfree(sbi);
-out_free_orig:
 	kfree(orig_data);
 	return err ? err : ret;
 }
···
 				&EXT4_SB(sb)->s_freeinodes_counter));
 	BUFFER_TRACE(sbh, "marking dirty");
 	ext4_superblock_csum_set(sb);
-	lock_buffer(sbh);
+	if (sync)
+		lock_buffer(sbh);
 	if (buffer_write_io_error(sbh)) {
 		/*
 		 * Oh, dear. A previous attempt to write the
···
 		set_buffer_uptodate(sbh);
 	}
 	mark_buffer_dirty(sbh);
-	unlock_buffer(sbh);
 	if (sync) {
+		unlock_buffer(sbh);
 		error = __sync_dirty_buffer(sbh,
 			test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
 		if (error)
···
 		if (test_opt(sb, DAX)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
 				 "both data=journal and dax");
+			err = -EINVAL;
+			goto restore_opts;
+		}
+	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
+		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "journal_async_commit in data=ordered mode");
 			err = -EINVAL;
 			goto restore_opts;
 		}
···
 	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
 	if (IS_ERR(handle))
 		goto out;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
fs/ext4/xattr.c  (+29 -16)
···
 {
 	struct ext4_xattr_entry *e = entry;

+	/* Find the end of the names list */
 	while (!IS_LAST_ENTRY(e)) {
 		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
 		if ((void *)next >= end)
···
 		e = next;
 	}

+	/* Check the values */
 	while (!IS_LAST_ENTRY(entry)) {
 		if (entry->e_value_block != 0)
 			return -EFSCORRUPTED;
-		if (entry->e_value_size != 0 &&
-		    (value_start + le16_to_cpu(entry->e_value_offs) <
-		     (void *)e + sizeof(__u32) ||
-		     value_start + le16_to_cpu(entry->e_value_offs) +
-		    le32_to_cpu(entry->e_value_size) > end))
-			return -EFSCORRUPTED;
+		if (entry->e_value_size != 0) {
+			u16 offs = le16_to_cpu(entry->e_value_offs);
+			u32 size = le32_to_cpu(entry->e_value_size);
+			void *value;
+
+			/*
+			 * The value cannot overlap the names, and the value
+			 * with padding cannot extend beyond 'end'.  Check both
+			 * the padded and unpadded sizes, since the size may
+			 * overflow to 0 when adding padding.
+			 */
+			if (offs > end - value_start)
+				return -EFSCORRUPTED;
+			value = value_start + offs;
+			if (value < (void *)e + sizeof(u32) ||
+			    size > end - value ||
+			    EXT4_XATTR_SIZE(size) > end - value)
+				return -EFSCORRUPTED;
+		}
 		entry = EXT4_XATTR_NEXT(entry);
 	}
···
 __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
 		    void *end, const char *function, unsigned int line)
 {
-	struct ext4_xattr_entry *entry = IFIRST(header);
 	int error = -EFSCORRUPTED;

-	if (((void *) header >= end) ||
+	if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
 	    (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
 		goto errout;
-	error = ext4_xattr_check_names(entry, end, entry);
+	error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
 errout:
 	if (error)
 		__ext4_error_inode(inode, function, line, 0,
···
 	return 0;
 }

-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+static int ext4_xattr_ibody_set(struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is)
 {
···
 	}
 	if (!value) {
 		if (!is.s.not_found)
-			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+			error = ext4_xattr_ibody_set(inode, &i, &is);
 		else if (!bs.s.not_found)
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
 	} else {
···
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;

-		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		error = ext4_xattr_ibody_set(inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
···
 				goto cleanup;
 			if (!is.s.not_found) {
 				i.value = NULL;
-				error = ext4_xattr_ibody_set(handle, inode, &i,
-							     &is);
+				error = ext4_xattr_ibody_set(inode, &i, &is);
 			}
 		}
 	}
 	if (!error) {
 		ext4_xattr_update_super_block(handle, inode->i_sb);
-		inode->i_ctime = ext4_current_time(inode);
+		inode->i_ctime = current_time(inode);
 		if (!value)
 			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
···
 		goto out;

 	/* Remove the chosen entry from the inode */
-	error = ext4_xattr_ibody_set(handle, inode, &i, is);
+	error = ext4_xattr_ibody_set(inode, &i, is);
 	if (error)
 		goto out;
fs/f2fs/data.c  (+3 -1)
···
 					fio->old_blkaddr);
retry_encrypt:
	fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-							gfp_flags);
+						   PAGE_SIZE, 0,
+						   fio->page->index,
+						   gfp_flags);
 	if (IS_ERR(fio->encrypted_page)) {
 		err = PTR_ERR(fio->encrypted_page);
 		if (err == -ENOMEM) {
fs/f2fs/f2fs.h  (+2 -2)
···
 #define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
 #define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
 #define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy		fscrypt_notsupp_process_policy
-#define fscrypt_get_policy		fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
 #define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
 #define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
 #define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
fs/f2fs/file.c  (+2 -17)
···
 static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
-	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
-
-	if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
-				sizeof(policy)))
-		return -EFAULT;

 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);

-	return fscrypt_process_policy(filp, &policy);
+	return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
 }

 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
-	struct fscrypt_policy policy;
-	struct inode *inode = file_inode(filp);
-	int err;
-
-	err = fscrypt_get_policy(inode, &policy);
-	if (err)
-		return err;
-
-	if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
-		return -EFAULT;
-	return 0;
+	return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 }

 static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
fs/iomap.c  (+3 -2)
···
 	offset = page_offset(page);
 	while (length > 0) {
-		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
-				ops, page, iomap_page_mkwrite_actor);
+		ret = iomap_apply(inode, offset, length,
+				IOMAP_WRITE | IOMAP_FAULT, ops, page,
+				iomap_page_mkwrite_actor);
 		if (unlikely(ret <= 0))
 			goto out_unlock;
 		offset += ret;
fs/mbcache.c  (+18 -23)
···
 	/* log2 of hash table size */
 	int			c_bucket_bits;
 	/* Maximum entries in cache to avoid degrading hash too much */
-	int			c_max_entries;
+	unsigned long		c_max_entries;
 	/* Protects c_list, c_entry_count */
 	spinlock_t		c_list_lock;
 	struct list_head	c_list;
···
 static struct kmem_cache *mb_entry_cache;

 static unsigned long mb_cache_shrink(struct mb_cache *cache,
-				     unsigned int nr_to_scan);
+				     unsigned long nr_to_scan);

 static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
 							u32 key)
···
 }

 /*
- * mb_cache_entry_find_first - find the first entry in cache with given key
+ * mb_cache_entry_find_first - find the first reusable entry with the given key
  * @cache: cache where we should search
  * @key: key to look for
  *
- * Search in @cache for entry with key @key. Grabs reference to the first
- * entry found and returns the entry.
+ * Search in @cache for a reusable entry with key @key. Grabs reference to the
+ * first reusable entry found and returns the entry.
  */
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						 u32 key)
···
 EXPORT_SYMBOL(mb_cache_entry_find_first);

 /*
- * mb_cache_entry_find_next - find next entry in cache with the same
+ * mb_cache_entry_find_next - find next reusable entry with the same key
  * @cache: cache where we should search
  * @entry: entry to start search from
  *
- * Finds next entry in the hash chain which has the same key as @entry.
- * If @entry is unhashed (which can happen when deletion of entry races
- * with the search), finds the first entry in the hash chain. The function
- * drops reference to @entry and returns with a reference to the found entry.
+ * Finds next reusable entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races with the
+ * search), finds the first reusable entry in the hash chain. The function drops
+ * reference to @entry and returns with a reference to the found entry.
  */
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 						struct mb_cache_entry *entry)
···
 /* Shrink number of entries in cache */
 static unsigned long mb_cache_shrink(struct mb_cache *cache,
-				     unsigned int nr_to_scan)
+				     unsigned long nr_to_scan)
 {
 	struct mb_cache_entry *entry;
 	struct hlist_bl_head *head;
-	unsigned int shrunk = 0;
+	unsigned long shrunk = 0;

 	spin_lock(&cache->c_list_lock);
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
···
 				 struct mb_cache_entry, e_list);
 		if (entry->e_referenced) {
 			entry->e_referenced = 0;
-			list_move_tail(&cache->c_list, &entry->e_list);
+			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
 		}
 		list_del_init(&entry->e_list);
···
 static unsigned long mb_cache_scan(struct shrinker *shrink,
 				   struct shrink_control *sc)
 {
-	int nr_to_scan = sc->nr_to_scan;
 	struct mb_cache *cache = container_of(shrink, struct mb_cache,
 					      c_shrink);
-	return mb_cache_shrink(cache, nr_to_scan);
+	return mb_cache_shrink(cache, sc->nr_to_scan);
 }

 /* We shrink 1/X of the cache when we have too many entries in it */
···
 struct mb_cache *mb_cache_create(int bucket_bits)
 {
 	struct mb_cache *cache;
-	int bucket_count = 1 << bucket_bits;
-	int i;
-
-	if (!try_module_get(THIS_MODULE))
-		return NULL;
+	unsigned long bucket_count = 1UL << bucket_bits;
+	unsigned long i;

 	cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
 	if (!cache)
···
 	return cache;

err_out:
-	module_put(THIS_MODULE);
 	return NULL;
 }
 EXPORT_SYMBOL(mb_cache_create);
···
 	}
 	kfree(cache->c_hash);
 	kfree(cache);
-	module_put(THIS_MODULE);
 }
 EXPORT_SYMBOL(mb_cache_destroy);
···
 	mb_entry_cache = kmem_cache_create("mbcache",
 				sizeof(struct mb_cache_entry), 0,
 				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
-	BUG_ON(!mb_entry_cache);
+	if (!mb_entry_cache)
+		return -ENOMEM;
 	return 0;
 }
fs/xfs/xfs_aops.c  (+5 -21)
···
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	bool			direct,
-	bool			dax_fault)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
···
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
-		if (create) {
-			if (dax_fault)
-				ASSERT(!ISUNWRITTEN(&imap));
-			else
-				xfs_map_direct(inode, bh_result, &imap, offset,
-					       is_cow);
-		}
+		if (create)
+			xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
 	}

 /*
···
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }

 int
···
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
-}
-
-int
-xfs_get_blocks_dax_fault(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }

 /*
fs/xfs/xfs_aops.h  (-3)
···
 			struct buffer_head *map_bh, int create);
 int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 			struct buffer_head *map_bh, int create);
-int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
-			struct buffer_head *map_bh, int create);
-
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
 int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
fs/xfs/xfs_file.c  (+5 -5)
···
 		return 0; /* skip atime */

 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
+	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

 	file_accessed(iocb->ki_filp);
···
 	trace_xfs_file_dax_write(ip, count, pos);

-	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
···
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

 	if (IS_DAX(inode)) {
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
···
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
···
 	}

 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

 	if (flags & FAULT_FLAG_WRITE)
include/linux/dax.h  (+45 -15)
···
 struct iomap_ops;

-/* We use lowest available exceptional entry bit for locking */
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a huge zero
+ * page (HZP) or an empty entry that is just used for locking.  In total four
+ * special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
+ * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

-ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+static inline unsigned long dax_radix_sector(void *entry)
+{
+	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+			((unsigned long)sector << RADIX_DAX_SHIFT) |
+			RADIX_DAX_ENTRY_LOCK);
+}
+
+ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
-		  get_block_t, dio_iodone_t, int flags);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		struct iomap_ops *ops);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-		pgoff_t index, bool wake_all);
+		pgoff_t index, void *entry, bool wake_all);

 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
···
 }
 #endif

-#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-		unsigned int flags, get_block_t);
+#ifdef CONFIG_FS_DAX_PMD
+static inline unsigned int dax_radix_order(void *entry)
+{
+	if ((unsigned long)entry & RADIX_DAX_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	return 0;
+}
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
 #else
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-				pmd_t *pmd, unsigned int flags, get_block_t gb)
+static inline unsigned int dax_radix_order(void *entry)
+{
+	return 0;
+}
+static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)

 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
include/linux/fscrypto.h  (+34 -100)
···
 #include <crypto/skcipher.h>
 #include <uapi/linux/fs.h>

-#define FS_KEY_DERIVATION_NONCE_SIZE		16
-#define FS_ENCRYPTION_CONTEXT_FORMAT_V1		1
+#define FS_CRYPTO_BLOCK_SIZE		16

-#define FS_POLICY_FLAGS_PAD_4		0x00
-#define FS_POLICY_FLAGS_PAD_8		0x01
-#define FS_POLICY_FLAGS_PAD_16		0x02
-#define FS_POLICY_FLAGS_PAD_32		0x03
-#define FS_POLICY_FLAGS_PAD_MASK	0x03
-#define FS_POLICY_FLAGS_VALID		0x03
-
-/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID	0
-#define FS_ENCRYPTION_MODE_AES_256_XTS	1
-#define FS_ENCRYPTION_MODE_AES_256_GCM	2
-#define FS_ENCRYPTION_MODE_AES_256_CBC	3
-#define FS_ENCRYPTION_MODE_AES_256_CTS	4
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- *  1 byte: Protector format (1 = this version)
- *  1 byte: File contents encryption mode
- *  1 byte: File names encryption mode
- *  1 byte: Flags
- *  8 bytes: Master Key descriptor
- *  16 bytes: Encryption Key derivation nonce
- */
-struct fscrypt_context {
-	u8 format;
-	u8 contents_encryption_mode;
-	u8 filenames_encryption_mode;
-	u8 flags;
-	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
-} __packed;
-
-/* Encryption parameters */
-#define FS_XTS_TWEAK_SIZE		16
-#define FS_AES_128_ECB_KEY_SIZE		16
-#define FS_AES_256_GCM_KEY_SIZE		32
-#define FS_AES_256_CBC_KEY_SIZE		32
-#define FS_AES_256_CTS_KEY_SIZE		32
-#define FS_AES_256_XTS_KEY_SIZE		64
-#define FS_MAX_KEY_SIZE			64
-
-#define FS_KEY_DESC_PREFIX		"fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE		8
-
-/* This is passed in from userspace into the kernel keyring */
-struct fscrypt_key {
-	u32 mode;
-	u8 raw[FS_MAX_KEY_SIZE];
-	u32 size;
-} __packed;
-
-struct fscrypt_info {
-	u8 ci_data_mode;
-	u8 ci_filename_mode;
-	u8 ci_flags;
-	struct crypto_skcipher *ci_ctfm;
-	struct key *ci_keyring_key;
-	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
-#define FS_WRITE_PATH_FL			0x00000002
+struct fscrypt_info;

 struct fscrypt_ctx {
 	union {
···
 	u8 flags;				/* Flags */
 	u8 mode;				/* Encryption mode for tfm */
 };
-
-struct fscrypt_completion_result {
-	struct completion completion;
-	int res;
-};
-
-#define DECLARE_FS_COMPLETION_RESULT(ecr) \
-	struct fscrypt_completion_result ecr = { \
-		COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-#define FS_FNAME_NUM_SCATTER_ENTRIES	4
-#define FS_CRYPTO_BLOCK_SIZE		16
-#define FS_FNAME_CRYPTO_DIGEST_SIZE	32

 /**
  * For encrypted symlinks, the ciphertext length is stored at the beginning
···
 #define fname_len(p)		((p)->disk_name.len)

 /*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES	(1U << 1)
+
+/*
  * crypto opertions for filesystems
  */
 struct fscrypt_operations {
+	unsigned int flags;
 	int (*get_context)(struct inode *, void *, size_t);
 	int (*key_prefix)(struct inode *, u8 **);
 	int (*prepare_context)(struct inode *);
···
 #endif
 }

-static inline int fscrypt_has_encryption_key(struct inode *inode)
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
 {
 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
 	return (inode->i_crypt_info != NULL);
···
 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
-int fscrypt_initialize(void);
-
-extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t);
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t);
-extern int fscrypt_decrypt_page(struct page *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+				unsigned int, unsigned int,
+				u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+				unsigned int, u64);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
-extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t,
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
 				unsigned int);
 /* policy.c */
-extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *);
-extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *);
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
 extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
 extern int fscrypt_inherit_context(struct inode *, struct inode *,
 					void *, bool);
 /* keyinfo.c */
-extern int get_crypt_info(struct inode *);
 extern int fscrypt_get_encryption_info(struct inode *);
 extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
···
 extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
 				int lookup, struct fscrypt_name *);
 extern void fscrypt_free_filename(struct fscrypt_name *);
-extern u32 fscrypt_fname_encrypted_size(struct inode *, u32);
-extern int fscrypt_fname_alloc_buffer(struct inode *, u32,
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
 				struct fscrypt_str *);
 extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
 extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
···
 #endif

 /* crypto.c */
-static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i,
+static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(const struct inode *i,
 							gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
···
 	return;
 }

-static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
-						struct page *p, gfp_t f)
+static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i,
+						struct page *p,
+						unsigned int len,
+						unsigned int offs,
+						u64 lblk_num, gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }

-static inline int fscrypt_notsupp_decrypt_page(struct page *p)
+static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p,
+						unsigned int len, unsigned int offs,
+						u64 lblk_num)
 {
 	return -EOPNOTSUPP;
 }
···
 	return;
 }

-static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p,
+static inline int fscrypt_notsupp_zeroout_range(const struct inode *i, pgoff_t p,
 					sector_t s, unsigned int f)
 {
 	return -EOPNOTSUPP;
 }

 /* policy.c */
-static inline int fscrypt_notsupp_process_policy(struct file *f,
-				const struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f,
+				const void __user *arg)
 {
 	return -EOPNOTSUPP;
 }

-static inline int fscrypt_notsupp_get_policy(struct inode *i,
-				struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f,
+				void __user *arg)
 {
 	return -EOPNOTSUPP;
 }
include/linux/iomap.h  (+1)
···
 #define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
 #define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
 #define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
+#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */

 struct iomap_ops {
 	/*
include/uapi/linux/fs.h  (+14)
···
 /* Policy provided via an ioctl on the topmost directory */
 #define FS_KEY_DESCRIPTOR_SIZE	8

+#define FS_POLICY_FLAGS_PAD_4		0x00
+#define FS_POLICY_FLAGS_PAD_8		0x01
+#define FS_POLICY_FLAGS_PAD_16		0x02
+#define FS_POLICY_FLAGS_PAD_32		0x03
+#define FS_POLICY_FLAGS_PAD_MASK	0x03
+#define FS_POLICY_FLAGS_VALID		0x03
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID	0
+#define FS_ENCRYPTION_MODE_AES_256_XTS	1
+#define FS_ENCRYPTION_MODE_AES_256_GCM	2
+#define FS_ENCRYPTION_MODE_AES_256_CBC	3
+#define FS_ENCRYPTION_MODE_AES_256_CTS	4
+
 struct fscrypt_policy {
 	__u8 version;
 	__u8 contents_encryption_mode;
mm/filemap.c  (+2 -3)
···
 	} else {
 		/* DAX can replace empty locked entry with a hole */
 		WARN_ON_ONCE(p !=
-			(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-				 RADIX_DAX_ENTRY_LOCK));
+			dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
 		/* Wakeup waiters for exceptional entry lock */
-		dax_wake_mapping_entry_waiter(mapping, page->index,
+		dax_wake_mapping_entry_waiter(mapping, page->index, p,
 					      false);
 	}
 }