Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: move ext4_ind_* functions from inode.c to indirect.c

This patch moves functions from inode.c to indirect.c.
The moved functions are ext4_ind_* functions and their helpers.
Functions called from inode.c are declared extern.

Signed-off-by: Amir Goldstein <amir73il@users.sf.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Amir Goldstein and committed by Theodore Ts'o
dae1e52c 9f125d64

+1521 -1487
+1 -1
fs/ext4/Makefile
··· 7 7 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 8 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 9 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 10 - mmp.o 10 + mmp.o indirect.o 11 11 12 12 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 13 13 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+1
fs/ext4/block_validity.c
··· 266 266 } 267 267 return 0; 268 268 } 269 +
+9
fs/ext4/ext4.h
··· 1834 1834 extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1835 1835 extern void ext4_da_update_reserve_space(struct inode *inode, 1836 1836 int used, int quota_claim); 1837 + 1838 + /* indirect.c */ 1839 + extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 1840 + struct ext4_map_blocks *map, int flags); 1841 + extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 1842 + const struct iovec *iov, loff_t offset, 1843 + unsigned long nr_segs); 1844 + extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 1845 + extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); 1837 1846 extern void ext4_ind_truncate(struct inode *inode); 1838 1847 1839 1848 /* ioctl.c */
+1510
fs/ext4/indirect.c
··· 1 + /* 2 + * linux/fs/ext4/indirect.c 3 + * 4 + * from 5 + * 6 + * linux/fs/ext4/inode.c 7 + * 8 + * Copyright (C) 1992, 1993, 1994, 1995 9 + * Remy Card (card@masi.ibp.fr) 10 + * Laboratoire MASI - Institut Blaise Pascal 11 + * Universite Pierre et Marie Curie (Paris VI) 12 + * 13 + * from 14 + * 15 + * linux/fs/minix/inode.c 16 + * 17 + * Copyright (C) 1991, 1992 Linus Torvalds 18 + * 19 + * Goal-directed block allocation by Stephen Tweedie 20 + * (sct@redhat.com), 1993, 1998 21 + */ 22 + 23 + #include <linux/module.h> 24 + #include "ext4_jbd2.h" 25 + #include "truncate.h" 26 + 27 + #include <trace/events/ext4.h> 28 + 29 + typedef struct { 30 + __le32 *p; 31 + __le32 key; 32 + struct buffer_head *bh; 33 + } Indirect; 34 + 35 + static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 36 + { 37 + p->key = *(p->p = v); 38 + p->bh = bh; 39 + } 40 + 41 + /** 42 + * ext4_block_to_path - parse the block number into array of offsets 43 + * @inode: inode in question (we are only interested in its superblock) 44 + * @i_block: block number to be parsed 45 + * @offsets: array to store the offsets in 46 + * @boundary: set this non-zero if the referred-to block is likely to be 47 + * followed (on disk) by an indirect block. 48 + * 49 + * To store the locations of file's data ext4 uses a data structure common 50 + * for UNIX filesystems - tree of pointers anchored in the inode, with 51 + * data blocks at leaves and indirect blocks in intermediate nodes. 52 + * This function translates the block number into path in that tree - 53 + * return value is the path length and @offsets[n] is the offset of 54 + * pointer to (n+1)th node in the nth one. If @block is out of range 55 + * (negative or too large) warning is printed and zero returned. 56 + * 57 + * Note: function doesn't find node addresses, so no IO is needed. All 58 + * we need to know is the capacity of indirect blocks (taken from the 59 + * inode->i_sb). 
60 + */ 61 + 62 + /* 63 + * Portability note: the last comparison (check that we fit into triple 64 + * indirect block) is spelled differently, because otherwise on an 65 + * architecture with 32-bit longs and 8Kb pages we might get into trouble 66 + * if our filesystem had 8Kb blocks. We might use long long, but that would 67 + * kill us on x86. Oh, well, at least the sign propagation does not matter - 68 + * i_block would have to be negative in the very beginning, so we would not 69 + * get there at all. 70 + */ 71 + 72 + static int ext4_block_to_path(struct inode *inode, 73 + ext4_lblk_t i_block, 74 + ext4_lblk_t offsets[4], int *boundary) 75 + { 76 + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 77 + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 78 + const long direct_blocks = EXT4_NDIR_BLOCKS, 79 + indirect_blocks = ptrs, 80 + double_blocks = (1 << (ptrs_bits * 2)); 81 + int n = 0; 82 + int final = 0; 83 + 84 + if (i_block < direct_blocks) { 85 + offsets[n++] = i_block; 86 + final = direct_blocks; 87 + } else if ((i_block -= direct_blocks) < indirect_blocks) { 88 + offsets[n++] = EXT4_IND_BLOCK; 89 + offsets[n++] = i_block; 90 + final = ptrs; 91 + } else if ((i_block -= indirect_blocks) < double_blocks) { 92 + offsets[n++] = EXT4_DIND_BLOCK; 93 + offsets[n++] = i_block >> ptrs_bits; 94 + offsets[n++] = i_block & (ptrs - 1); 95 + final = ptrs; 96 + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 97 + offsets[n++] = EXT4_TIND_BLOCK; 98 + offsets[n++] = i_block >> (ptrs_bits * 2); 99 + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 100 + offsets[n++] = i_block & (ptrs - 1); 101 + final = ptrs; 102 + } else { 103 + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", 104 + i_block + direct_blocks + 105 + indirect_blocks + double_blocks, inode->i_ino); 106 + } 107 + if (boundary) 108 + *boundary = final - 1 - (i_block & (ptrs - 1)); 109 + return n; 110 + } 111 + 112 + /** 113 + * ext4_get_branch - read the chain of indirect 
blocks leading to data 114 + * @inode: inode in question 115 + * @depth: depth of the chain (1 - direct pointer, etc.) 116 + * @offsets: offsets of pointers in inode/indirect blocks 117 + * @chain: place to store the result 118 + * @err: here we store the error value 119 + * 120 + * Function fills the array of triples <key, p, bh> and returns %NULL 121 + * if everything went OK or the pointer to the last filled triple 122 + * (incomplete one) otherwise. Upon the return chain[i].key contains 123 + * the number of (i+1)-th block in the chain (as it is stored in memory, 124 + * i.e. little-endian 32-bit), chain[i].p contains the address of that 125 + * number (it points into struct inode for i==0 and into the bh->b_data 126 + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 127 + * block for i>0 and NULL for i==0. In other words, it holds the block 128 + * numbers of the chain, addresses they were taken from (and where we can 129 + * verify that chain did not change) and buffer_heads hosting these 130 + * numbers. 131 + * 132 + * Function stops when it stumbles upon zero pointer (absent block) 133 + * (pointer to last triple returned, *@err == 0) 134 + * or when it gets an IO error reading an indirect block 135 + * (ditto, *@err == -EIO) 136 + * or when it reads all @depth-1 indirect blocks successfully and finds 137 + * the whole chain, all way to the data (returns %NULL, *err == 0). 
138 + * 139 + * Need to be called with 140 + * down_read(&EXT4_I(inode)->i_data_sem) 141 + */ 142 + static Indirect *ext4_get_branch(struct inode *inode, int depth, 143 + ext4_lblk_t *offsets, 144 + Indirect chain[4], int *err) 145 + { 146 + struct super_block *sb = inode->i_sb; 147 + Indirect *p = chain; 148 + struct buffer_head *bh; 149 + 150 + *err = 0; 151 + /* i_data is not going away, no lock needed */ 152 + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 153 + if (!p->key) 154 + goto no_block; 155 + while (--depth) { 156 + bh = sb_getblk(sb, le32_to_cpu(p->key)); 157 + if (unlikely(!bh)) 158 + goto failure; 159 + 160 + if (!bh_uptodate_or_lock(bh)) { 161 + if (bh_submit_read(bh) < 0) { 162 + put_bh(bh); 163 + goto failure; 164 + } 165 + /* validate block references */ 166 + if (ext4_check_indirect_blockref(inode, bh)) { 167 + put_bh(bh); 168 + goto failure; 169 + } 170 + } 171 + 172 + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 173 + /* Reader: end */ 174 + if (!p->key) 175 + goto no_block; 176 + } 177 + return NULL; 178 + 179 + failure: 180 + *err = -EIO; 181 + no_block: 182 + return p; 183 + } 184 + 185 + /** 186 + * ext4_find_near - find a place for allocation with sufficient locality 187 + * @inode: owner 188 + * @ind: descriptor of indirect block. 189 + * 190 + * This function returns the preferred place for block allocation. 191 + * It is used when heuristic for sequential allocation fails. 192 + * Rules are: 193 + * + if there is a block to the left of our position - allocate near it. 194 + * + if pointer will live in indirect block - allocate near that block. 195 + * + if pointer will live in inode - allocate in the same 196 + * cylinder group. 197 + * 198 + * In the latter case we colour the starting block by the callers PID to 199 + * prevent it from clashing with concurrent allocations for a different inode 200 + * in the same block group. 
The PID is used here so that functionally related 201 + * files will be close-by on-disk. 202 + * 203 + * Caller must make sure that @ind is valid and will stay that way. 204 + */ 205 + static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 206 + { 207 + struct ext4_inode_info *ei = EXT4_I(inode); 208 + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 209 + __le32 *p; 210 + ext4_fsblk_t bg_start; 211 + ext4_fsblk_t last_block; 212 + ext4_grpblk_t colour; 213 + ext4_group_t block_group; 214 + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 215 + 216 + /* Try to find previous block */ 217 + for (p = ind->p - 1; p >= start; p--) { 218 + if (*p) 219 + return le32_to_cpu(*p); 220 + } 221 + 222 + /* No such thing, so let's try location of indirect block */ 223 + if (ind->bh) 224 + return ind->bh->b_blocknr; 225 + 226 + /* 227 + * It is going to be referred to from the inode itself? OK, just put it 228 + * into the same cylinder group then. 229 + */ 230 + block_group = ei->i_block_group; 231 + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 232 + block_group &= ~(flex_size-1); 233 + if (S_ISREG(inode->i_mode)) 234 + block_group++; 235 + } 236 + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 237 + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 238 + 239 + /* 240 + * If we are doing delayed allocation, we don't need take 241 + * colour into account. 242 + */ 243 + if (test_opt(inode->i_sb, DELALLOC)) 244 + return bg_start; 245 + 246 + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 247 + colour = (current->pid % 16) * 248 + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 249 + else 250 + colour = (current->pid % 16) * ((last_block - bg_start) / 16); 251 + return bg_start + colour; 252 + } 253 + 254 + /** 255 + * ext4_find_goal - find a preferred place for allocation. 
256 + * @inode: owner 257 + * @block: block we want 258 + * @partial: pointer to the last triple within a chain 259 + * 260 + * Normally this function find the preferred place for block allocation, 261 + * returns it. 262 + * Because this is only used for non-extent files, we limit the block nr 263 + * to 32 bits. 264 + */ 265 + static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 266 + Indirect *partial) 267 + { 268 + ext4_fsblk_t goal; 269 + 270 + /* 271 + * XXX need to get goal block from mballoc's data structures 272 + */ 273 + 274 + goal = ext4_find_near(inode, partial); 275 + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 276 + return goal; 277 + } 278 + 279 + /** 280 + * ext4_blks_to_allocate - Look up the block map and count the number 281 + * of direct blocks need to be allocated for the given branch. 282 + * 283 + * @branch: chain of indirect blocks 284 + * @k: number of blocks need for indirect blocks 285 + * @blks: number of data blocks to be mapped. 286 + * @blocks_to_boundary: the offset in the indirect block 287 + * 288 + * return the total number of blocks to be allocate, including the 289 + * direct and indirect blocks. 
290 + */ 291 + static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 292 + int blocks_to_boundary) 293 + { 294 + unsigned int count = 0; 295 + 296 + /* 297 + * Simple case, [t,d]Indirect block(s) has not allocated yet 298 + * then it's clear blocks on that path have not allocated 299 + */ 300 + if (k > 0) { 301 + /* right now we don't handle cross boundary allocation */ 302 + if (blks < blocks_to_boundary + 1) 303 + count += blks; 304 + else 305 + count += blocks_to_boundary + 1; 306 + return count; 307 + } 308 + 309 + count++; 310 + while (count < blks && count <= blocks_to_boundary && 311 + le32_to_cpu(*(branch[0].p + count)) == 0) { 312 + count++; 313 + } 314 + return count; 315 + } 316 + 317 + /** 318 + * ext4_alloc_blocks: multiple allocate blocks needed for a branch 319 + * @handle: handle for this transaction 320 + * @inode: inode which needs allocated blocks 321 + * @iblock: the logical block to start allocated at 322 + * @goal: preferred physical block of allocation 323 + * @indirect_blks: the number of blocks need to allocate for indirect 324 + * blocks 325 + * @blks: number of desired blocks 326 + * @new_blocks: on return it will store the new block numbers for 327 + * the indirect blocks(if needed) and the first direct block, 328 + * @err: on return it will store the error code 329 + * 330 + * This function will return the number of blocks allocated as 331 + * requested by the passed-in parameters. 332 + */ 333 + static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 334 + ext4_lblk_t iblock, ext4_fsblk_t goal, 335 + int indirect_blks, int blks, 336 + ext4_fsblk_t new_blocks[4], int *err) 337 + { 338 + struct ext4_allocation_request ar; 339 + int target, i; 340 + unsigned long count = 0, blk_allocated = 0; 341 + int index = 0; 342 + ext4_fsblk_t current_block = 0; 343 + int ret = 0; 344 + 345 + /* 346 + * Here we try to allocate the requested multiple blocks at once, 347 + * on a best-effort basis. 
348 + * To build a branch, we should allocate blocks for 349 + * the indirect blocks(if not allocated yet), and at least 350 + * the first direct block of this branch. That's the 351 + * minimum number of blocks need to allocate(required) 352 + */ 353 + /* first we try to allocate the indirect blocks */ 354 + target = indirect_blks; 355 + while (target > 0) { 356 + count = target; 357 + /* allocating blocks for indirect blocks and direct blocks */ 358 + current_block = ext4_new_meta_blocks(handle, inode, goal, 359 + 0, &count, err); 360 + if (*err) 361 + goto failed_out; 362 + 363 + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { 364 + EXT4_ERROR_INODE(inode, 365 + "current_block %llu + count %lu > %d!", 366 + current_block, count, 367 + EXT4_MAX_BLOCK_FILE_PHYS); 368 + *err = -EIO; 369 + goto failed_out; 370 + } 371 + 372 + target -= count; 373 + /* allocate blocks for indirect blocks */ 374 + while (index < indirect_blks && count) { 375 + new_blocks[index++] = current_block++; 376 + count--; 377 + } 378 + if (count > 0) { 379 + /* 380 + * save the new block number 381 + * for the first direct block 382 + */ 383 + new_blocks[index] = current_block; 384 + printk(KERN_INFO "%s returned more blocks than " 385 + "requested\n", __func__); 386 + WARN_ON(1); 387 + break; 388 + } 389 + } 390 + 391 + target = blks - count ; 392 + blk_allocated = count; 393 + if (!target) 394 + goto allocated; 395 + /* Now allocate data blocks */ 396 + memset(&ar, 0, sizeof(ar)); 397 + ar.inode = inode; 398 + ar.goal = goal; 399 + ar.len = target; 400 + ar.logical = iblock; 401 + if (S_ISREG(inode->i_mode)) 402 + /* enable in-core preallocation only for regular files */ 403 + ar.flags = EXT4_MB_HINT_DATA; 404 + 405 + current_block = ext4_mb_new_blocks(handle, &ar, err); 406 + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { 407 + EXT4_ERROR_INODE(inode, 408 + "current_block %llu + ar.len %d > %d!", 409 + current_block, ar.len, 410 + 
EXT4_MAX_BLOCK_FILE_PHYS); 411 + *err = -EIO; 412 + goto failed_out; 413 + } 414 + 415 + if (*err && (target == blks)) { 416 + /* 417 + * if the allocation failed and we didn't allocate 418 + * any blocks before 419 + */ 420 + goto failed_out; 421 + } 422 + if (!*err) { 423 + if (target == blks) { 424 + /* 425 + * save the new block number 426 + * for the first direct block 427 + */ 428 + new_blocks[index] = current_block; 429 + } 430 + blk_allocated += ar.len; 431 + } 432 + allocated: 433 + /* total number of blocks allocated for direct blocks */ 434 + ret = blk_allocated; 435 + *err = 0; 436 + return ret; 437 + failed_out: 438 + for (i = 0; i < index; i++) 439 + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 440 + return ret; 441 + } 442 + 443 + /** 444 + * ext4_alloc_branch - allocate and set up a chain of blocks. 445 + * @handle: handle for this transaction 446 + * @inode: owner 447 + * @indirect_blks: number of allocated indirect blocks 448 + * @blks: number of allocated direct blocks 449 + * @goal: preferred place for allocation 450 + * @offsets: offsets (in the blocks) to store the pointers to next. 451 + * @branch: place to store the chain in. 452 + * 453 + * This function allocates blocks, zeroes out all but the last one, 454 + * links them into chain and (if we are synchronous) writes them to disk. 455 + * In other words, it prepares a branch that can be spliced onto the 456 + * inode. It stores the information about that chain in the branch[], in 457 + * the same format as ext4_get_branch() would do. We are calling it after 458 + * we had read the existing part of chain and partial points to the last 459 + * triple of that (one with zero ->key). 
Upon the exit we have the same 460 + * picture as after the successful ext4_get_block(), except that in one 461 + * place chain is disconnected - *branch->p is still zero (we did not 462 + * set the last link), but branch->key contains the number that should 463 + * be placed into *branch->p to fill that gap. 464 + * 465 + * If allocation fails we free all blocks we've allocated (and forget 466 + * their buffer_heads) and return the error value the from failed 467 + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 468 + * as described above and return 0. 469 + */ 470 + static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 471 + ext4_lblk_t iblock, int indirect_blks, 472 + int *blks, ext4_fsblk_t goal, 473 + ext4_lblk_t *offsets, Indirect *branch) 474 + { 475 + int blocksize = inode->i_sb->s_blocksize; 476 + int i, n = 0; 477 + int err = 0; 478 + struct buffer_head *bh; 479 + int num; 480 + ext4_fsblk_t new_blocks[4]; 481 + ext4_fsblk_t current_block; 482 + 483 + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 484 + *blks, new_blocks, &err); 485 + if (err) 486 + return err; 487 + 488 + branch[0].key = cpu_to_le32(new_blocks[0]); 489 + /* 490 + * metadata blocks and data blocks are allocated. 491 + */ 492 + for (n = 1; n <= indirect_blks; n++) { 493 + /* 494 + * Get buffer_head for parent block, zero it out 495 + * and set the pointer to new one, then send 496 + * parent to disk. 
497 + */ 498 + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 499 + if (unlikely(!bh)) { 500 + err = -EIO; 501 + goto failed; 502 + } 503 + 504 + branch[n].bh = bh; 505 + lock_buffer(bh); 506 + BUFFER_TRACE(bh, "call get_create_access"); 507 + err = ext4_journal_get_create_access(handle, bh); 508 + if (err) { 509 + /* Don't brelse(bh) here; it's done in 510 + * ext4_journal_forget() below */ 511 + unlock_buffer(bh); 512 + goto failed; 513 + } 514 + 515 + memset(bh->b_data, 0, blocksize); 516 + branch[n].p = (__le32 *) bh->b_data + offsets[n]; 517 + branch[n].key = cpu_to_le32(new_blocks[n]); 518 + *branch[n].p = branch[n].key; 519 + if (n == indirect_blks) { 520 + current_block = new_blocks[n]; 521 + /* 522 + * End of chain, update the last new metablock of 523 + * the chain to point to the new allocated 524 + * data blocks numbers 525 + */ 526 + for (i = 1; i < num; i++) 527 + *(branch[n].p + i) = cpu_to_le32(++current_block); 528 + } 529 + BUFFER_TRACE(bh, "marking uptodate"); 530 + set_buffer_uptodate(bh); 531 + unlock_buffer(bh); 532 + 533 + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 534 + err = ext4_handle_dirty_metadata(handle, inode, bh); 535 + if (err) 536 + goto failed; 537 + } 538 + *blks = num; 539 + return err; 540 + failed: 541 + /* Allocation failed, free what we already allocated */ 542 + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 543 + for (i = 1; i <= n ; i++) { 544 + /* 545 + * branch[i].bh is newly allocated, so there is no 546 + * need to revoke the block, which is why we don't 547 + * need to set EXT4_FREE_BLOCKS_METADATA. 
548 + */ 549 + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 550 + EXT4_FREE_BLOCKS_FORGET); 551 + } 552 + for (i = n+1; i < indirect_blks; i++) 553 + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 554 + 555 + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 556 + 557 + return err; 558 + } 559 + 560 + /** 561 + * ext4_splice_branch - splice the allocated branch onto inode. 562 + * @handle: handle for this transaction 563 + * @inode: owner 564 + * @block: (logical) number of block we are adding 565 + * @chain: chain of indirect blocks (with a missing link - see 566 + * ext4_alloc_branch) 567 + * @where: location of missing link 568 + * @num: number of indirect blocks we are adding 569 + * @blks: number of direct blocks we are adding 570 + * 571 + * This function fills the missing link and does all housekeeping needed in 572 + * inode (->i_blocks, etc.). In case of success we end up with the full 573 + * chain to new block and return 0. 574 + */ 575 + static int ext4_splice_branch(handle_t *handle, struct inode *inode, 576 + ext4_lblk_t block, Indirect *where, int num, 577 + int blks) 578 + { 579 + int i; 580 + int err = 0; 581 + ext4_fsblk_t current_block; 582 + 583 + /* 584 + * If we're splicing into a [td]indirect block (as opposed to the 585 + * inode) then we need to get write access to the [td]indirect block 586 + * before the splice. 
587 + */ 588 + if (where->bh) { 589 + BUFFER_TRACE(where->bh, "get_write_access"); 590 + err = ext4_journal_get_write_access(handle, where->bh); 591 + if (err) 592 + goto err_out; 593 + } 594 + /* That's it */ 595 + 596 + *where->p = where->key; 597 + 598 + /* 599 + * Update the host buffer_head or inode to point to more just allocated 600 + * direct blocks blocks 601 + */ 602 + if (num == 0 && blks > 1) { 603 + current_block = le32_to_cpu(where->key) + 1; 604 + for (i = 1; i < blks; i++) 605 + *(where->p + i) = cpu_to_le32(current_block++); 606 + } 607 + 608 + /* We are done with atomic stuff, now do the rest of housekeeping */ 609 + /* had we spliced it onto indirect block? */ 610 + if (where->bh) { 611 + /* 612 + * If we spliced it onto an indirect block, we haven't 613 + * altered the inode. Note however that if it is being spliced 614 + * onto an indirect block at the very end of the file (the 615 + * file is growing) then we *will* alter the inode to reflect 616 + * the new i_size. But that is not done here - it is done in 617 + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 618 + */ 619 + jbd_debug(5, "splicing indirect only\n"); 620 + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 621 + err = ext4_handle_dirty_metadata(handle, inode, where->bh); 622 + if (err) 623 + goto err_out; 624 + } else { 625 + /* 626 + * OK, we spliced it into the inode itself on a direct block. 627 + */ 628 + ext4_mark_inode_dirty(handle, inode); 629 + jbd_debug(5, "splicing direct\n"); 630 + } 631 + return err; 632 + 633 + err_out: 634 + for (i = 1; i <= num; i++) { 635 + /* 636 + * branch[i].bh is newly allocated, so there is no 637 + * need to revoke the block, which is why we don't 638 + * need to set EXT4_FREE_BLOCKS_METADATA. 
639 + */ 640 + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 641 + EXT4_FREE_BLOCKS_FORGET); 642 + } 643 + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 644 + blks, 0); 645 + 646 + return err; 647 + } 648 + 649 + /* 650 + * The ext4_ind_map_blocks() function handles non-extents inodes 651 + * (i.e., using the traditional indirect/double-indirect i_blocks 652 + * scheme) for ext4_map_blocks(). 653 + * 654 + * Allocation strategy is simple: if we have to allocate something, we will 655 + * have to go the whole way to leaf. So let's do it before attaching anything 656 + * to tree, set linkage between the newborn blocks, write them if sync is 657 + * required, recheck the path, free and repeat if check fails, otherwise 658 + * set the last missing link (that will protect us from any truncate-generated 659 + * removals - all blocks on the path are immune now) and possibly force the 660 + * write on the parent block. 661 + * That has a nice additional property: no special recovery from the failed 662 + * allocations is needed - we simply release blocks and do not touch anything 663 + * reachable from inode. 664 + * 665 + * `handle' can be NULL if create == 0. 666 + * 667 + * return > 0, # of blocks mapped or allocated. 668 + * return = 0, if plain lookup failed. 669 + * return < 0, error case. 670 + * 671 + * The ext4_ind_get_blocks() function should be called with 672 + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 673 + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 674 + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 675 + * blocks. 
676 + */ 677 + int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 678 + struct ext4_map_blocks *map, 679 + int flags) 680 + { 681 + int err = -EIO; 682 + ext4_lblk_t offsets[4]; 683 + Indirect chain[4]; 684 + Indirect *partial; 685 + ext4_fsblk_t goal; 686 + int indirect_blks; 687 + int blocks_to_boundary = 0; 688 + int depth; 689 + int count = 0; 690 + ext4_fsblk_t first_block = 0; 691 + 692 + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 693 + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 694 + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 695 + depth = ext4_block_to_path(inode, map->m_lblk, offsets, 696 + &blocks_to_boundary); 697 + 698 + if (depth == 0) 699 + goto out; 700 + 701 + partial = ext4_get_branch(inode, depth, offsets, chain, &err); 702 + 703 + /* Simplest case - block found, no allocation needed */ 704 + if (!partial) { 705 + first_block = le32_to_cpu(chain[depth - 1].key); 706 + count++; 707 + /*map more blocks*/ 708 + while (count < map->m_len && count <= blocks_to_boundary) { 709 + ext4_fsblk_t blk; 710 + 711 + blk = le32_to_cpu(*(chain[depth-1].p + count)); 712 + 713 + if (blk == first_block + count) 714 + count++; 715 + else 716 + break; 717 + } 718 + goto got_it; 719 + } 720 + 721 + /* Next simple case - plain lookup or failed read of indirect block */ 722 + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 723 + goto cleanup; 724 + 725 + /* 726 + * Okay, we need to do block allocation. 727 + */ 728 + goal = ext4_find_goal(inode, map->m_lblk, partial); 729 + 730 + /* the number of blocks need to allocate for [d,t]indirect blocks */ 731 + indirect_blks = (chain + depth) - partial - 1; 732 + 733 + /* 734 + * Next look up the indirect map to count the total number of 735 + * direct blocks to allocate for this branch. 
736 + */ 737 + count = ext4_blks_to_allocate(partial, indirect_blks, 738 + map->m_len, blocks_to_boundary); 739 + /* 740 + * Block out ext4_truncate while we alter the tree 741 + */ 742 + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 743 + &count, goal, 744 + offsets + (partial - chain), partial); 745 + 746 + /* 747 + * The ext4_splice_branch call will free and forget any buffers 748 + * on the new chain if there is a failure, but that risks using 749 + * up transaction credits, especially for bitmaps where the 750 + * credits cannot be returned. Can we handle this somehow? We 751 + * may need to return -EAGAIN upwards in the worst case. --sct 752 + */ 753 + if (!err) 754 + err = ext4_splice_branch(handle, inode, map->m_lblk, 755 + partial, indirect_blks, count); 756 + if (err) 757 + goto cleanup; 758 + 759 + map->m_flags |= EXT4_MAP_NEW; 760 + 761 + ext4_update_inode_fsync_trans(handle, inode, 1); 762 + got_it: 763 + map->m_flags |= EXT4_MAP_MAPPED; 764 + map->m_pblk = le32_to_cpu(chain[depth-1].key); 765 + map->m_len = count; 766 + if (count > blocks_to_boundary) 767 + map->m_flags |= EXT4_MAP_BOUNDARY; 768 + err = count; 769 + /* Clean up and exit */ 770 + partial = chain + depth - 1; /* the whole chain */ 771 + cleanup: 772 + while (partial > chain) { 773 + BUFFER_TRACE(partial->bh, "call brelse"); 774 + brelse(partial->bh); 775 + partial--; 776 + } 777 + out: 778 + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 779 + map->m_pblk, map->m_len, err); 780 + return err; 781 + } 782 + 783 + /* 784 + * O_DIRECT for ext3 (or indirect map) based files 785 + * 786 + * If the O_DIRECT write will extend the file then add this inode to the 787 + * orphan list. So recovery will truncate it back to the original size 788 + * if the machine crashes during the write. 789 + * 790 + * If the O_DIRECT write is instantiating holes inside i_size and the machine 791 + * crashes then stale disk data _may_ be exposed inside the file. 
But current 792 + * VFS code falls back into buffered path in that case so we are safe. 793 + */ 794 + ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 795 + const struct iovec *iov, loff_t offset, 796 + unsigned long nr_segs) 797 + { 798 + struct file *file = iocb->ki_filp; 799 + struct inode *inode = file->f_mapping->host; 800 + struct ext4_inode_info *ei = EXT4_I(inode); 801 + handle_t *handle; 802 + ssize_t ret; 803 + int orphan = 0; 804 + size_t count = iov_length(iov, nr_segs); 805 + int retries = 0; 806 + 807 + if (rw == WRITE) { 808 + loff_t final_size = offset + count; 809 + 810 + if (final_size > inode->i_size) { 811 + /* Credits for sb + inode write */ 812 + handle = ext4_journal_start(inode, 2); 813 + if (IS_ERR(handle)) { 814 + ret = PTR_ERR(handle); 815 + goto out; 816 + } 817 + ret = ext4_orphan_add(handle, inode); 818 + if (ret) { 819 + ext4_journal_stop(handle); 820 + goto out; 821 + } 822 + orphan = 1; 823 + ei->i_disksize = inode->i_size; 824 + ext4_journal_stop(handle); 825 + } 826 + } 827 + 828 + retry: 829 + if (rw == READ && ext4_should_dioread_nolock(inode)) 830 + ret = __blockdev_direct_IO(rw, iocb, inode, 831 + inode->i_sb->s_bdev, iov, 832 + offset, nr_segs, 833 + ext4_get_block, NULL, NULL, 0); 834 + else { 835 + ret = blockdev_direct_IO(rw, iocb, inode, 836 + inode->i_sb->s_bdev, iov, 837 + offset, nr_segs, 838 + ext4_get_block, NULL); 839 + 840 + if (unlikely((rw & WRITE) && ret < 0)) { 841 + loff_t isize = i_size_read(inode); 842 + loff_t end = offset + iov_length(iov, nr_segs); 843 + 844 + if (end > isize) 845 + ext4_truncate_failed_write(inode); 846 + } 847 + } 848 + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 849 + goto retry; 850 + 851 + if (orphan) { 852 + int err; 853 + 854 + /* Credits for sb + inode write */ 855 + handle = ext4_journal_start(inode, 2); 856 + if (IS_ERR(handle)) { 857 + /* This is really bad luck. We've written the data 858 + * but cannot extend i_size. 
Bail out and pretend 859 + * the write failed... */ 860 + ret = PTR_ERR(handle); 861 + if (inode->i_nlink) 862 + ext4_orphan_del(NULL, inode); 863 + 864 + goto out; 865 + } 866 + if (inode->i_nlink) 867 + ext4_orphan_del(handle, inode); 868 + if (ret > 0) { 869 + loff_t end = offset + ret; 870 + if (end > inode->i_size) { 871 + ei->i_disksize = end; 872 + i_size_write(inode, end); 873 + /* 874 + * We're going to return a positive `ret' 875 + * here due to non-zero-length I/O, so there's 876 + * no way of reporting error returns from 877 + * ext4_mark_inode_dirty() to userspace. So 878 + * ignore it. 879 + */ 880 + ext4_mark_inode_dirty(handle, inode); 881 + } 882 + } 883 + err = ext4_journal_stop(handle); 884 + if (ret == 0) 885 + ret = err; 886 + } 887 + out: 888 + return ret; 889 + } 890 + 891 + /* 892 + * Calculate the number of metadata blocks need to reserve 893 + * to allocate a new block at @lblocks for non extent file based file 894 + */ 895 + int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) 896 + { 897 + struct ext4_inode_info *ei = EXT4_I(inode); 898 + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 899 + int blk_bits; 900 + 901 + if (lblock < EXT4_NDIR_BLOCKS) 902 + return 0; 903 + 904 + lblock -= EXT4_NDIR_BLOCKS; 905 + 906 + if (ei->i_da_metadata_calc_len && 907 + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 908 + ei->i_da_metadata_calc_len++; 909 + return 0; 910 + } 911 + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 912 + ei->i_da_metadata_calc_len = 1; 913 + blk_bits = order_base_2(lblock); 914 + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 915 + } 916 + 917 + int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) 918 + { 919 + int indirects; 920 + 921 + /* if nrblocks are contiguous */ 922 + if (chunk) { 923 + /* 924 + * With N contiguous data blocks, we need at most 925 + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 926 + * 2 
dindirect blocks, and 1 tindirect block 927 + */ 928 + return DIV_ROUND_UP(nrblocks, 929 + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 930 + } 931 + /* 932 + * if nrblocks are not contiguous, worse case, each block touch 933 + * a indirect block, and each indirect block touch a double indirect 934 + * block, plus a triple indirect block 935 + */ 936 + indirects = nrblocks * 2 + 1; 937 + return indirects; 938 + } 939 + 940 + /* 941 + * Truncate transactions can be complex and absolutely huge. So we need to 942 + * be able to restart the transaction at a conventient checkpoint to make 943 + * sure we don't overflow the journal. 944 + * 945 + * start_transaction gets us a new handle for a truncate transaction, 946 + * and extend_transaction tries to extend the existing one a bit. If 947 + * extend fails, we need to propagate the failure up and restart the 948 + * transaction in the top-level truncate loop. --sct 949 + */ 950 + static handle_t *start_transaction(struct inode *inode) 951 + { 952 + handle_t *result; 953 + 954 + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); 955 + if (!IS_ERR(result)) 956 + return result; 957 + 958 + ext4_std_error(inode->i_sb, PTR_ERR(result)); 959 + return result; 960 + } 961 + 962 + /* 963 + * Try to extend this transaction for the purposes of truncation. 964 + * 965 + * Returns 0 if we managed to create more room. If we can't create more 966 + * room, and the transaction must be restarted we return 1. 967 + */ 968 + static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 969 + { 970 + if (!ext4_handle_valid(handle)) 971 + return 0; 972 + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 973 + return 0; 974 + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) 975 + return 0; 976 + return 1; 977 + } 978 + 979 + /* 980 + * Probably it should be a library function... 
search for first non-zero word 981 + * or memcmp with zero_page, whatever is better for particular architecture. 982 + * Linus? 983 + */ 984 + static inline int all_zeroes(__le32 *p, __le32 *q) 985 + { 986 + while (p < q) 987 + if (*p++) 988 + return 0; 989 + return 1; 990 + } 991 + 992 + /** 993 + * ext4_find_shared - find the indirect blocks for partial truncation. 994 + * @inode: inode in question 995 + * @depth: depth of the affected branch 996 + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 997 + * @chain: place to store the pointers to partial indirect blocks 998 + * @top: place to the (detached) top of branch 999 + * 1000 + * This is a helper function used by ext4_truncate(). 1001 + * 1002 + * When we do truncate() we may have to clean the ends of several 1003 + * indirect blocks but leave the blocks themselves alive. Block is 1004 + * partially truncated if some data below the new i_size is referred 1005 + * from it (and it is on the path to the first completely truncated 1006 + * data block, indeed). We have to free the top of that path along 1007 + * with everything to the right of the path. Since no allocation 1008 + * past the truncation point is possible until ext4_truncate() 1009 + * finishes, we may safely do the latter, but top of branch may 1010 + * require special attention - pageout below the truncation point 1011 + * might try to populate it. 1012 + * 1013 + * We atomically detach the top of branch from the tree, store the 1014 + * block number of its root in *@top, pointers to buffer_heads of 1015 + * partially truncated blocks - in @chain[].bh and pointers to 1016 + * their last elements that should not be removed - in 1017 + * @chain[].p. Return value is the pointer to last filled element 1018 + * of @chain. 1019 + * 1020 + * The work left to caller to do the actual freeing of subtrees: 1021 + * a) free the subtree starting from *@top 1022 + * b) free the subtrees whose roots are stored in 1023 + * (@chain[i].p+1 .. 
end of @chain[i].bh->b_data) 1024 + * c) free the subtrees growing from the inode past the @chain[0]. 1025 + * (no partially truncated stuff there). */ 1026 + 1027 + static Indirect *ext4_find_shared(struct inode *inode, int depth, 1028 + ext4_lblk_t offsets[4], Indirect chain[4], 1029 + __le32 *top) 1030 + { 1031 + Indirect *partial, *p; 1032 + int k, err; 1033 + 1034 + *top = 0; 1035 + /* Make k index the deepest non-null offset + 1 */ 1036 + for (k = depth; k > 1 && !offsets[k-1]; k--) 1037 + ; 1038 + partial = ext4_get_branch(inode, k, offsets, chain, &err); 1039 + /* Writer: pointers */ 1040 + if (!partial) 1041 + partial = chain + k-1; 1042 + /* 1043 + * If the branch acquired continuation since we've looked at it - 1044 + * fine, it should all survive and (new) top doesn't belong to us. 1045 + */ 1046 + if (!partial->key && *partial->p) 1047 + /* Writer: end */ 1048 + goto no_top; 1049 + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 1050 + ; 1051 + /* 1052 + * OK, we've found the last block that must survive. The rest of our 1053 + * branch should be detached before unlocking. However, if that rest 1054 + * of branch is all ours and does not grow immediately from the inode 1055 + * it's easier to cheat and just decrement partial->p. 1056 + */ 1057 + if (p == chain + k - 1 && p > chain) { 1058 + p->p--; 1059 + } else { 1060 + *top = *p->p; 1061 + /* Nope, don't do this in ext4. Must leave the tree intact */ 1062 + #if 0 1063 + *p->p = 0; 1064 + #endif 1065 + } 1066 + /* Writer: end */ 1067 + 1068 + while (partial > p) { 1069 + brelse(partial->bh); 1070 + partial--; 1071 + } 1072 + no_top: 1073 + return partial; 1074 + } 1075 + 1076 + /* 1077 + * Zero a number of block pointers in either an inode or an indirect block. 1078 + * If we restart the transaction we must again get write access to the 1079 + * indirect block for further modification. 
1080 + * 1081 + * We release `count' blocks on disk, but (last - first) may be greater 1082 + * than `count' because there can be holes in there. 1083 + * 1084 + * Return 0 on success, 1 on invalid block range 1085 + * and < 0 on fatal error. 1086 + */ 1087 + static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 1088 + struct buffer_head *bh, 1089 + ext4_fsblk_t block_to_free, 1090 + unsigned long count, __le32 *first, 1091 + __le32 *last) 1092 + { 1093 + __le32 *p; 1094 + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 1095 + int err; 1096 + 1097 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 1098 + flags |= EXT4_FREE_BLOCKS_METADATA; 1099 + 1100 + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 1101 + count)) { 1102 + EXT4_ERROR_INODE(inode, "attempt to clear invalid " 1103 + "blocks %llu len %lu", 1104 + (unsigned long long) block_to_free, count); 1105 + return 1; 1106 + } 1107 + 1108 + if (try_to_extend_transaction(handle, inode)) { 1109 + if (bh) { 1110 + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1111 + err = ext4_handle_dirty_metadata(handle, inode, bh); 1112 + if (unlikely(err)) 1113 + goto out_err; 1114 + } 1115 + err = ext4_mark_inode_dirty(handle, inode); 1116 + if (unlikely(err)) 1117 + goto out_err; 1118 + err = ext4_truncate_restart_trans(handle, inode, 1119 + ext4_blocks_for_truncate(inode)); 1120 + if (unlikely(err)) 1121 + goto out_err; 1122 + if (bh) { 1123 + BUFFER_TRACE(bh, "retaking write access"); 1124 + err = ext4_journal_get_write_access(handle, bh); 1125 + if (unlikely(err)) 1126 + goto out_err; 1127 + } 1128 + } 1129 + 1130 + for (p = first; p < last; p++) 1131 + *p = 0; 1132 + 1133 + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 1134 + return 0; 1135 + out_err: 1136 + ext4_std_error(inode->i_sb, err); 1137 + return err; 1138 + } 1139 + 1140 + /** 1141 + * ext4_free_data - free a list of data blocks 1142 + * @handle: handle for this transaction 1143 + 
* @inode: inode we are dealing with 1144 + * @this_bh: indirect buffer_head which contains *@first and *@last 1145 + * @first: array of block numbers 1146 + * @last: points immediately past the end of array 1147 + * 1148 + * We are freeing all blocks referred from that array (numbers are stored as 1149 + * little-endian 32-bit) and updating @inode->i_blocks appropriately. 1150 + * 1151 + * We accumulate contiguous runs of blocks to free. Conveniently, if these 1152 + * blocks are contiguous then releasing them at one time will only affect one 1153 + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 1154 + * actually use a lot of journal space. 1155 + * 1156 + * @this_bh will be %NULL if @first and @last point into the inode's direct 1157 + * block pointers. 1158 + */ 1159 + static void ext4_free_data(handle_t *handle, struct inode *inode, 1160 + struct buffer_head *this_bh, 1161 + __le32 *first, __le32 *last) 1162 + { 1163 + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 1164 + unsigned long count = 0; /* Number of blocks in the run */ 1165 + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 1166 + corresponding to 1167 + block_to_free */ 1168 + ext4_fsblk_t nr; /* Current block # */ 1169 + __le32 *p; /* Pointer into inode/ind 1170 + for current block */ 1171 + int err = 0; 1172 + 1173 + if (this_bh) { /* For indirect block */ 1174 + BUFFER_TRACE(this_bh, "get_write_access"); 1175 + err = ext4_journal_get_write_access(handle, this_bh); 1176 + /* Important: if we can't update the indirect pointers 1177 + * to the blocks, we can't free them. 
*/ 1178 + if (err) 1179 + return; 1180 + } 1181 + 1182 + for (p = first; p < last; p++) { 1183 + nr = le32_to_cpu(*p); 1184 + if (nr) { 1185 + /* accumulate blocks to free if they're contiguous */ 1186 + if (count == 0) { 1187 + block_to_free = nr; 1188 + block_to_free_p = p; 1189 + count = 1; 1190 + } else if (nr == block_to_free + count) { 1191 + count++; 1192 + } else { 1193 + err = ext4_clear_blocks(handle, inode, this_bh, 1194 + block_to_free, count, 1195 + block_to_free_p, p); 1196 + if (err) 1197 + break; 1198 + block_to_free = nr; 1199 + block_to_free_p = p; 1200 + count = 1; 1201 + } 1202 + } 1203 + } 1204 + 1205 + if (!err && count > 0) 1206 + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 1207 + count, block_to_free_p, p); 1208 + if (err < 0) 1209 + /* fatal error */ 1210 + return; 1211 + 1212 + if (this_bh) { 1213 + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 1214 + 1215 + /* 1216 + * The buffer head should have an attached journal head at this 1217 + * point. However, if the data is corrupted and an indirect 1218 + * block pointed to itself, it would have been detached when 1219 + * the block was cleared. Check for this instead of OOPSing. 
1220 + */ 1221 + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 1222 + ext4_handle_dirty_metadata(handle, inode, this_bh); 1223 + else 1224 + EXT4_ERROR_INODE(inode, 1225 + "circular indirect block detected at " 1226 + "block %llu", 1227 + (unsigned long long) this_bh->b_blocknr); 1228 + } 1229 + } 1230 + 1231 + /** 1232 + * ext4_free_branches - free an array of branches 1233 + * @handle: JBD handle for this transaction 1234 + * @inode: inode we are dealing with 1235 + * @parent_bh: the buffer_head which contains *@first and *@last 1236 + * @first: array of block numbers 1237 + * @last: pointer immediately past the end of array 1238 + * @depth: depth of the branches to free 1239 + * 1240 + * We are freeing all blocks referred from these branches (numbers are 1241 + * stored as little-endian 32-bit) and updating @inode->i_blocks 1242 + * appropriately. 1243 + */ 1244 + static void ext4_free_branches(handle_t *handle, struct inode *inode, 1245 + struct buffer_head *parent_bh, 1246 + __le32 *first, __le32 *last, int depth) 1247 + { 1248 + ext4_fsblk_t nr; 1249 + __le32 *p; 1250 + 1251 + if (ext4_handle_is_aborted(handle)) 1252 + return; 1253 + 1254 + if (depth--) { 1255 + struct buffer_head *bh; 1256 + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1257 + p = last; 1258 + while (--p >= first) { 1259 + nr = le32_to_cpu(*p); 1260 + if (!nr) 1261 + continue; /* A hole */ 1262 + 1263 + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 1264 + nr, 1)) { 1265 + EXT4_ERROR_INODE(inode, 1266 + "invalid indirect mapped " 1267 + "block %lu (level %d)", 1268 + (unsigned long) nr, depth); 1269 + break; 1270 + } 1271 + 1272 + /* Go read the buffer for the next level down */ 1273 + bh = sb_bread(inode->i_sb, nr); 1274 + 1275 + /* 1276 + * A read failure? Report error and clear slot 1277 + * (should be rare). 1278 + */ 1279 + if (!bh) { 1280 + EXT4_ERROR_INODE_BLOCK(inode, nr, 1281 + "Read failure"); 1282 + continue; 1283 + } 1284 + 1285 + /* This zaps the entire block. 
Bottom up. */ 1286 + BUFFER_TRACE(bh, "free child branches"); 1287 + ext4_free_branches(handle, inode, bh, 1288 + (__le32 *) bh->b_data, 1289 + (__le32 *) bh->b_data + addr_per_block, 1290 + depth); 1291 + brelse(bh); 1292 + 1293 + /* 1294 + * Everything below this this pointer has been 1295 + * released. Now let this top-of-subtree go. 1296 + * 1297 + * We want the freeing of this indirect block to be 1298 + * atomic in the journal with the updating of the 1299 + * bitmap block which owns it. So make some room in 1300 + * the journal. 1301 + * 1302 + * We zero the parent pointer *after* freeing its 1303 + * pointee in the bitmaps, so if extend_transaction() 1304 + * for some reason fails to put the bitmap changes and 1305 + * the release into the same transaction, recovery 1306 + * will merely complain about releasing a free block, 1307 + * rather than leaking blocks. 1308 + */ 1309 + if (ext4_handle_is_aborted(handle)) 1310 + return; 1311 + if (try_to_extend_transaction(handle, inode)) { 1312 + ext4_mark_inode_dirty(handle, inode); 1313 + ext4_truncate_restart_trans(handle, inode, 1314 + ext4_blocks_for_truncate(inode)); 1315 + } 1316 + 1317 + /* 1318 + * The forget flag here is critical because if 1319 + * we are journaling (and not doing data 1320 + * journaling), we have to make sure a revoke 1321 + * record is written to prevent the journal 1322 + * replay from overwriting the (former) 1323 + * indirect block if it gets reallocated as a 1324 + * data block. This must happen in the same 1325 + * transaction where the data blocks are 1326 + * actually freed. 
1327 + */ 1328 + ext4_free_blocks(handle, inode, NULL, nr, 1, 1329 + EXT4_FREE_BLOCKS_METADATA| 1330 + EXT4_FREE_BLOCKS_FORGET); 1331 + 1332 + if (parent_bh) { 1333 + /* 1334 + * The block which we have just freed is 1335 + * pointed to by an indirect block: journal it 1336 + */ 1337 + BUFFER_TRACE(parent_bh, "get_write_access"); 1338 + if (!ext4_journal_get_write_access(handle, 1339 + parent_bh)){ 1340 + *p = 0; 1341 + BUFFER_TRACE(parent_bh, 1342 + "call ext4_handle_dirty_metadata"); 1343 + ext4_handle_dirty_metadata(handle, 1344 + inode, 1345 + parent_bh); 1346 + } 1347 + } 1348 + } 1349 + } else { 1350 + /* We have reached the bottom of the tree. */ 1351 + BUFFER_TRACE(parent_bh, "free data blocks"); 1352 + ext4_free_data(handle, inode, parent_bh, first, last); 1353 + } 1354 + } 1355 + 1356 + void ext4_ind_truncate(struct inode *inode) 1357 + { 1358 + handle_t *handle; 1359 + struct ext4_inode_info *ei = EXT4_I(inode); 1360 + __le32 *i_data = ei->i_data; 1361 + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1362 + struct address_space *mapping = inode->i_mapping; 1363 + ext4_lblk_t offsets[4]; 1364 + Indirect chain[4]; 1365 + Indirect *partial; 1366 + __le32 nr = 0; 1367 + int n = 0; 1368 + ext4_lblk_t last_block, max_block; 1369 + unsigned blocksize = inode->i_sb->s_blocksize; 1370 + 1371 + handle = start_transaction(inode); 1372 + if (IS_ERR(handle)) 1373 + return; /* AKPM: return what? */ 1374 + 1375 + last_block = (inode->i_size + blocksize-1) 1376 + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1377 + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1378 + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1379 + 1380 + if (inode->i_size & (blocksize - 1)) 1381 + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 1382 + goto out_stop; 1383 + 1384 + if (last_block != max_block) { 1385 + n = ext4_block_to_path(inode, last_block, offsets, NULL); 1386 + if (n == 0) 1387 + goto out_stop; /* error */ 1388 + } 1389 + 1390 + /* 1391 + * OK. 
This truncate is going to happen. We add the inode to the 1392 + * orphan list, so that if this truncate spans multiple transactions, 1393 + * and we crash, we will resume the truncate when the filesystem 1394 + * recovers. It also marks the inode dirty, to catch the new size. 1395 + * 1396 + * Implication: the file must always be in a sane, consistent 1397 + * truncatable state while each transaction commits. 1398 + */ 1399 + if (ext4_orphan_add(handle, inode)) 1400 + goto out_stop; 1401 + 1402 + /* 1403 + * From here we block out all ext4_get_block() callers who want to 1404 + * modify the block allocation tree. 1405 + */ 1406 + down_write(&ei->i_data_sem); 1407 + 1408 + ext4_discard_preallocations(inode); 1409 + 1410 + /* 1411 + * The orphan list entry will now protect us from any crash which 1412 + * occurs before the truncate completes, so it is now safe to propagate 1413 + * the new, shorter inode size (held for now in i_size) into the 1414 + * on-disk inode. We do this via i_disksize, which is the value which 1415 + * ext4 *really* writes onto the disk inode. 1416 + */ 1417 + ei->i_disksize = inode->i_size; 1418 + 1419 + if (last_block == max_block) { 1420 + /* 1421 + * It is unnecessary to free any data blocks if last_block is 1422 + * equal to the indirect block limit. 1423 + */ 1424 + goto out_unlock; 1425 + } else if (n == 1) { /* direct blocks */ 1426 + ext4_free_data(handle, inode, NULL, i_data+offsets[0], 1427 + i_data + EXT4_NDIR_BLOCKS); 1428 + goto do_indirects; 1429 + } 1430 + 1431 + partial = ext4_find_shared(inode, n, offsets, chain, &nr); 1432 + /* Kill the top of shared branch (not detached) */ 1433 + if (nr) { 1434 + if (partial == chain) { 1435 + /* Shared branch grows from the inode */ 1436 + ext4_free_branches(handle, inode, NULL, 1437 + &nr, &nr+1, (chain+n-1) - partial); 1438 + *partial->p = 0; 1439 + /* 1440 + * We mark the inode dirty prior to restart, 1441 + * and prior to stop. No need for it here. 
1442 + */ 1443 + } else { 1444 + /* Shared branch grows from an indirect block */ 1445 + BUFFER_TRACE(partial->bh, "get_write_access"); 1446 + ext4_free_branches(handle, inode, partial->bh, 1447 + partial->p, 1448 + partial->p+1, (chain+n-1) - partial); 1449 + } 1450 + } 1451 + /* Clear the ends of indirect blocks on the shared branch */ 1452 + while (partial > chain) { 1453 + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 1454 + (__le32*)partial->bh->b_data+addr_per_block, 1455 + (chain+n-1) - partial); 1456 + BUFFER_TRACE(partial->bh, "call brelse"); 1457 + brelse(partial->bh); 1458 + partial--; 1459 + } 1460 + do_indirects: 1461 + /* Kill the remaining (whole) subtrees */ 1462 + switch (offsets[0]) { 1463 + default: 1464 + nr = i_data[EXT4_IND_BLOCK]; 1465 + if (nr) { 1466 + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 1467 + i_data[EXT4_IND_BLOCK] = 0; 1468 + } 1469 + case EXT4_IND_BLOCK: 1470 + nr = i_data[EXT4_DIND_BLOCK]; 1471 + if (nr) { 1472 + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 1473 + i_data[EXT4_DIND_BLOCK] = 0; 1474 + } 1475 + case EXT4_DIND_BLOCK: 1476 + nr = i_data[EXT4_TIND_BLOCK]; 1477 + if (nr) { 1478 + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 1479 + i_data[EXT4_TIND_BLOCK] = 0; 1480 + } 1481 + case EXT4_TIND_BLOCK: 1482 + ; 1483 + } 1484 + 1485 + out_unlock: 1486 + up_write(&ei->i_data_sem); 1487 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 1488 + ext4_mark_inode_dirty(handle, inode); 1489 + 1490 + /* 1491 + * In a multi-transaction truncate, we only make the final transaction 1492 + * synchronous 1493 + */ 1494 + if (IS_SYNC(inode)) 1495 + ext4_handle_sync(handle); 1496 + out_stop: 1497 + /* 1498 + * If this was a simple ftruncate(), and the file will remain alive 1499 + * then we need to clear up the orphan record which we created above. 
1500 + * However, if this was a real unlink then we were called by 1501 + * ext4_delete_inode(), and we allow that function to clean up the 1502 + * orphan info for us. 1503 + */ 1504 + if (inode->i_nlink) 1505 + ext4_orphan_del(handle, inode); 1506 + 1507 + ext4_journal_stop(handle); 1508 + trace_ext4_truncate_exit(inode); 1509 + } 1510 +
-1486
fs/ext4/inode.c
··· 12 12 * 13 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 14 * 15 - * Goal-directed block allocation by Stephen Tweedie 16 - * (sct@redhat.com), 1993, 1998 17 - * Big-endian to little-endian byte-swapping/bitmaps by 18 - * David S. Miller (davem@caip.rutgers.edu), 1995 19 15 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 16 * (jj@sunsite.ms.mff.cuni.cz) 21 17 * ··· 83 87 (inode->i_sb->s_blocksize >> 9) : 0; 84 88 85 89 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 86 - } 87 - 88 - /* 89 - * Truncate transactions can be complex and absolutely huge. So we need to 90 - * be able to restart the transaction at a conventient checkpoint to make 91 - * sure we don't overflow the journal. 92 - * 93 - * start_transaction gets us a new handle for a truncate transaction, 94 - * and extend_transaction tries to extend the existing one a bit. If 95 - * extend fails, we need to propagate the failure up and restart the 96 - * transaction in the top-level truncate loop. --sct 97 - */ 98 - static handle_t *start_transaction(struct inode *inode) 99 - { 100 - handle_t *result; 101 - 102 - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); 103 - if (!IS_ERR(result)) 104 - return result; 105 - 106 - ext4_std_error(inode->i_sb, PTR_ERR(result)); 107 - return result; 108 - } 109 - 110 - /* 111 - * Try to extend this transaction for the purposes of truncation. 112 - * 113 - * Returns 0 if we managed to create more room. If we can't create more 114 - * room, and the transaction must be restarted we return 1. 
115 - */ 116 - static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 117 - { 118 - if (!ext4_handle_valid(handle)) 119 - return 0; 120 - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 121 - return 0; 122 - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) 123 - return 0; 124 - return 1; 125 90 } 126 91 127 92 /* ··· 208 251 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 209 252 } 210 253 211 - typedef struct { 212 - __le32 *p; 213 - __le32 key; 214 - struct buffer_head *bh; 215 - } Indirect; 216 - 217 - static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 218 - { 219 - p->key = *(p->p = v); 220 - p->bh = bh; 221 - } 222 - 223 - /** 224 - * ext4_block_to_path - parse the block number into array of offsets 225 - * @inode: inode in question (we are only interested in its superblock) 226 - * @i_block: block number to be parsed 227 - * @offsets: array to store the offsets in 228 - * @boundary: set this non-zero if the referred-to block is likely to be 229 - * followed (on disk) by an indirect block. 230 - * 231 - * To store the locations of file's data ext4 uses a data structure common 232 - * for UNIX filesystems - tree of pointers anchored in the inode, with 233 - * data blocks at leaves and indirect blocks in intermediate nodes. 234 - * This function translates the block number into path in that tree - 235 - * return value is the path length and @offsets[n] is the offset of 236 - * pointer to (n+1)th node in the nth one. If @block is out of range 237 - * (negative or too large) warning is printed and zero returned. 238 - * 239 - * Note: function doesn't find node addresses, so no IO is needed. All 240 - * we need to know is the capacity of indirect blocks (taken from the 241 - * inode->i_sb). 
242 - */ 243 - 244 - /* 245 - * Portability note: the last comparison (check that we fit into triple 246 - * indirect block) is spelled differently, because otherwise on an 247 - * architecture with 32-bit longs and 8Kb pages we might get into trouble 248 - * if our filesystem had 8Kb blocks. We might use long long, but that would 249 - * kill us on x86. Oh, well, at least the sign propagation does not matter - 250 - * i_block would have to be negative in the very beginning, so we would not 251 - * get there at all. 252 - */ 253 - 254 - static int ext4_block_to_path(struct inode *inode, 255 - ext4_lblk_t i_block, 256 - ext4_lblk_t offsets[4], int *boundary) 257 - { 258 - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 259 - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 260 - const long direct_blocks = EXT4_NDIR_BLOCKS, 261 - indirect_blocks = ptrs, 262 - double_blocks = (1 << (ptrs_bits * 2)); 263 - int n = 0; 264 - int final = 0; 265 - 266 - if (i_block < direct_blocks) { 267 - offsets[n++] = i_block; 268 - final = direct_blocks; 269 - } else if ((i_block -= direct_blocks) < indirect_blocks) { 270 - offsets[n++] = EXT4_IND_BLOCK; 271 - offsets[n++] = i_block; 272 - final = ptrs; 273 - } else if ((i_block -= indirect_blocks) < double_blocks) { 274 - offsets[n++] = EXT4_DIND_BLOCK; 275 - offsets[n++] = i_block >> ptrs_bits; 276 - offsets[n++] = i_block & (ptrs - 1); 277 - final = ptrs; 278 - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 279 - offsets[n++] = EXT4_TIND_BLOCK; 280 - offsets[n++] = i_block >> (ptrs_bits * 2); 281 - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 282 - offsets[n++] = i_block & (ptrs - 1); 283 - final = ptrs; 284 - } else { 285 - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", 286 - i_block + direct_blocks + 287 - indirect_blocks + double_blocks, inode->i_ino); 288 - } 289 - if (boundary) 290 - *boundary = final - 1 - (i_block & (ptrs - 1)); 291 - return n; 292 - } 293 - 294 - /** 295 - * 
ext4_get_branch - read the chain of indirect blocks leading to data 296 - * @inode: inode in question 297 - * @depth: depth of the chain (1 - direct pointer, etc.) 298 - * @offsets: offsets of pointers in inode/indirect blocks 299 - * @chain: place to store the result 300 - * @err: here we store the error value 301 - * 302 - * Function fills the array of triples <key, p, bh> and returns %NULL 303 - * if everything went OK or the pointer to the last filled triple 304 - * (incomplete one) otherwise. Upon the return chain[i].key contains 305 - * the number of (i+1)-th block in the chain (as it is stored in memory, 306 - * i.e. little-endian 32-bit), chain[i].p contains the address of that 307 - * number (it points into struct inode for i==0 and into the bh->b_data 308 - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 309 - * block for i>0 and NULL for i==0. In other words, it holds the block 310 - * numbers of the chain, addresses they were taken from (and where we can 311 - * verify that chain did not change) and buffer_heads hosting these 312 - * numbers. 313 - * 314 - * Function stops when it stumbles upon zero pointer (absent block) 315 - * (pointer to last triple returned, *@err == 0) 316 - * or when it gets an IO error reading an indirect block 317 - * (ditto, *@err == -EIO) 318 - * or when it reads all @depth-1 indirect blocks successfully and finds 319 - * the whole chain, all way to the data (returns %NULL, *err == 0). 
320 - * 321 - * Need to be called with 322 - * down_read(&EXT4_I(inode)->i_data_sem) 323 - */ 324 - static Indirect *ext4_get_branch(struct inode *inode, int depth, 325 - ext4_lblk_t *offsets, 326 - Indirect chain[4], int *err) 327 - { 328 - struct super_block *sb = inode->i_sb; 329 - Indirect *p = chain; 330 - struct buffer_head *bh; 331 - 332 - *err = 0; 333 - /* i_data is not going away, no lock needed */ 334 - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 335 - if (!p->key) 336 - goto no_block; 337 - while (--depth) { 338 - bh = sb_getblk(sb, le32_to_cpu(p->key)); 339 - if (unlikely(!bh)) 340 - goto failure; 341 - 342 - if (!bh_uptodate_or_lock(bh)) { 343 - if (bh_submit_read(bh) < 0) { 344 - put_bh(bh); 345 - goto failure; 346 - } 347 - /* validate block references */ 348 - if (ext4_check_indirect_blockref(inode, bh)) { 349 - put_bh(bh); 350 - goto failure; 351 - } 352 - } 353 - 354 - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 355 - /* Reader: end */ 356 - if (!p->key) 357 - goto no_block; 358 - } 359 - return NULL; 360 - 361 - failure: 362 - *err = -EIO; 363 - no_block: 364 - return p; 365 - } 366 - 367 - /** 368 - * ext4_find_near - find a place for allocation with sufficient locality 369 - * @inode: owner 370 - * @ind: descriptor of indirect block. 371 - * 372 - * This function returns the preferred place for block allocation. 373 - * It is used when heuristic for sequential allocation fails. 374 - * Rules are: 375 - * + if there is a block to the left of our position - allocate near it. 376 - * + if pointer will live in indirect block - allocate near that block. 377 - * + if pointer will live in inode - allocate in the same 378 - * cylinder group. 379 - * 380 - * In the latter case we colour the starting block by the callers PID to 381 - * prevent it from clashing with concurrent allocations for a different inode 382 - * in the same block group. 
The PID is used here so that functionally related 383 - * files will be close-by on-disk. 384 - * 385 - * Caller must make sure that @ind is valid and will stay that way. 386 - */ 387 - static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 388 - { 389 - struct ext4_inode_info *ei = EXT4_I(inode); 390 - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 391 - __le32 *p; 392 - ext4_fsblk_t bg_start; 393 - ext4_fsblk_t last_block; 394 - ext4_grpblk_t colour; 395 - ext4_group_t block_group; 396 - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 397 - 398 - /* Try to find previous block */ 399 - for (p = ind->p - 1; p >= start; p--) { 400 - if (*p) 401 - return le32_to_cpu(*p); 402 - } 403 - 404 - /* No such thing, so let's try location of indirect block */ 405 - if (ind->bh) 406 - return ind->bh->b_blocknr; 407 - 408 - /* 409 - * It is going to be referred to from the inode itself? OK, just put it 410 - * into the same cylinder group then. 411 - */ 412 - block_group = ei->i_block_group; 413 - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 414 - block_group &= ~(flex_size-1); 415 - if (S_ISREG(inode->i_mode)) 416 - block_group++; 417 - } 418 - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 419 - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 420 - 421 - /* 422 - * If we are doing delayed allocation, we don't need take 423 - * colour into account. 424 - */ 425 - if (test_opt(inode->i_sb, DELALLOC)) 426 - return bg_start; 427 - 428 - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 429 - colour = (current->pid % 16) * 430 - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 431 - else 432 - colour = (current->pid % 16) * ((last_block - bg_start) / 16); 433 - return bg_start + colour; 434 - } 435 - 436 - /** 437 - * ext4_find_goal - find a preferred place for allocation. 
438 - * @inode: owner 439 - * @block: block we want 440 - * @partial: pointer to the last triple within a chain 441 - * 442 - * Normally this function find the preferred place for block allocation, 443 - * returns it. 444 - * Because this is only used for non-extent files, we limit the block nr 445 - * to 32 bits. 446 - */ 447 - static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 448 - Indirect *partial) 449 - { 450 - ext4_fsblk_t goal; 451 - 452 - /* 453 - * XXX need to get goal block from mballoc's data structures 454 - */ 455 - 456 - goal = ext4_find_near(inode, partial); 457 - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 458 - return goal; 459 - } 460 - 461 - /** 462 - * ext4_blks_to_allocate - Look up the block map and count the number 463 - * of direct blocks need to be allocated for the given branch. 464 - * 465 - * @branch: chain of indirect blocks 466 - * @k: number of blocks need for indirect blocks 467 - * @blks: number of data blocks to be mapped. 468 - * @blocks_to_boundary: the offset in the indirect block 469 - * 470 - * return the total number of blocks to be allocate, including the 471 - * direct and indirect blocks. 
472 - */ 473 - static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 474 - int blocks_to_boundary) 475 - { 476 - unsigned int count = 0; 477 - 478 - /* 479 - * Simple case, [t,d]Indirect block(s) has not allocated yet 480 - * then it's clear blocks on that path have not allocated 481 - */ 482 - if (k > 0) { 483 - /* right now we don't handle cross boundary allocation */ 484 - if (blks < blocks_to_boundary + 1) 485 - count += blks; 486 - else 487 - count += blocks_to_boundary + 1; 488 - return count; 489 - } 490 - 491 - count++; 492 - while (count < blks && count <= blocks_to_boundary && 493 - le32_to_cpu(*(branch[0].p + count)) == 0) { 494 - count++; 495 - } 496 - return count; 497 - } 498 - 499 - /** 500 - * ext4_alloc_blocks: multiple allocate blocks needed for a branch 501 - * @handle: handle for this transaction 502 - * @inode: inode which needs allocated blocks 503 - * @iblock: the logical block to start allocated at 504 - * @goal: preferred physical block of allocation 505 - * @indirect_blks: the number of blocks need to allocate for indirect 506 - * blocks 507 - * @blks: number of desired blocks 508 - * @new_blocks: on return it will store the new block numbers for 509 - * the indirect blocks(if needed) and the first direct block, 510 - * @err: on return it will store the error code 511 - * 512 - * This function will return the number of blocks allocated as 513 - * requested by the passed-in parameters. 514 - */ 515 - static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 516 - ext4_lblk_t iblock, ext4_fsblk_t goal, 517 - int indirect_blks, int blks, 518 - ext4_fsblk_t new_blocks[4], int *err) 519 - { 520 - struct ext4_allocation_request ar; 521 - int target, i; 522 - unsigned long count = 0, blk_allocated = 0; 523 - int index = 0; 524 - ext4_fsblk_t current_block = 0; 525 - int ret = 0; 526 - 527 - /* 528 - * Here we try to allocate the requested multiple blocks at once, 529 - * on a best-effort basis. 
530 - * To build a branch, we should allocate blocks for 531 - * the indirect blocks(if not allocated yet), and at least 532 - * the first direct block of this branch. That's the 533 - * minimum number of blocks need to allocate(required) 534 - */ 535 - /* first we try to allocate the indirect blocks */ 536 - target = indirect_blks; 537 - while (target > 0) { 538 - count = target; 539 - /* allocating blocks for indirect blocks and direct blocks */ 540 - current_block = ext4_new_meta_blocks(handle, inode, goal, 541 - 0, &count, err); 542 - if (*err) 543 - goto failed_out; 544 - 545 - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { 546 - EXT4_ERROR_INODE(inode, 547 - "current_block %llu + count %lu > %d!", 548 - current_block, count, 549 - EXT4_MAX_BLOCK_FILE_PHYS); 550 - *err = -EIO; 551 - goto failed_out; 552 - } 553 - 554 - target -= count; 555 - /* allocate blocks for indirect blocks */ 556 - while (index < indirect_blks && count) { 557 - new_blocks[index++] = current_block++; 558 - count--; 559 - } 560 - if (count > 0) { 561 - /* 562 - * save the new block number 563 - * for the first direct block 564 - */ 565 - new_blocks[index] = current_block; 566 - printk(KERN_INFO "%s returned more blocks than " 567 - "requested\n", __func__); 568 - WARN_ON(1); 569 - break; 570 - } 571 - } 572 - 573 - target = blks - count ; 574 - blk_allocated = count; 575 - if (!target) 576 - goto allocated; 577 - /* Now allocate data blocks */ 578 - memset(&ar, 0, sizeof(ar)); 579 - ar.inode = inode; 580 - ar.goal = goal; 581 - ar.len = target; 582 - ar.logical = iblock; 583 - if (S_ISREG(inode->i_mode)) 584 - /* enable in-core preallocation only for regular files */ 585 - ar.flags = EXT4_MB_HINT_DATA; 586 - 587 - current_block = ext4_mb_new_blocks(handle, &ar, err); 588 - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { 589 - EXT4_ERROR_INODE(inode, 590 - "current_block %llu + ar.len %d > %d!", 591 - current_block, ar.len, 592 - 
EXT4_MAX_BLOCK_FILE_PHYS); 593 - *err = -EIO; 594 - goto failed_out; 595 - } 596 - 597 - if (*err && (target == blks)) { 598 - /* 599 - * if the allocation failed and we didn't allocate 600 - * any blocks before 601 - */ 602 - goto failed_out; 603 - } 604 - if (!*err) { 605 - if (target == blks) { 606 - /* 607 - * save the new block number 608 - * for the first direct block 609 - */ 610 - new_blocks[index] = current_block; 611 - } 612 - blk_allocated += ar.len; 613 - } 614 - allocated: 615 - /* total number of blocks allocated for direct blocks */ 616 - ret = blk_allocated; 617 - *err = 0; 618 - return ret; 619 - failed_out: 620 - for (i = 0; i < index; i++) 621 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 622 - return ret; 623 - } 624 - 625 - /** 626 - * ext4_alloc_branch - allocate and set up a chain of blocks. 627 - * @handle: handle for this transaction 628 - * @inode: owner 629 - * @indirect_blks: number of allocated indirect blocks 630 - * @blks: number of allocated direct blocks 631 - * @goal: preferred place for allocation 632 - * @offsets: offsets (in the blocks) to store the pointers to next. 633 - * @branch: place to store the chain in. 634 - * 635 - * This function allocates blocks, zeroes out all but the last one, 636 - * links them into chain and (if we are synchronous) writes them to disk. 637 - * In other words, it prepares a branch that can be spliced onto the 638 - * inode. It stores the information about that chain in the branch[], in 639 - * the same format as ext4_get_branch() would do. We are calling it after 640 - * we had read the existing part of chain and partial points to the last 641 - * triple of that (one with zero ->key). 
Upon the exit we have the same 642 - * picture as after the successful ext4_get_block(), except that in one 643 - * place chain is disconnected - *branch->p is still zero (we did not 644 - * set the last link), but branch->key contains the number that should 645 - * be placed into *branch->p to fill that gap. 646 - * 647 - * If allocation fails we free all blocks we've allocated (and forget 648 - * their buffer_heads) and return the error value the from failed 649 - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 650 - * as described above and return 0. 651 - */ 652 - static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 653 - ext4_lblk_t iblock, int indirect_blks, 654 - int *blks, ext4_fsblk_t goal, 655 - ext4_lblk_t *offsets, Indirect *branch) 656 - { 657 - int blocksize = inode->i_sb->s_blocksize; 658 - int i, n = 0; 659 - int err = 0; 660 - struct buffer_head *bh; 661 - int num; 662 - ext4_fsblk_t new_blocks[4]; 663 - ext4_fsblk_t current_block; 664 - 665 - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 666 - *blks, new_blocks, &err); 667 - if (err) 668 - return err; 669 - 670 - branch[0].key = cpu_to_le32(new_blocks[0]); 671 - /* 672 - * metadata blocks and data blocks are allocated. 673 - */ 674 - for (n = 1; n <= indirect_blks; n++) { 675 - /* 676 - * Get buffer_head for parent block, zero it out 677 - * and set the pointer to new one, then send 678 - * parent to disk. 
679 - */ 680 - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 681 - if (unlikely(!bh)) { 682 - err = -EIO; 683 - goto failed; 684 - } 685 - 686 - branch[n].bh = bh; 687 - lock_buffer(bh); 688 - BUFFER_TRACE(bh, "call get_create_access"); 689 - err = ext4_journal_get_create_access(handle, bh); 690 - if (err) { 691 - /* Don't brelse(bh) here; it's done in 692 - * ext4_journal_forget() below */ 693 - unlock_buffer(bh); 694 - goto failed; 695 - } 696 - 697 - memset(bh->b_data, 0, blocksize); 698 - branch[n].p = (__le32 *) bh->b_data + offsets[n]; 699 - branch[n].key = cpu_to_le32(new_blocks[n]); 700 - *branch[n].p = branch[n].key; 701 - if (n == indirect_blks) { 702 - current_block = new_blocks[n]; 703 - /* 704 - * End of chain, update the last new metablock of 705 - * the chain to point to the new allocated 706 - * data blocks numbers 707 - */ 708 - for (i = 1; i < num; i++) 709 - *(branch[n].p + i) = cpu_to_le32(++current_block); 710 - } 711 - BUFFER_TRACE(bh, "marking uptodate"); 712 - set_buffer_uptodate(bh); 713 - unlock_buffer(bh); 714 - 715 - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 716 - err = ext4_handle_dirty_metadata(handle, inode, bh); 717 - if (err) 718 - goto failed; 719 - } 720 - *blks = num; 721 - return err; 722 - failed: 723 - /* Allocation failed, free what we already allocated */ 724 - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 725 - for (i = 1; i <= n ; i++) { 726 - /* 727 - * branch[i].bh is newly allocated, so there is no 728 - * need to revoke the block, which is why we don't 729 - * need to set EXT4_FREE_BLOCKS_METADATA. 
730 - */ 731 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 732 - EXT4_FREE_BLOCKS_FORGET); 733 - } 734 - for (i = n+1; i < indirect_blks; i++) 735 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 736 - 737 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 738 - 739 - return err; 740 - } 741 - 742 - /** 743 - * ext4_splice_branch - splice the allocated branch onto inode. 744 - * @handle: handle for this transaction 745 - * @inode: owner 746 - * @block: (logical) number of block we are adding 747 - * @chain: chain of indirect blocks (with a missing link - see 748 - * ext4_alloc_branch) 749 - * @where: location of missing link 750 - * @num: number of indirect blocks we are adding 751 - * @blks: number of direct blocks we are adding 752 - * 753 - * This function fills the missing link and does all housekeeping needed in 754 - * inode (->i_blocks, etc.). In case of success we end up with the full 755 - * chain to new block and return 0. 756 - */ 757 - static int ext4_splice_branch(handle_t *handle, struct inode *inode, 758 - ext4_lblk_t block, Indirect *where, int num, 759 - int blks) 760 - { 761 - int i; 762 - int err = 0; 763 - ext4_fsblk_t current_block; 764 - 765 - /* 766 - * If we're splicing into a [td]indirect block (as opposed to the 767 - * inode) then we need to get write access to the [td]indirect block 768 - * before the splice. 
769 - */ 770 - if (where->bh) { 771 - BUFFER_TRACE(where->bh, "get_write_access"); 772 - err = ext4_journal_get_write_access(handle, where->bh); 773 - if (err) 774 - goto err_out; 775 - } 776 - /* That's it */ 777 - 778 - *where->p = where->key; 779 - 780 - /* 781 - * Update the host buffer_head or inode to point to more just allocated 782 - * direct blocks blocks 783 - */ 784 - if (num == 0 && blks > 1) { 785 - current_block = le32_to_cpu(where->key) + 1; 786 - for (i = 1; i < blks; i++) 787 - *(where->p + i) = cpu_to_le32(current_block++); 788 - } 789 - 790 - /* We are done with atomic stuff, now do the rest of housekeeping */ 791 - /* had we spliced it onto indirect block? */ 792 - if (where->bh) { 793 - /* 794 - * If we spliced it onto an indirect block, we haven't 795 - * altered the inode. Note however that if it is being spliced 796 - * onto an indirect block at the very end of the file (the 797 - * file is growing) then we *will* alter the inode to reflect 798 - * the new i_size. But that is not done here - it is done in 799 - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 800 - */ 801 - jbd_debug(5, "splicing indirect only\n"); 802 - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 803 - err = ext4_handle_dirty_metadata(handle, inode, where->bh); 804 - if (err) 805 - goto err_out; 806 - } else { 807 - /* 808 - * OK, we spliced it into the inode itself on a direct block. 809 - */ 810 - ext4_mark_inode_dirty(handle, inode); 811 - jbd_debug(5, "splicing direct\n"); 812 - } 813 - return err; 814 - 815 - err_out: 816 - for (i = 1; i <= num; i++) { 817 - /* 818 - * branch[i].bh is newly allocated, so there is no 819 - * need to revoke the block, which is why we don't 820 - * need to set EXT4_FREE_BLOCKS_METADATA. 
821 - */ 822 - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 823 - EXT4_FREE_BLOCKS_FORGET); 824 - } 825 - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 826 - blks, 0); 827 - 828 - return err; 829 - } 830 - 831 - /* 832 - * The ext4_ind_map_blocks() function handles non-extents inodes 833 - * (i.e., using the traditional indirect/double-indirect i_blocks 834 - * scheme) for ext4_map_blocks(). 835 - * 836 - * Allocation strategy is simple: if we have to allocate something, we will 837 - * have to go the whole way to leaf. So let's do it before attaching anything 838 - * to tree, set linkage between the newborn blocks, write them if sync is 839 - * required, recheck the path, free and repeat if check fails, otherwise 840 - * set the last missing link (that will protect us from any truncate-generated 841 - * removals - all blocks on the path are immune now) and possibly force the 842 - * write on the parent block. 843 - * That has a nice additional property: no special recovery from the failed 844 - * allocations is needed - we simply release blocks and do not touch anything 845 - * reachable from inode. 846 - * 847 - * `handle' can be NULL if create == 0. 848 - * 849 - * return > 0, # of blocks mapped or allocated. 850 - * return = 0, if plain lookup failed. 851 - * return < 0, error case. 852 - * 853 - * The ext4_ind_get_blocks() function should be called with 854 - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 855 - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 856 - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 857 - * blocks. 
858 - */ 859 - static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 860 - struct ext4_map_blocks *map, 861 - int flags) 862 - { 863 - int err = -EIO; 864 - ext4_lblk_t offsets[4]; 865 - Indirect chain[4]; 866 - Indirect *partial; 867 - ext4_fsblk_t goal; 868 - int indirect_blks; 869 - int blocks_to_boundary = 0; 870 - int depth; 871 - int count = 0; 872 - ext4_fsblk_t first_block = 0; 873 - 874 - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 875 - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 876 - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 877 - depth = ext4_block_to_path(inode, map->m_lblk, offsets, 878 - &blocks_to_boundary); 879 - 880 - if (depth == 0) 881 - goto out; 882 - 883 - partial = ext4_get_branch(inode, depth, offsets, chain, &err); 884 - 885 - /* Simplest case - block found, no allocation needed */ 886 - if (!partial) { 887 - first_block = le32_to_cpu(chain[depth - 1].key); 888 - count++; 889 - /*map more blocks*/ 890 - while (count < map->m_len && count <= blocks_to_boundary) { 891 - ext4_fsblk_t blk; 892 - 893 - blk = le32_to_cpu(*(chain[depth-1].p + count)); 894 - 895 - if (blk == first_block + count) 896 - count++; 897 - else 898 - break; 899 - } 900 - goto got_it; 901 - } 902 - 903 - /* Next simple case - plain lookup or failed read of indirect block */ 904 - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 905 - goto cleanup; 906 - 907 - /* 908 - * Okay, we need to do block allocation. 909 - */ 910 - goal = ext4_find_goal(inode, map->m_lblk, partial); 911 - 912 - /* the number of blocks need to allocate for [d,t]indirect blocks */ 913 - indirect_blks = (chain + depth) - partial - 1; 914 - 915 - /* 916 - * Next look up the indirect map to count the totoal number of 917 - * direct blocks to allocate for this branch. 
918 - */ 919 - count = ext4_blks_to_allocate(partial, indirect_blks, 920 - map->m_len, blocks_to_boundary); 921 - /* 922 - * Block out ext4_truncate while we alter the tree 923 - */ 924 - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 925 - &count, goal, 926 - offsets + (partial - chain), partial); 927 - 928 - /* 929 - * The ext4_splice_branch call will free and forget any buffers 930 - * on the new chain if there is a failure, but that risks using 931 - * up transaction credits, especially for bitmaps where the 932 - * credits cannot be returned. Can we handle this somehow? We 933 - * may need to return -EAGAIN upwards in the worst case. --sct 934 - */ 935 - if (!err) 936 - err = ext4_splice_branch(handle, inode, map->m_lblk, 937 - partial, indirect_blks, count); 938 - if (err) 939 - goto cleanup; 940 - 941 - map->m_flags |= EXT4_MAP_NEW; 942 - 943 - ext4_update_inode_fsync_trans(handle, inode, 1); 944 - got_it: 945 - map->m_flags |= EXT4_MAP_MAPPED; 946 - map->m_pblk = le32_to_cpu(chain[depth-1].key); 947 - map->m_len = count; 948 - if (count > blocks_to_boundary) 949 - map->m_flags |= EXT4_MAP_BOUNDARY; 950 - err = count; 951 - /* Clean up and exit */ 952 - partial = chain + depth - 1; /* the whole chain */ 953 - cleanup: 954 - while (partial > chain) { 955 - BUFFER_TRACE(partial->bh, "call brelse"); 956 - brelse(partial->bh); 957 - partial--; 958 - } 959 - out: 960 - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 961 - map->m_pblk, map->m_len, err); 962 - return err; 963 - } 964 - 965 254 #ifdef CONFIG_QUOTA 966 255 qsize_t *ext4_get_reserved_space(struct inode *inode) 967 256 { 968 257 return &EXT4_I(inode)->i_reserved_quota; 969 258 } 970 259 #endif 971 - 972 - /* 973 - * Calculate the number of metadata blocks need to reserve 974 - * to allocate a new block at @lblocks for non extent file based file 975 - */ 976 - static int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) 977 - { 978 - struct ext4_inode_info *ei = 
EXT4_I(inode); 979 - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 980 - int blk_bits; 981 - 982 - if (lblock < EXT4_NDIR_BLOCKS) 983 - return 0; 984 - 985 - lblock -= EXT4_NDIR_BLOCKS; 986 - 987 - if (ei->i_da_metadata_calc_len && 988 - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 989 - ei->i_da_metadata_calc_len++; 990 - return 0; 991 - } 992 - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 993 - ei->i_da_metadata_calc_len = 1; 994 - blk_bits = order_base_2(lblock); 995 - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 996 - } 997 260 998 261 /* 999 262 * Calculate the number of metadata blocks need to reserve ··· 2557 3380 } 2558 3381 2559 3382 /* 2560 - * O_DIRECT for ext3 (or indirect map) based files 2561 - * 2562 - * If the O_DIRECT write will extend the file then add this inode to the 2563 - * orphan list. So recovery will truncate it back to the original size 2564 - * if the machine crashes during the write. 2565 - * 2566 - * If the O_DIRECT write is intantiating holes inside i_size and the machine 2567 - * crashes then stale disk data _may_ be exposed inside the file. But current 2568 - * VFS code falls back into buffered path in that case so we are safe. 
2569 - */ 2570 - static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 2571 - const struct iovec *iov, loff_t offset, 2572 - unsigned long nr_segs) 2573 - { 2574 - struct file *file = iocb->ki_filp; 2575 - struct inode *inode = file->f_mapping->host; 2576 - struct ext4_inode_info *ei = EXT4_I(inode); 2577 - handle_t *handle; 2578 - ssize_t ret; 2579 - int orphan = 0; 2580 - size_t count = iov_length(iov, nr_segs); 2581 - int retries = 0; 2582 - 2583 - if (rw == WRITE) { 2584 - loff_t final_size = offset + count; 2585 - 2586 - if (final_size > inode->i_size) { 2587 - /* Credits for sb + inode write */ 2588 - handle = ext4_journal_start(inode, 2); 2589 - if (IS_ERR(handle)) { 2590 - ret = PTR_ERR(handle); 2591 - goto out; 2592 - } 2593 - ret = ext4_orphan_add(handle, inode); 2594 - if (ret) { 2595 - ext4_journal_stop(handle); 2596 - goto out; 2597 - } 2598 - orphan = 1; 2599 - ei->i_disksize = inode->i_size; 2600 - ext4_journal_stop(handle); 2601 - } 2602 - } 2603 - 2604 - retry: 2605 - if (rw == READ && ext4_should_dioread_nolock(inode)) 2606 - ret = __blockdev_direct_IO(rw, iocb, inode, 2607 - inode->i_sb->s_bdev, iov, 2608 - offset, nr_segs, 2609 - ext4_get_block, NULL, NULL, 0); 2610 - else { 2611 - ret = blockdev_direct_IO(rw, iocb, inode, 2612 - inode->i_sb->s_bdev, iov, 2613 - offset, nr_segs, 2614 - ext4_get_block, NULL); 2615 - 2616 - if (unlikely((rw & WRITE) && ret < 0)) { 2617 - loff_t isize = i_size_read(inode); 2618 - loff_t end = offset + iov_length(iov, nr_segs); 2619 - 2620 - if (end > isize) 2621 - ext4_truncate_failed_write(inode); 2622 - } 2623 - } 2624 - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2625 - goto retry; 2626 - 2627 - if (orphan) { 2628 - int err; 2629 - 2630 - /* Credits for sb + inode write */ 2631 - handle = ext4_journal_start(inode, 2); 2632 - if (IS_ERR(handle)) { 2633 - /* This is really bad luck. We've written the data 2634 - * but cannot extend i_size. 
Bail out and pretend 2635 - * the write failed... */ 2636 - ret = PTR_ERR(handle); 2637 - if (inode->i_nlink) 2638 - ext4_orphan_del(NULL, inode); 2639 - 2640 - goto out; 2641 - } 2642 - if (inode->i_nlink) 2643 - ext4_orphan_del(handle, inode); 2644 - if (ret > 0) { 2645 - loff_t end = offset + ret; 2646 - if (end > inode->i_size) { 2647 - ei->i_disksize = end; 2648 - i_size_write(inode, end); 2649 - /* 2650 - * We're going to return a positive `ret' 2651 - * here due to non-zero-length I/O, so there's 2652 - * no way of reporting error returns from 2653 - * ext4_mark_inode_dirty() to userspace. So 2654 - * ignore it. 2655 - */ 2656 - ext4_mark_inode_dirty(handle, inode); 2657 - } 2658 - } 2659 - err = ext4_journal_stop(handle); 2660 - if (ret == 0) 2661 - ret = err; 2662 - } 2663 - out: 2664 - return ret; 2665 - } 2666 - 2667 - /* 2668 3383 * ext4_get_block used when preparing for a DIO write or buffer write. 2669 3384 * We allocate an uinitialized extent if blocks haven't been allocated. 2670 3385 * The extent will be converted to initialized after the IO is complete. ··· 3027 3958 return err; 3028 3959 } 3029 3960 3030 - /* 3031 - * Probably it should be a library function... search for first non-zero word 3032 - * or memcmp with zero_page, whatever is better for particular architecture. 3033 - * Linus? 3034 - */ 3035 - static inline int all_zeroes(__le32 *p, __le32 *q) 3036 - { 3037 - while (p < q) 3038 - if (*p++) 3039 - return 0; 3040 - return 1; 3041 - } 3042 - 3043 - /** 3044 - * ext4_find_shared - find the indirect blocks for partial truncation. 3045 - * @inode: inode in question 3046 - * @depth: depth of the affected branch 3047 - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3048 - * @chain: place to store the pointers to partial indirect blocks 3049 - * @top: place to the (detached) top of branch 3050 - * 3051 - * This is a helper function used by ext4_truncate(). 
3052 - * 3053 - * When we do truncate() we may have to clean the ends of several 3054 - * indirect blocks but leave the blocks themselves alive. Block is 3055 - * partially truncated if some data below the new i_size is referred 3056 - * from it (and it is on the path to the first completely truncated 3057 - * data block, indeed). We have to free the top of that path along 3058 - * with everything to the right of the path. Since no allocation 3059 - * past the truncation point is possible until ext4_truncate() 3060 - * finishes, we may safely do the latter, but top of branch may 3061 - * require special attention - pageout below the truncation point 3062 - * might try to populate it. 3063 - * 3064 - * We atomically detach the top of branch from the tree, store the 3065 - * block number of its root in *@top, pointers to buffer_heads of 3066 - * partially truncated blocks - in @chain[].bh and pointers to 3067 - * their last elements that should not be removed - in 3068 - * @chain[].p. Return value is the pointer to last filled element 3069 - * of @chain. 3070 - * 3071 - * The work left to caller to do the actual freeing of subtrees: 3072 - * a) free the subtree starting from *@top 3073 - * b) free the subtrees whose roots are stored in 3074 - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3075 - * c) free the subtrees growing from the inode past the @chain[0]. 3076 - * (no partially truncated stuff there). 
*/ 3077 - 3078 - static Indirect *ext4_find_shared(struct inode *inode, int depth, 3079 - ext4_lblk_t offsets[4], Indirect chain[4], 3080 - __le32 *top) 3081 - { 3082 - Indirect *partial, *p; 3083 - int k, err; 3084 - 3085 - *top = 0; 3086 - /* Make k index the deepest non-null offset + 1 */ 3087 - for (k = depth; k > 1 && !offsets[k-1]; k--) 3088 - ; 3089 - partial = ext4_get_branch(inode, k, offsets, chain, &err); 3090 - /* Writer: pointers */ 3091 - if (!partial) 3092 - partial = chain + k-1; 3093 - /* 3094 - * If the branch acquired continuation since we've looked at it - 3095 - * fine, it should all survive and (new) top doesn't belong to us. 3096 - */ 3097 - if (!partial->key && *partial->p) 3098 - /* Writer: end */ 3099 - goto no_top; 3100 - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3101 - ; 3102 - /* 3103 - * OK, we've found the last block that must survive. The rest of our 3104 - * branch should be detached before unlocking. However, if that rest 3105 - * of branch is all ours and does not grow immediately from the inode 3106 - * it's easier to cheat and just decrement partial->p. 3107 - */ 3108 - if (p == chain + k - 1 && p > chain) { 3109 - p->p--; 3110 - } else { 3111 - *top = *p->p; 3112 - /* Nope, don't do this in ext4. Must leave the tree intact */ 3113 - #if 0 3114 - *p->p = 0; 3115 - #endif 3116 - } 3117 - /* Writer: end */ 3118 - 3119 - while (partial > p) { 3120 - brelse(partial->bh); 3121 - partial--; 3122 - } 3123 - no_top: 3124 - return partial; 3125 - } 3126 - 3127 - /* 3128 - * Zero a number of block pointers in either an inode or an indirect block. 3129 - * If we restart the transaction we must again get write access to the 3130 - * indirect block for further modification. 3131 - * 3132 - * We release `count' blocks on disk, but (last - first) may be greater 3133 - * than `count' because there can be holes in there. 
3134 - * 3135 - * Return 0 on success, 1 on invalid block range 3136 - * and < 0 on fatal error. 3137 - */ 3138 - static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 3139 - struct buffer_head *bh, 3140 - ext4_fsblk_t block_to_free, 3141 - unsigned long count, __le32 *first, 3142 - __le32 *last) 3143 - { 3144 - __le32 *p; 3145 - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 3146 - int err; 3147 - 3148 - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 3149 - flags |= EXT4_FREE_BLOCKS_METADATA; 3150 - 3151 - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 3152 - count)) { 3153 - EXT4_ERROR_INODE(inode, "attempt to clear invalid " 3154 - "blocks %llu len %lu", 3155 - (unsigned long long) block_to_free, count); 3156 - return 1; 3157 - } 3158 - 3159 - if (try_to_extend_transaction(handle, inode)) { 3160 - if (bh) { 3161 - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 3162 - err = ext4_handle_dirty_metadata(handle, inode, bh); 3163 - if (unlikely(err)) 3164 - goto out_err; 3165 - } 3166 - err = ext4_mark_inode_dirty(handle, inode); 3167 - if (unlikely(err)) 3168 - goto out_err; 3169 - err = ext4_truncate_restart_trans(handle, inode, 3170 - ext4_blocks_for_truncate(inode)); 3171 - if (unlikely(err)) 3172 - goto out_err; 3173 - if (bh) { 3174 - BUFFER_TRACE(bh, "retaking write access"); 3175 - err = ext4_journal_get_write_access(handle, bh); 3176 - if (unlikely(err)) 3177 - goto out_err; 3178 - } 3179 - } 3180 - 3181 - for (p = first; p < last; p++) 3182 - *p = 0; 3183 - 3184 - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 3185 - return 0; 3186 - out_err: 3187 - ext4_std_error(inode->i_sb, err); 3188 - return err; 3189 - } 3190 - 3191 - /** 3192 - * ext4_free_data - free a list of data blocks 3193 - * @handle: handle for this transaction 3194 - * @inode: inode we are dealing with 3195 - * @this_bh: indirect buffer_head which contains *@first and *@last 3196 - * @first: array of block 
numbers 3197 - * @last: points immediately past the end of array 3198 - * 3199 - * We are freeing all blocks referred from that array (numbers are stored as 3200 - * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3201 - * 3202 - * We accumulate contiguous runs of blocks to free. Conveniently, if these 3203 - * blocks are contiguous then releasing them at one time will only affect one 3204 - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3205 - * actually use a lot of journal space. 3206 - * 3207 - * @this_bh will be %NULL if @first and @last point into the inode's direct 3208 - * block pointers. 3209 - */ 3210 - static void ext4_free_data(handle_t *handle, struct inode *inode, 3211 - struct buffer_head *this_bh, 3212 - __le32 *first, __le32 *last) 3213 - { 3214 - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3215 - unsigned long count = 0; /* Number of blocks in the run */ 3216 - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3217 - corresponding to 3218 - block_to_free */ 3219 - ext4_fsblk_t nr; /* Current block # */ 3220 - __le32 *p; /* Pointer into inode/ind 3221 - for current block */ 3222 - int err = 0; 3223 - 3224 - if (this_bh) { /* For indirect block */ 3225 - BUFFER_TRACE(this_bh, "get_write_access"); 3226 - err = ext4_journal_get_write_access(handle, this_bh); 3227 - /* Important: if we can't update the indirect pointers 3228 - * to the blocks, we can't free them. 
*/ 3229 - if (err) 3230 - return; 3231 - } 3232 - 3233 - for (p = first; p < last; p++) { 3234 - nr = le32_to_cpu(*p); 3235 - if (nr) { 3236 - /* accumulate blocks to free if they're contiguous */ 3237 - if (count == 0) { 3238 - block_to_free = nr; 3239 - block_to_free_p = p; 3240 - count = 1; 3241 - } else if (nr == block_to_free + count) { 3242 - count++; 3243 - } else { 3244 - err = ext4_clear_blocks(handle, inode, this_bh, 3245 - block_to_free, count, 3246 - block_to_free_p, p); 3247 - if (err) 3248 - break; 3249 - block_to_free = nr; 3250 - block_to_free_p = p; 3251 - count = 1; 3252 - } 3253 - } 3254 - } 3255 - 3256 - if (!err && count > 0) 3257 - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3258 - count, block_to_free_p, p); 3259 - if (err < 0) 3260 - /* fatal error */ 3261 - return; 3262 - 3263 - if (this_bh) { 3264 - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 3265 - 3266 - /* 3267 - * The buffer head should have an attached journal head at this 3268 - * point. However, if the data is corrupted and an indirect 3269 - * block pointed to itself, it would have been detached when 3270 - * the block was cleared. Check for this instead of OOPSing. 
3271 - */ 3272 - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 3273 - ext4_handle_dirty_metadata(handle, inode, this_bh); 3274 - else 3275 - EXT4_ERROR_INODE(inode, 3276 - "circular indirect block detected at " 3277 - "block %llu", 3278 - (unsigned long long) this_bh->b_blocknr); 3279 - } 3280 - } 3281 - 3282 - /** 3283 - * ext4_free_branches - free an array of branches 3284 - * @handle: JBD handle for this transaction 3285 - * @inode: inode we are dealing with 3286 - * @parent_bh: the buffer_head which contains *@first and *@last 3287 - * @first: array of block numbers 3288 - * @last: pointer immediately past the end of array 3289 - * @depth: depth of the branches to free 3290 - * 3291 - * We are freeing all blocks referred from these branches (numbers are 3292 - * stored as little-endian 32-bit) and updating @inode->i_blocks 3293 - * appropriately. 3294 - */ 3295 - static void ext4_free_branches(handle_t *handle, struct inode *inode, 3296 - struct buffer_head *parent_bh, 3297 - __le32 *first, __le32 *last, int depth) 3298 - { 3299 - ext4_fsblk_t nr; 3300 - __le32 *p; 3301 - 3302 - if (ext4_handle_is_aborted(handle)) 3303 - return; 3304 - 3305 - if (depth--) { 3306 - struct buffer_head *bh; 3307 - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3308 - p = last; 3309 - while (--p >= first) { 3310 - nr = le32_to_cpu(*p); 3311 - if (!nr) 3312 - continue; /* A hole */ 3313 - 3314 - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 3315 - nr, 1)) { 3316 - EXT4_ERROR_INODE(inode, 3317 - "invalid indirect mapped " 3318 - "block %lu (level %d)", 3319 - (unsigned long) nr, depth); 3320 - break; 3321 - } 3322 - 3323 - /* Go read the buffer for the next level down */ 3324 - bh = sb_bread(inode->i_sb, nr); 3325 - 3326 - /* 3327 - * A read failure? Report error and clear slot 3328 - * (should be rare). 3329 - */ 3330 - if (!bh) { 3331 - EXT4_ERROR_INODE_BLOCK(inode, nr, 3332 - "Read failure"); 3333 - continue; 3334 - } 3335 - 3336 - /* This zaps the entire block. 
Bottom up. */ 3337 - BUFFER_TRACE(bh, "free child branches"); 3338 - ext4_free_branches(handle, inode, bh, 3339 - (__le32 *) bh->b_data, 3340 - (__le32 *) bh->b_data + addr_per_block, 3341 - depth); 3342 - brelse(bh); 3343 - 3344 - /* 3345 - * Everything below this this pointer has been 3346 - * released. Now let this top-of-subtree go. 3347 - * 3348 - * We want the freeing of this indirect block to be 3349 - * atomic in the journal with the updating of the 3350 - * bitmap block which owns it. So make some room in 3351 - * the journal. 3352 - * 3353 - * We zero the parent pointer *after* freeing its 3354 - * pointee in the bitmaps, so if extend_transaction() 3355 - * for some reason fails to put the bitmap changes and 3356 - * the release into the same transaction, recovery 3357 - * will merely complain about releasing a free block, 3358 - * rather than leaking blocks. 3359 - */ 3360 - if (ext4_handle_is_aborted(handle)) 3361 - return; 3362 - if (try_to_extend_transaction(handle, inode)) { 3363 - ext4_mark_inode_dirty(handle, inode); 3364 - ext4_truncate_restart_trans(handle, inode, 3365 - ext4_blocks_for_truncate(inode)); 3366 - } 3367 - 3368 - /* 3369 - * The forget flag here is critical because if 3370 - * we are journaling (and not doing data 3371 - * journaling), we have to make sure a revoke 3372 - * record is written to prevent the journal 3373 - * replay from overwriting the (former) 3374 - * indirect block if it gets reallocated as a 3375 - * data block. This must happen in the same 3376 - * transaction where the data blocks are 3377 - * actually freed. 
3378 - */ 3379 - ext4_free_blocks(handle, inode, NULL, nr, 1, 3380 - EXT4_FREE_BLOCKS_METADATA| 3381 - EXT4_FREE_BLOCKS_FORGET); 3382 - 3383 - if (parent_bh) { 3384 - /* 3385 - * The block which we have just freed is 3386 - * pointed to by an indirect block: journal it 3387 - */ 3388 - BUFFER_TRACE(parent_bh, "get_write_access"); 3389 - if (!ext4_journal_get_write_access(handle, 3390 - parent_bh)){ 3391 - *p = 0; 3392 - BUFFER_TRACE(parent_bh, 3393 - "call ext4_handle_dirty_metadata"); 3394 - ext4_handle_dirty_metadata(handle, 3395 - inode, 3396 - parent_bh); 3397 - } 3398 - } 3399 - } 3400 - } else { 3401 - /* We have reached the bottom of the tree. */ 3402 - BUFFER_TRACE(parent_bh, "free data blocks"); 3403 - ext4_free_data(handle, inode, parent_bh, first, last); 3404 - } 3405 - } 3406 - 3407 3961 int ext4_can_truncate(struct inode *inode) 3408 3962 { 3409 3963 if (S_ISREG(inode->i_mode)) ··· 3108 4416 else 3109 4417 ext4_ind_truncate(inode); 3110 4418 3111 - trace_ext4_truncate_exit(inode); 3112 - } 3113 - 3114 - void ext4_ind_truncate(struct inode *inode) 3115 - { 3116 - handle_t *handle; 3117 - struct ext4_inode_info *ei = EXT4_I(inode); 3118 - __le32 *i_data = ei->i_data; 3119 - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3120 - struct address_space *mapping = inode->i_mapping; 3121 - ext4_lblk_t offsets[4]; 3122 - Indirect chain[4]; 3123 - Indirect *partial; 3124 - __le32 nr = 0; 3125 - int n = 0; 3126 - ext4_lblk_t last_block, max_block; 3127 - unsigned blocksize = inode->i_sb->s_blocksize; 3128 - 3129 - handle = start_transaction(inode); 3130 - if (IS_ERR(handle)) 3131 - return; /* AKPM: return what? 
*/ 3132 - 3133 - last_block = (inode->i_size + blocksize-1) 3134 - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3135 - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 3136 - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3137 - 3138 - if (inode->i_size & (blocksize - 1)) 3139 - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 3140 - goto out_stop; 3141 - 3142 - if (last_block != max_block) { 3143 - n = ext4_block_to_path(inode, last_block, offsets, NULL); 3144 - if (n == 0) 3145 - goto out_stop; /* error */ 3146 - } 3147 - 3148 - /* 3149 - * OK. This truncate is going to happen. We add the inode to the 3150 - * orphan list, so that if this truncate spans multiple transactions, 3151 - * and we crash, we will resume the truncate when the filesystem 3152 - * recovers. It also marks the inode dirty, to catch the new size. 3153 - * 3154 - * Implication: the file must always be in a sane, consistent 3155 - * truncatable state while each transaction commits. 3156 - */ 3157 - if (ext4_orphan_add(handle, inode)) 3158 - goto out_stop; 3159 - 3160 - /* 3161 - * From here we block out all ext4_get_block() callers who want to 3162 - * modify the block allocation tree. 3163 - */ 3164 - down_write(&ei->i_data_sem); 3165 - 3166 - ext4_discard_preallocations(inode); 3167 - 3168 - /* 3169 - * The orphan list entry will now protect us from any crash which 3170 - * occurs before the truncate completes, so it is now safe to propagate 3171 - * the new, shorter inode size (held for now in i_size) into the 3172 - * on-disk inode. We do this via i_disksize, which is the value which 3173 - * ext4 *really* writes onto the disk inode. 3174 - */ 3175 - ei->i_disksize = inode->i_size; 3176 - 3177 - if (last_block == max_block) { 3178 - /* 3179 - * It is unnecessary to free any data blocks if last_block is 3180 - * equal to the indirect block limit. 
3181 - */ 3182 - goto out_unlock; 3183 - } else if (n == 1) { /* direct blocks */ 3184 - ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3185 - i_data + EXT4_NDIR_BLOCKS); 3186 - goto do_indirects; 3187 - } 3188 - 3189 - partial = ext4_find_shared(inode, n, offsets, chain, &nr); 3190 - /* Kill the top of shared branch (not detached) */ 3191 - if (nr) { 3192 - if (partial == chain) { 3193 - /* Shared branch grows from the inode */ 3194 - ext4_free_branches(handle, inode, NULL, 3195 - &nr, &nr+1, (chain+n-1) - partial); 3196 - *partial->p = 0; 3197 - /* 3198 - * We mark the inode dirty prior to restart, 3199 - * and prior to stop. No need for it here. 3200 - */ 3201 - } else { 3202 - /* Shared branch grows from an indirect block */ 3203 - BUFFER_TRACE(partial->bh, "get_write_access"); 3204 - ext4_free_branches(handle, inode, partial->bh, 3205 - partial->p, 3206 - partial->p+1, (chain+n-1) - partial); 3207 - } 3208 - } 3209 - /* Clear the ends of indirect blocks on the shared branch */ 3210 - while (partial > chain) { 3211 - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 3212 - (__le32*)partial->bh->b_data+addr_per_block, 3213 - (chain+n-1) - partial); 3214 - BUFFER_TRACE(partial->bh, "call brelse"); 3215 - brelse(partial->bh); 3216 - partial--; 3217 - } 3218 - do_indirects: 3219 - /* Kill the remaining (whole) subtrees */ 3220 - switch (offsets[0]) { 3221 - default: 3222 - nr = i_data[EXT4_IND_BLOCK]; 3223 - if (nr) { 3224 - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 3225 - i_data[EXT4_IND_BLOCK] = 0; 3226 - } 3227 - case EXT4_IND_BLOCK: 3228 - nr = i_data[EXT4_DIND_BLOCK]; 3229 - if (nr) { 3230 - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 3231 - i_data[EXT4_DIND_BLOCK] = 0; 3232 - } 3233 - case EXT4_DIND_BLOCK: 3234 - nr = i_data[EXT4_TIND_BLOCK]; 3235 - if (nr) { 3236 - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 3237 - i_data[EXT4_TIND_BLOCK] = 0; 3238 - } 3239 - case EXT4_TIND_BLOCK: 3240 - ; 3241 - } 
3242 - 3243 - out_unlock: 3244 - up_write(&ei->i_data_sem); 3245 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3246 - ext4_mark_inode_dirty(handle, inode); 3247 - 3248 - /* 3249 - * In a multi-transaction truncate, we only make the final transaction 3250 - * synchronous 3251 - */ 3252 - if (IS_SYNC(inode)) 3253 - ext4_handle_sync(handle); 3254 - out_stop: 3255 - /* 3256 - * If this was a simple ftruncate(), and the file will remain alive 3257 - * then we need to clear up the orphan record which we created above. 3258 - * However, if this was a real unlink then we were called by 3259 - * ext4_delete_inode(), and we allow that function to clean up the 3260 - * orphan info for us. 3261 - */ 3262 - if (inode->i_nlink) 3263 - ext4_orphan_del(handle, inode); 3264 - 3265 - ext4_journal_stop(handle); 3266 4419 trace_ext4_truncate_exit(inode); 3267 4420 } 3268 4421 ··· 3921 5384 3922 5385 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 3923 5386 return 0; 3924 - } 3925 - 3926 - static int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) 3927 - { 3928 - int indirects; 3929 - 3930 - /* if nrblocks are contiguous */ 3931 - if (chunk) { 3932 - /* 3933 - * With N contiguous data blocks, we need at most 3934 - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 3935 - * 2 dindirect blocks, and 1 tindirect block 3936 - */ 3937 - return DIV_ROUND_UP(nrblocks, 3938 - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 3939 - } 3940 - /* 3941 - * if nrblocks are not contiguous, worse case, each block touch 3942 - * a indirect block, and each indirect block touch a double indirect 3943 - * block, plus a triple indirect block 3944 - */ 3945 - indirects = nrblocks * 2 + 1; 3946 - return indirects; 3947 5387 } 3948 5388 3949 5389 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)