Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.19 3219 lines 96 kB view raw
1/* 2 * linux/fs/ext3/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 23 */ 24 25#include <linux/module.h> 26#include <linux/fs.h> 27#include <linux/time.h> 28#include <linux/ext3_jbd.h> 29#include <linux/jbd.h> 30#include <linux/smp_lock.h> 31#include <linux/highuid.h> 32#include <linux/pagemap.h> 33#include <linux/quotaops.h> 34#include <linux/string.h> 35#include <linux/buffer_head.h> 36#include <linux/writeback.h> 37#include <linux/mpage.h> 38#include <linux/uio.h> 39#include <linux/bio.h> 40#include "xattr.h" 41#include "acl.h" 42 43static int ext3_writepage_trans_blocks(struct inode *inode); 44 45/* 46 * Test whether an inode is a fast symlink. 47 */ 48static int ext3_inode_is_fast_symlink(struct inode *inode) 49{ 50 int ea_blocks = EXT3_I(inode)->i_file_acl ? 51 (inode->i_sb->s_blocksize >> 9) : 0; 52 53 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 54} 55 56/* 57 * The ext3 forget function must perform a revoke if we are freeing data 58 * which has been journaled. Metadata (eg. indirect blocks) must be 59 * revoked in all cases. 60 * 61 * "bh" may be NULL: a metadata block may have been freed from memory 62 * but there may still be a record of it in the journal, and that record 63 * still needs to be revoked. 
64 */ 65int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, 66 struct buffer_head *bh, ext3_fsblk_t blocknr) 67{ 68 int err; 69 70 might_sleep(); 71 72 BUFFER_TRACE(bh, "enter"); 73 74 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 75 "data mode %lx\n", 76 bh, is_metadata, inode->i_mode, 77 test_opt(inode->i_sb, DATA_FLAGS)); 78 79 /* Never use the revoke function if we are doing full data 80 * journaling: there is no need to, and a V1 superblock won't 81 * support it. Otherwise, only skip the revoke on un-journaled 82 * data blocks. */ 83 84 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || 85 (!is_metadata && !ext3_should_journal_data(inode))) { 86 if (bh) { 87 BUFFER_TRACE(bh, "call journal_forget"); 88 return ext3_journal_forget(handle, bh); 89 } 90 return 0; 91 } 92 93 /* 94 * data!=journal && (is_metadata || should_journal_data(inode)) 95 */ 96 BUFFER_TRACE(bh, "call ext3_journal_revoke"); 97 err = ext3_journal_revoke(handle, blocknr, bh); 98 if (err) 99 ext3_abort(inode->i_sb, __FUNCTION__, 100 "error %d when attempting revoke", err); 101 BUFFER_TRACE(bh, "exit"); 102 return err; 103} 104 105/* 106 * Work out how many blocks we need to proceed with the next chunk of a 107 * truncate transaction. 108 */ 109static unsigned long blocks_for_truncate(struct inode *inode) 110{ 111 unsigned long needed; 112 113 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 114 115 /* Give ourselves just enough room to cope with inodes in which 116 * i_blocks is corrupt: we've seen disk corruptions in the past 117 * which resulted in random data in an inode which looked enough 118 * like a regular file for ext3 to try to delete it. Things 119 * will go a bit crazy if that happens, but at least we should 120 * try not to panic the whole kernel. */ 121 if (needed < 2) 122 needed = 2; 123 124 /* But we need to bound the transaction so we don't overflow the 125 * journal. 
*/ 126 if (needed > EXT3_MAX_TRANS_DATA) 127 needed = EXT3_MAX_TRANS_DATA; 128 129 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 130} 131 132/* 133 * Truncate transactions can be complex and absolutely huge. So we need to 134 * be able to restart the transaction at a conventient checkpoint to make 135 * sure we don't overflow the journal. 136 * 137 * start_transaction gets us a new handle for a truncate transaction, 138 * and extend_transaction tries to extend the existing one a bit. If 139 * extend fails, we need to propagate the failure up and restart the 140 * transaction in the top-level truncate loop. --sct 141 */ 142static handle_t *start_transaction(struct inode *inode) 143{ 144 handle_t *result; 145 146 result = ext3_journal_start(inode, blocks_for_truncate(inode)); 147 if (!IS_ERR(result)) 148 return result; 149 150 ext3_std_error(inode->i_sb, PTR_ERR(result)); 151 return result; 152} 153 154/* 155 * Try to extend this transaction for the purposes of truncation. 156 * 157 * Returns 0 if we managed to create more room. If we can't create more 158 * room, and the transaction must be restarted we return 1. 159 */ 160static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 161{ 162 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) 163 return 0; 164 if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) 165 return 0; 166 return 1; 167} 168 169/* 170 * Restart the transaction associated with *handle. This does a commit, 171 * so before we call here everything must be consistently dirtied against 172 * this transaction. 173 */ 174static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) 175{ 176 jbd_debug(2, "restarting handle %p\n", handle); 177 return ext3_journal_restart(handle, blocks_for_truncate(inode)); 178} 179 180/* 181 * Called at the last iput() if i_nlink is zero. 
182 */ 183void ext3_delete_inode (struct inode * inode) 184{ 185 handle_t *handle; 186 187 truncate_inode_pages(&inode->i_data, 0); 188 189 if (is_bad_inode(inode)) 190 goto no_delete; 191 192 handle = start_transaction(inode); 193 if (IS_ERR(handle)) { 194 /* 195 * If we're going to skip the normal cleanup, we still need to 196 * make sure that the in-core orphan linked list is properly 197 * cleaned up. 198 */ 199 ext3_orphan_del(NULL, inode); 200 goto no_delete; 201 } 202 203 if (IS_SYNC(inode)) 204 handle->h_sync = 1; 205 inode->i_size = 0; 206 if (inode->i_blocks) 207 ext3_truncate(inode); 208 /* 209 * Kill off the orphan record which ext3_truncate created. 210 * AKPM: I think this can be inside the above `if'. 211 * Note that ext3_orphan_del() has to be able to cope with the 212 * deletion of a non-existent orphan - this is because we don't 213 * know if ext3_truncate() actually created an orphan record. 214 * (Well, we could do this if we need to, but heck - it works) 215 */ 216 ext3_orphan_del(handle, inode); 217 EXT3_I(inode)->i_dtime = get_seconds(); 218 219 /* 220 * One subtle ordering requirement: if anything has gone wrong 221 * (transaction abort, IO errors, whatever), then we can still 222 * do these next steps (the fs will already have been marked as 223 * having errors), but we can't free the inode if the mark_dirty 224 * fails. 225 */ 226 if (ext3_mark_inode_dirty(handle, inode)) 227 /* If that failed, just do the required in-core inode clear. */ 228 clear_inode(inode); 229 else 230 ext3_free_inode(handle, inode); 231 ext3_journal_stop(handle); 232 return; 233no_delete: 234 clear_inode(inode); /* We must guarantee clearing of inode... 
*/ 235} 236 237typedef struct { 238 __le32 *p; 239 __le32 key; 240 struct buffer_head *bh; 241} Indirect; 242 243static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 244{ 245 p->key = *(p->p = v); 246 p->bh = bh; 247} 248 249static int verify_chain(Indirect *from, Indirect *to) 250{ 251 while (from <= to && from->key == *from->p) 252 from++; 253 return (from > to); 254} 255 256/** 257 * ext3_block_to_path - parse the block number into array of offsets 258 * @inode: inode in question (we are only interested in its superblock) 259 * @i_block: block number to be parsed 260 * @offsets: array to store the offsets in 261 * @boundary: set this non-zero if the referred-to block is likely to be 262 * followed (on disk) by an indirect block. 263 * 264 * To store the locations of file's data ext3 uses a data structure common 265 * for UNIX filesystems - tree of pointers anchored in the inode, with 266 * data blocks at leaves and indirect blocks in intermediate nodes. 267 * This function translates the block number into path in that tree - 268 * return value is the path length and @offsets[n] is the offset of 269 * pointer to (n+1)th node in the nth one. If @block is out of range 270 * (negative or too large) warning is printed and zero returned. 271 * 272 * Note: function doesn't find node addresses, so no IO is needed. All 273 * we need to know is the capacity of indirect blocks (taken from the 274 * inode->i_sb). 275 */ 276 277/* 278 * Portability note: the last comparison (check that we fit into triple 279 * indirect block) is spelled differently, because otherwise on an 280 * architecture with 32-bit longs and 8Kb pages we might get into trouble 281 * if our filesystem had 8Kb blocks. We might use long long, but that would 282 * kill us on x86. Oh, well, at least the sign propagation does not matter - 283 * i_block would have to be negative in the very beginning, so we would not 284 * get there at all. 
285 */ 286 287static int ext3_block_to_path(struct inode *inode, 288 long i_block, int offsets[4], int *boundary) 289{ 290 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); 291 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); 292 const long direct_blocks = EXT3_NDIR_BLOCKS, 293 indirect_blocks = ptrs, 294 double_blocks = (1 << (ptrs_bits * 2)); 295 int n = 0; 296 int final = 0; 297 298 if (i_block < 0) { 299 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); 300 } else if (i_block < direct_blocks) { 301 offsets[n++] = i_block; 302 final = direct_blocks; 303 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 304 offsets[n++] = EXT3_IND_BLOCK; 305 offsets[n++] = i_block; 306 final = ptrs; 307 } else if ((i_block -= indirect_blocks) < double_blocks) { 308 offsets[n++] = EXT3_DIND_BLOCK; 309 offsets[n++] = i_block >> ptrs_bits; 310 offsets[n++] = i_block & (ptrs - 1); 311 final = ptrs; 312 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 313 offsets[n++] = EXT3_TIND_BLOCK; 314 offsets[n++] = i_block >> (ptrs_bits * 2); 315 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 316 offsets[n++] = i_block & (ptrs - 1); 317 final = ptrs; 318 } else { 319 ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); 320 } 321 if (boundary) 322 *boundary = final - 1 - (i_block & (ptrs - 1)); 323 return n; 324} 325 326/** 327 * ext3_get_branch - read the chain of indirect blocks leading to data 328 * @inode: inode in question 329 * @depth: depth of the chain (1 - direct pointer, etc.) 330 * @offsets: offsets of pointers in inode/indirect blocks 331 * @chain: place to store the result 332 * @err: here we store the error value 333 * 334 * Function fills the array of triples <key, p, bh> and returns %NULL 335 * if everything went OK or the pointer to the last filled triple 336 * (incomplete one) otherwise. Upon the return chain[i].key contains 337 * the number of (i+1)-th block in the chain (as it is stored in memory, 338 * i.e. 
little-endian 32-bit), chain[i].p contains the address of that 339 * number (it points into struct inode for i==0 and into the bh->b_data 340 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 341 * block for i>0 and NULL for i==0. In other words, it holds the block 342 * numbers of the chain, addresses they were taken from (and where we can 343 * verify that chain did not change) and buffer_heads hosting these 344 * numbers. 345 * 346 * Function stops when it stumbles upon zero pointer (absent block) 347 * (pointer to last triple returned, *@err == 0) 348 * or when it gets an IO error reading an indirect block 349 * (ditto, *@err == -EIO) 350 * or when it notices that chain had been changed while it was reading 351 * (ditto, *@err == -EAGAIN) 352 * or when it reads all @depth-1 indirect blocks successfully and finds 353 * the whole chain, all way to the data (returns %NULL, *err == 0). 354 */ 355static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, 356 Indirect chain[4], int *err) 357{ 358 struct super_block *sb = inode->i_sb; 359 Indirect *p = chain; 360 struct buffer_head *bh; 361 362 *err = 0; 363 /* i_data is not going away, no lock needed */ 364 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); 365 if (!p->key) 366 goto no_block; 367 while (--depth) { 368 bh = sb_bread(sb, le32_to_cpu(p->key)); 369 if (!bh) 370 goto failure; 371 /* Reader: pointers */ 372 if (!verify_chain(chain, p)) 373 goto changed; 374 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 375 /* Reader: end */ 376 if (!p->key) 377 goto no_block; 378 } 379 return NULL; 380 381changed: 382 brelse(bh); 383 *err = -EAGAIN; 384 goto no_block; 385failure: 386 *err = -EIO; 387no_block: 388 return p; 389} 390 391/** 392 * ext3_find_near - find a place for allocation with sufficient locality 393 * @inode: owner 394 * @ind: descriptor of indirect block. 395 * 396 * This function returns the prefered place for block allocation. 
397 * It is used when heuristic for sequential allocation fails. 398 * Rules are: 399 * + if there is a block to the left of our position - allocate near it. 400 * + if pointer will live in indirect block - allocate near that block. 401 * + if pointer will live in inode - allocate in the same 402 * cylinder group. 403 * 404 * In the latter case we colour the starting block by the callers PID to 405 * prevent it from clashing with concurrent allocations for a different inode 406 * in the same block group. The PID is used here so that functionally related 407 * files will be close-by on-disk. 408 * 409 * Caller must make sure that @ind is valid and will stay that way. 410 */ 411static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) 412{ 413 struct ext3_inode_info *ei = EXT3_I(inode); 414 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 415 __le32 *p; 416 ext3_fsblk_t bg_start; 417 ext3_grpblk_t colour; 418 419 /* Try to find previous block */ 420 for (p = ind->p - 1; p >= start; p--) { 421 if (*p) 422 return le32_to_cpu(*p); 423 } 424 425 /* No such thing, so let's try location of indirect block */ 426 if (ind->bh) 427 return ind->bh->b_blocknr; 428 429 /* 430 * It is going to be referred to from the inode itself? OK, just put it 431 * into the same cylinder group then. 432 */ 433 bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); 434 colour = (current->pid % 16) * 435 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); 436 return bg_start + colour; 437} 438 439/** 440 * ext3_find_goal - find a prefered place for allocation. 441 * @inode: owner 442 * @block: block we want 443 * @chain: chain of indirect blocks 444 * @partial: pointer to the last triple within a chain 445 * @goal: place to store the result. 446 * 447 * Normally this function find the prefered place for block allocation, 448 * stores it in *@goal and returns zero. 
449 */ 450 451static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, 452 Indirect chain[4], Indirect *partial) 453{ 454 struct ext3_block_alloc_info *block_i; 455 456 block_i = EXT3_I(inode)->i_block_alloc_info; 457 458 /* 459 * try the heuristic for sequential allocation, 460 * failing that at least try to get decent locality. 461 */ 462 if (block_i && (block == block_i->last_alloc_logical_block + 1) 463 && (block_i->last_alloc_physical_block != 0)) { 464 return block_i->last_alloc_physical_block + 1; 465 } 466 467 return ext3_find_near(inode, partial); 468} 469 470/** 471 * ext3_blks_to_allocate: Look up the block map and count the number 472 * of direct blocks need to be allocated for the given branch. 473 * 474 * @branch: chain of indirect blocks 475 * @k: number of blocks need for indirect blocks 476 * @blks: number of data blocks to be mapped. 477 * @blocks_to_boundary: the offset in the indirect block 478 * 479 * return the total number of blocks to be allocate, including the 480 * direct and indirect blocks. 
481 */ 482static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 483 int blocks_to_boundary) 484{ 485 unsigned long count = 0; 486 487 /* 488 * Simple case, [t,d]Indirect block(s) has not allocated yet 489 * then it's clear blocks on that path have not allocated 490 */ 491 if (k > 0) { 492 /* right now we don't handle cross boundary allocation */ 493 if (blks < blocks_to_boundary + 1) 494 count += blks; 495 else 496 count += blocks_to_boundary + 1; 497 return count; 498 } 499 500 count++; 501 while (count < blks && count <= blocks_to_boundary && 502 le32_to_cpu(*(branch[0].p + count)) == 0) { 503 count++; 504 } 505 return count; 506} 507 508/** 509 * ext3_alloc_blocks: multiple allocate blocks needed for a branch 510 * @indirect_blks: the number of blocks need to allocate for indirect 511 * blocks 512 * 513 * @new_blocks: on return it will store the new block numbers for 514 * the indirect blocks(if needed) and the first direct block, 515 * @blks: on return it will store the total number of allocated 516 * direct blocks 517 */ 518static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, 519 ext3_fsblk_t goal, int indirect_blks, int blks, 520 ext3_fsblk_t new_blocks[4], int *err) 521{ 522 int target, i; 523 unsigned long count = 0; 524 int index = 0; 525 ext3_fsblk_t current_block = 0; 526 int ret = 0; 527 528 /* 529 * Here we try to allocate the requested multiple blocks at once, 530 * on a best-effort basis. 531 * To build a branch, we should allocate blocks for 532 * the indirect blocks(if not allocated yet), and at least 533 * the first direct block of this branch. 
That's the 534 * minimum number of blocks need to allocate(required) 535 */ 536 target = blks + indirect_blks; 537 538 while (1) { 539 count = target; 540 /* allocating blocks for indirect blocks and direct blocks */ 541 current_block = ext3_new_blocks(handle,inode,goal,&count,err); 542 if (*err) 543 goto failed_out; 544 545 target -= count; 546 /* allocate blocks for indirect blocks */ 547 while (index < indirect_blks && count) { 548 new_blocks[index++] = current_block++; 549 count--; 550 } 551 552 if (count > 0) 553 break; 554 } 555 556 /* save the new block number for the first direct block */ 557 new_blocks[index] = current_block; 558 559 /* total number of blocks allocated for direct blocks */ 560 ret = count; 561 *err = 0; 562 return ret; 563failed_out: 564 for (i = 0; i <index; i++) 565 ext3_free_blocks(handle, inode, new_blocks[i], 1); 566 return ret; 567} 568 569/** 570 * ext3_alloc_branch - allocate and set up a chain of blocks. 571 * @inode: owner 572 * @indirect_blks: number of allocated indirect blocks 573 * @blks: number of allocated direct blocks 574 * @offsets: offsets (in the blocks) to store the pointers to next. 575 * @branch: place to store the chain in. 576 * 577 * This function allocates blocks, zeroes out all but the last one, 578 * links them into chain and (if we are synchronous) writes them to disk. 579 * In other words, it prepares a branch that can be spliced onto the 580 * inode. It stores the information about that chain in the branch[], in 581 * the same format as ext3_get_branch() would do. We are calling it after 582 * we had read the existing part of chain and partial points to the last 583 * triple of that (one with zero ->key). Upon the exit we have the same 584 * picture as after the successful ext3_get_block(), except that in one 585 * place chain is disconnected - *branch->p is still zero (we did not 586 * set the last link), but branch->key contains the number that should 587 * be placed into *branch->p to fill that gap. 
588 * 589 * If allocation fails we free all blocks we've allocated (and forget 590 * their buffer_heads) and return the error value the from failed 591 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain 592 * as described above and return 0. 593 */ 594static int ext3_alloc_branch(handle_t *handle, struct inode *inode, 595 int indirect_blks, int *blks, ext3_fsblk_t goal, 596 int *offsets, Indirect *branch) 597{ 598 int blocksize = inode->i_sb->s_blocksize; 599 int i, n = 0; 600 int err = 0; 601 struct buffer_head *bh; 602 int num; 603 ext3_fsblk_t new_blocks[4]; 604 ext3_fsblk_t current_block; 605 606 num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, 607 *blks, new_blocks, &err); 608 if (err) 609 return err; 610 611 branch[0].key = cpu_to_le32(new_blocks[0]); 612 /* 613 * metadata blocks and data blocks are allocated. 614 */ 615 for (n = 1; n <= indirect_blks; n++) { 616 /* 617 * Get buffer_head for parent block, zero it out 618 * and set the pointer to new one, then send 619 * parent to disk. 
620 */ 621 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 622 branch[n].bh = bh; 623 lock_buffer(bh); 624 BUFFER_TRACE(bh, "call get_create_access"); 625 err = ext3_journal_get_create_access(handle, bh); 626 if (err) { 627 unlock_buffer(bh); 628 brelse(bh); 629 goto failed; 630 } 631 632 memset(bh->b_data, 0, blocksize); 633 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 634 branch[n].key = cpu_to_le32(new_blocks[n]); 635 *branch[n].p = branch[n].key; 636 if ( n == indirect_blks) { 637 current_block = new_blocks[n]; 638 /* 639 * End of chain, update the last new metablock of 640 * the chain to point to the new allocated 641 * data blocks numbers 642 */ 643 for (i=1; i < num; i++) 644 *(branch[n].p + i) = cpu_to_le32(++current_block); 645 } 646 BUFFER_TRACE(bh, "marking uptodate"); 647 set_buffer_uptodate(bh); 648 unlock_buffer(bh); 649 650 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 651 err = ext3_journal_dirty_metadata(handle, bh); 652 if (err) 653 goto failed; 654 } 655 *blks = num; 656 return err; 657failed: 658 /* Allocation failed, free what we already allocated */ 659 for (i = 1; i <= n ; i++) { 660 BUFFER_TRACE(branch[i].bh, "call journal_forget"); 661 ext3_journal_forget(handle, branch[i].bh); 662 } 663 for (i = 0; i <indirect_blks; i++) 664 ext3_free_blocks(handle, inode, new_blocks[i], 1); 665 666 ext3_free_blocks(handle, inode, new_blocks[i], num); 667 668 return err; 669} 670 671/** 672 * ext3_splice_branch - splice the allocated branch onto inode. 673 * @inode: owner 674 * @block: (logical) number of block we are adding 675 * @chain: chain of indirect blocks (with a missing link - see 676 * ext3_alloc_branch) 677 * @where: location of missing link 678 * @num: number of indirect blocks we are adding 679 * @blks: number of direct blocks we are adding 680 * 681 * This function fills the missing link and does all housekeeping needed in 682 * inode (->i_blocks, etc.). 
In case of success we end up with the full 683 * chain to new block and return 0. 684 */ 685static int ext3_splice_branch(handle_t *handle, struct inode *inode, 686 long block, Indirect *where, int num, int blks) 687{ 688 int i; 689 int err = 0; 690 struct ext3_block_alloc_info *block_i; 691 ext3_fsblk_t current_block; 692 693 block_i = EXT3_I(inode)->i_block_alloc_info; 694 /* 695 * If we're splicing into a [td]indirect block (as opposed to the 696 * inode) then we need to get write access to the [td]indirect block 697 * before the splice. 698 */ 699 if (where->bh) { 700 BUFFER_TRACE(where->bh, "get_write_access"); 701 err = ext3_journal_get_write_access(handle, where->bh); 702 if (err) 703 goto err_out; 704 } 705 /* That's it */ 706 707 *where->p = where->key; 708 709 /* 710 * Update the host buffer_head or inode to point to more just allocated 711 * direct blocks blocks 712 */ 713 if (num == 0 && blks > 1) { 714 current_block = le32_to_cpu(where->key) + 1; 715 for (i = 1; i < blks; i++) 716 *(where->p + i ) = cpu_to_le32(current_block++); 717 } 718 719 /* 720 * update the most recently allocated logical & physical block 721 * in i_block_alloc_info, to assist find the proper goal block for next 722 * allocation 723 */ 724 if (block_i) { 725 block_i->last_alloc_logical_block = block + blks - 1; 726 block_i->last_alloc_physical_block = 727 le32_to_cpu(where[num].key) + blks - 1; 728 } 729 730 /* We are done with atomic stuff, now do the rest of housekeeping */ 731 732 inode->i_ctime = CURRENT_TIME_SEC; 733 ext3_mark_inode_dirty(handle, inode); 734 735 /* had we spliced it onto indirect block? */ 736 if (where->bh) { 737 /* 738 * If we spliced it onto an indirect block, we haven't 739 * altered the inode. Note however that if it is being spliced 740 * onto an indirect block at the very end of the file (the 741 * file is growing) then we *will* alter the inode to reflect 742 * the new i_size. 
But that is not done here - it is done in 743 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. 744 */ 745 jbd_debug(5, "splicing indirect only\n"); 746 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); 747 err = ext3_journal_dirty_metadata(handle, where->bh); 748 if (err) 749 goto err_out; 750 } else { 751 /* 752 * OK, we spliced it into the inode itself on a direct block. 753 * Inode was dirtied above. 754 */ 755 jbd_debug(5, "splicing direct\n"); 756 } 757 return err; 758 759err_out: 760 for (i = 1; i <= num; i++) { 761 BUFFER_TRACE(where[i].bh, "call journal_forget"); 762 ext3_journal_forget(handle, where[i].bh); 763 ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); 764 } 765 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); 766 767 return err; 768} 769 770/* 771 * Allocation strategy is simple: if we have to allocate something, we will 772 * have to go the whole way to leaf. So let's do it before attaching anything 773 * to tree, set linkage between the newborn blocks, write them if sync is 774 * required, recheck the path, free and repeat if check fails, otherwise 775 * set the last missing link (that will protect us from any truncate-generated 776 * removals - all blocks on the path are immune now) and possibly force the 777 * write on the parent block. 778 * That has a nice additional property: no special recovery from the failed 779 * allocations is needed - we simply release blocks and do not touch anything 780 * reachable from inode. 781 * 782 * `handle' can be NULL if create == 0. 783 * 784 * The BKL may not be held on entry here. Be sure to take it early. 785 * return > 0, # of blocks mapped or allocated. 786 * return = 0, if plain lookup failed. 787 * return < 0, error case. 
788 */ 789int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, 790 sector_t iblock, unsigned long maxblocks, 791 struct buffer_head *bh_result, 792 int create, int extend_disksize) 793{ 794 int err = -EIO; 795 int offsets[4]; 796 Indirect chain[4]; 797 Indirect *partial; 798 ext3_fsblk_t goal; 799 int indirect_blks; 800 int blocks_to_boundary = 0; 801 int depth; 802 struct ext3_inode_info *ei = EXT3_I(inode); 803 int count = 0; 804 ext3_fsblk_t first_block = 0; 805 806 807 J_ASSERT(handle != NULL || create == 0); 808 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 809 810 if (depth == 0) 811 goto out; 812 813 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 814 815 /* Simplest case - block found, no allocation needed */ 816 if (!partial) { 817 first_block = le32_to_cpu(chain[depth - 1].key); 818 clear_buffer_new(bh_result); 819 count++; 820 /*map more blocks*/ 821 while (count < maxblocks && count <= blocks_to_boundary) { 822 ext3_fsblk_t blk; 823 824 if (!verify_chain(chain, partial)) { 825 /* 826 * Indirect block might be removed by 827 * truncate while we were reading it. 828 * Handling of that case: forget what we've 829 * got now. Flag the err as EAGAIN, so it 830 * will reread. 
831 */ 832 err = -EAGAIN; 833 count = 0; 834 break; 835 } 836 blk = le32_to_cpu(*(chain[depth-1].p + count)); 837 838 if (blk == first_block + count) 839 count++; 840 else 841 break; 842 } 843 if (err != -EAGAIN) 844 goto got_it; 845 } 846 847 /* Next simple case - plain lookup or failed read of indirect block */ 848 if (!create || err == -EIO) 849 goto cleanup; 850 851 mutex_lock(&ei->truncate_mutex); 852 853 /* 854 * If the indirect block is missing while we are reading 855 * the chain(ext3_get_branch() returns -EAGAIN err), or 856 * if the chain has been changed after we grab the semaphore, 857 * (either because another process truncated this branch, or 858 * another get_block allocated this branch) re-grab the chain to see if 859 * the request block has been allocated or not. 860 * 861 * Since we already block the truncate/other get_block 862 * at this point, we will have the current copy of the chain when we 863 * splice the branch into the tree. 864 */ 865 if (err == -EAGAIN || !verify_chain(chain, partial)) { 866 while (partial > chain) { 867 brelse(partial->bh); 868 partial--; 869 } 870 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 871 if (!partial) { 872 count++; 873 mutex_unlock(&ei->truncate_mutex); 874 if (err) 875 goto cleanup; 876 clear_buffer_new(bh_result); 877 goto got_it; 878 } 879 } 880 881 /* 882 * Okay, we need to do block allocation. Lazily initialize the block 883 * allocation info here if necessary 884 */ 885 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 886 ext3_init_block_alloc_info(inode); 887 888 goal = ext3_find_goal(inode, iblock, chain, partial); 889 890 /* the number of blocks need to allocate for [d,t]indirect blocks */ 891 indirect_blks = (chain + depth) - partial - 1; 892 893 /* 894 * Next look up the indirect map to count the totoal number of 895 * direct blocks to allocate for this branch. 
896 */ 897 count = ext3_blks_to_allocate(partial, indirect_blks, 898 maxblocks, blocks_to_boundary); 899 /* 900 * Block out ext3_truncate while we alter the tree 901 */ 902 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, 903 offsets + (partial - chain), partial); 904 905 /* 906 * The ext3_splice_branch call will free and forget any buffers 907 * on the new chain if there is a failure, but that risks using 908 * up transaction credits, especially for bitmaps where the 909 * credits cannot be returned. Can we handle this somehow? We 910 * may need to return -EAGAIN upwards in the worst case. --sct 911 */ 912 if (!err) 913 err = ext3_splice_branch(handle, inode, iblock, 914 partial, indirect_blks, count); 915 /* 916 * i_disksize growing is protected by truncate_mutex. Don't forget to 917 * protect it if you're about to implement concurrent 918 * ext3_get_block() -bzzz 919 */ 920 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 921 ei->i_disksize = inode->i_size; 922 mutex_unlock(&ei->truncate_mutex); 923 if (err) 924 goto cleanup; 925 926 set_buffer_new(bh_result); 927got_it: 928 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 929 if (count > blocks_to_boundary) 930 set_buffer_boundary(bh_result); 931 err = count; 932 /* Clean up and exit */ 933 partial = chain + depth - 1; /* the whole chain */ 934cleanup: 935 while (partial > chain) { 936 BUFFER_TRACE(partial->bh, "call brelse"); 937 brelse(partial->bh); 938 partial--; 939 } 940 BUFFER_TRACE(bh_result, "returned"); 941out: 942 return err; 943} 944 945#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) 946 947static int ext3_get_block(struct inode *inode, sector_t iblock, 948 struct buffer_head *bh_result, int create) 949{ 950 handle_t *handle = journal_current_handle(); 951 int ret = 0; 952 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 953 954 if (!create) 955 goto get_block; /* A read */ 956 957 if (max_blocks == 1) 958 goto get_block; /* A single 
block get */ 959 960 if (handle->h_transaction->t_state == T_LOCKED) { 961 /* 962 * Huge direct-io writes can hold off commits for long 963 * periods of time. Let this commit run. 964 */ 965 ext3_journal_stop(handle); 966 handle = ext3_journal_start(inode, DIO_CREDITS); 967 if (IS_ERR(handle)) 968 ret = PTR_ERR(handle); 969 goto get_block; 970 } 971 972 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) { 973 /* 974 * Getting low on buffer credits... 975 */ 976 ret = ext3_journal_extend(handle, DIO_CREDITS); 977 if (ret > 0) { 978 /* 979 * Couldn't extend the transaction. Start a new one. 980 */ 981 ret = ext3_journal_restart(handle, DIO_CREDITS); 982 } 983 } 984 985get_block: 986 if (ret == 0) { 987 ret = ext3_get_blocks_handle(handle, inode, iblock, 988 max_blocks, bh_result, create, 0); 989 if (ret > 0) { 990 bh_result->b_size = (ret << inode->i_blkbits); 991 ret = 0; 992 } 993 } 994 return ret; 995} 996 997/* 998 * `handle' can be NULL if create is zero 999 */ 1000struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, 1001 long block, int create, int *errp) 1002{ 1003 struct buffer_head dummy; 1004 int fatal = 0, err; 1005 1006 J_ASSERT(handle != NULL || create == 0); 1007 1008 dummy.b_state = 0; 1009 dummy.b_blocknr = -1000; 1010 buffer_trace_init(&dummy.b_history); 1011 err = ext3_get_blocks_handle(handle, inode, block, 1, 1012 &dummy, create, 1); 1013 /* 1014 * ext3_get_blocks_handle() returns number of blocks 1015 * mapped. 0 in case of a HOLE. 
1016 */ 1017 if (err > 0) { 1018 if (err > 1) 1019 WARN_ON(1); 1020 err = 0; 1021 } 1022 *errp = err; 1023 if (!err && buffer_mapped(&dummy)) { 1024 struct buffer_head *bh; 1025 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1026 if (!bh) { 1027 *errp = -EIO; 1028 goto err; 1029 } 1030 if (buffer_new(&dummy)) { 1031 J_ASSERT(create != 0); 1032 J_ASSERT(handle != 0); 1033 1034 /* 1035 * Now that we do not always journal data, we should 1036 * keep in mind whether this should always journal the 1037 * new buffer as metadata. For now, regular file 1038 * writes use ext3_get_block instead, so it's not a 1039 * problem. 1040 */ 1041 lock_buffer(bh); 1042 BUFFER_TRACE(bh, "call get_create_access"); 1043 fatal = ext3_journal_get_create_access(handle, bh); 1044 if (!fatal && !buffer_uptodate(bh)) { 1045 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1046 set_buffer_uptodate(bh); 1047 } 1048 unlock_buffer(bh); 1049 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1050 err = ext3_journal_dirty_metadata(handle, bh); 1051 if (!fatal) 1052 fatal = err; 1053 } else { 1054 BUFFER_TRACE(bh, "not a new buffer"); 1055 } 1056 if (fatal) { 1057 *errp = fatal; 1058 brelse(bh); 1059 bh = NULL; 1060 } 1061 return bh; 1062 } 1063err: 1064 return NULL; 1065} 1066 1067struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, 1068 int block, int create, int *err) 1069{ 1070 struct buffer_head * bh; 1071 1072 bh = ext3_getblk(handle, inode, block, create, err); 1073 if (!bh) 1074 return bh; 1075 if (buffer_uptodate(bh)) 1076 return bh; 1077 ll_rw_block(READ_META, 1, &bh); 1078 wait_on_buffer(bh); 1079 if (buffer_uptodate(bh)) 1080 return bh; 1081 put_bh(bh); 1082 *err = -EIO; 1083 return NULL; 1084} 1085 1086static int walk_page_buffers( handle_t *handle, 1087 struct buffer_head *head, 1088 unsigned from, 1089 unsigned to, 1090 int *partial, 1091 int (*fn)( handle_t *handle, 1092 struct buffer_head *bh)) 1093{ 1094 struct buffer_head *bh; 1095 unsigned block_start, block_end; 
1096 unsigned blocksize = head->b_size; 1097 int err, ret = 0; 1098 struct buffer_head *next; 1099 1100 for ( bh = head, block_start = 0; 1101 ret == 0 && (bh != head || !block_start); 1102 block_start = block_end, bh = next) 1103 { 1104 next = bh->b_this_page; 1105 block_end = block_start + blocksize; 1106 if (block_end <= from || block_start >= to) { 1107 if (partial && !buffer_uptodate(bh)) 1108 *partial = 1; 1109 continue; 1110 } 1111 err = (*fn)(handle, bh); 1112 if (!ret) 1113 ret = err; 1114 } 1115 return ret; 1116} 1117 1118/* 1119 * To preserve ordering, it is essential that the hole instantiation and 1120 * the data write be encapsulated in a single transaction. We cannot 1121 * close off a transaction and start a new one between the ext3_get_block() 1122 * and the commit_write(). So doing the journal_start at the start of 1123 * prepare_write() is the right place. 1124 * 1125 * Also, this function can nest inside ext3_writepage() -> 1126 * block_write_full_page(). In that case, we *know* that ext3_writepage() 1127 * has generated enough buffer credits to do the whole page. So we won't 1128 * block on the journal in that case, which is good, because the caller may 1129 * be PF_MEMALLOC. 1130 * 1131 * By accident, ext3 can be reentered when a transaction is open via 1132 * quota file writes. If we were to commit the transaction while thus 1133 * reentered, there can be a deadlock - we would be holding a quota 1134 * lock, and the commit would never complete if another thread had a 1135 * transaction open and was blocking on the quota lock - a ranking 1136 * violation. 1137 * 1138 * So what we do is to rely on the fact that journal_stop/journal_start 1139 * will _not_ run commit under these circumstances because handle->h_ref 1140 * is elevated. We'll still have enough credits for the tiny quotafile 1141 * write. 
1142 */ 1143static int do_journal_get_write_access(handle_t *handle, 1144 struct buffer_head *bh) 1145{ 1146 if (!buffer_mapped(bh) || buffer_freed(bh)) 1147 return 0; 1148 return ext3_journal_get_write_access(handle, bh); 1149} 1150 1151static int ext3_prepare_write(struct file *file, struct page *page, 1152 unsigned from, unsigned to) 1153{ 1154 struct inode *inode = page->mapping->host; 1155 int ret, needed_blocks = ext3_writepage_trans_blocks(inode); 1156 handle_t *handle; 1157 int retries = 0; 1158 1159retry: 1160 handle = ext3_journal_start(inode, needed_blocks); 1161 if (IS_ERR(handle)) { 1162 ret = PTR_ERR(handle); 1163 goto out; 1164 } 1165 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1166 ret = nobh_prepare_write(page, from, to, ext3_get_block); 1167 else 1168 ret = block_prepare_write(page, from, to, ext3_get_block); 1169 if (ret) 1170 goto prepare_write_failed; 1171 1172 if (ext3_should_journal_data(inode)) { 1173 ret = walk_page_buffers(handle, page_buffers(page), 1174 from, to, NULL, do_journal_get_write_access); 1175 } 1176prepare_write_failed: 1177 if (ret) 1178 ext3_journal_stop(handle); 1179 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1180 goto retry; 1181out: 1182 return ret; 1183} 1184 1185int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) 1186{ 1187 int err = journal_dirty_data(handle, bh); 1188 if (err) 1189 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, 1190 bh, handle,err); 1191 return err; 1192} 1193 1194/* For commit_write() in data=journal mode */ 1195static int commit_write_fn(handle_t *handle, struct buffer_head *bh) 1196{ 1197 if (!buffer_mapped(bh) || buffer_freed(bh)) 1198 return 0; 1199 set_buffer_uptodate(bh); 1200 return ext3_journal_dirty_metadata(handle, bh); 1201} 1202 1203/* 1204 * We need to pick up the new inode size which generic_commit_write gave us 1205 * `file' can be NULL - eg, when called from page_symlink(). 
1206 * 1207 * ext3 never places buffers on inode->i_mapping->private_list. metadata 1208 * buffers are managed internally. 1209 */ 1210static int ext3_ordered_commit_write(struct file *file, struct page *page, 1211 unsigned from, unsigned to) 1212{ 1213 handle_t *handle = ext3_journal_current_handle(); 1214 struct inode *inode = page->mapping->host; 1215 int ret = 0, ret2; 1216 1217 ret = walk_page_buffers(handle, page_buffers(page), 1218 from, to, NULL, ext3_journal_dirty_data); 1219 1220 if (ret == 0) { 1221 /* 1222 * generic_commit_write() will run mark_inode_dirty() if i_size 1223 * changes. So let's piggyback the i_disksize mark_inode_dirty 1224 * into that. 1225 */ 1226 loff_t new_i_size; 1227 1228 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 1229 if (new_i_size > EXT3_I(inode)->i_disksize) 1230 EXT3_I(inode)->i_disksize = new_i_size; 1231 ret = generic_commit_write(file, page, from, to); 1232 } 1233 ret2 = ext3_journal_stop(handle); 1234 if (!ret) 1235 ret = ret2; 1236 return ret; 1237} 1238 1239static int ext3_writeback_commit_write(struct file *file, struct page *page, 1240 unsigned from, unsigned to) 1241{ 1242 handle_t *handle = ext3_journal_current_handle(); 1243 struct inode *inode = page->mapping->host; 1244 int ret = 0, ret2; 1245 loff_t new_i_size; 1246 1247 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 1248 if (new_i_size > EXT3_I(inode)->i_disksize) 1249 EXT3_I(inode)->i_disksize = new_i_size; 1250 1251 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1252 ret = nobh_commit_write(file, page, from, to); 1253 else 1254 ret = generic_commit_write(file, page, from, to); 1255 1256 ret2 = ext3_journal_stop(handle); 1257 if (!ret) 1258 ret = ret2; 1259 return ret; 1260} 1261 1262static int ext3_journalled_commit_write(struct file *file, 1263 struct page *page, unsigned from, unsigned to) 1264{ 1265 handle_t *handle = ext3_journal_current_handle(); 1266 struct inode *inode = page->mapping->host; 1267 int 
ret = 0, ret2; 1268 int partial = 0; 1269 loff_t pos; 1270 1271 /* 1272 * Here we duplicate the generic_commit_write() functionality 1273 */ 1274 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 1275 1276 ret = walk_page_buffers(handle, page_buffers(page), from, 1277 to, &partial, commit_write_fn); 1278 if (!partial) 1279 SetPageUptodate(page); 1280 if (pos > inode->i_size) 1281 i_size_write(inode, pos); 1282 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1283 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1284 EXT3_I(inode)->i_disksize = inode->i_size; 1285 ret2 = ext3_mark_inode_dirty(handle, inode); 1286 if (!ret) 1287 ret = ret2; 1288 } 1289 ret2 = ext3_journal_stop(handle); 1290 if (!ret) 1291 ret = ret2; 1292 return ret; 1293} 1294 1295/* 1296 * bmap() is special. It gets used by applications such as lilo and by 1297 * the swapper to find the on-disk block of a specific piece of data. 1298 * 1299 * Naturally, this is dangerous if the block concerned is still in the 1300 * journal. If somebody makes a swapfile on an ext3 data-journaling 1301 * filesystem and enables swap, then they may get a nasty shock when the 1302 * data getting swapped to that swapfile suddenly gets overwritten by 1303 * the original zero's written out previously to the journal and 1304 * awaiting writeback in the kernel's buffer cache. 1305 * 1306 * So, if we see any bmap calls here on a modified, data-journaled file, 1307 * take extra steps to flush any blocks which might be in the cache. 1308 */ 1309static sector_t ext3_bmap(struct address_space *mapping, sector_t block) 1310{ 1311 struct inode *inode = mapping->host; 1312 journal_t *journal; 1313 int err; 1314 1315 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1316 /* 1317 * This is a REALLY heavyweight approach, but the use of 1318 * bmap on dirty files is expected to be extremely rare: 1319 * only if we run lilo or swapon on a freshly made file 1320 * do we expect this to happen. 
1321 * 1322 * (bmap requires CAP_SYS_RAWIO so this does not 1323 * represent an unprivileged user DOS attack --- we'd be 1324 * in trouble if mortal users could trigger this path at 1325 * will.) 1326 * 1327 * NB. EXT3_STATE_JDATA is not set on files other than 1328 * regular files. If somebody wants to bmap a directory 1329 * or symlink and gets confused because the buffer 1330 * hasn't yet been flushed to disk, they deserve 1331 * everything they get. 1332 */ 1333 1334 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; 1335 journal = EXT3_JOURNAL(inode); 1336 journal_lock_updates(journal); 1337 err = journal_flush(journal); 1338 journal_unlock_updates(journal); 1339 1340 if (err) 1341 return 0; 1342 } 1343 1344 return generic_block_bmap(mapping,block,ext3_get_block); 1345} 1346 1347static int bget_one(handle_t *handle, struct buffer_head *bh) 1348{ 1349 get_bh(bh); 1350 return 0; 1351} 1352 1353static int bput_one(handle_t *handle, struct buffer_head *bh) 1354{ 1355 put_bh(bh); 1356 return 0; 1357} 1358 1359static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1360{ 1361 if (buffer_mapped(bh)) 1362 return ext3_journal_dirty_data(handle, bh); 1363 return 0; 1364} 1365 1366/* 1367 * Note that we always start a transaction even if we're not journalling 1368 * data. This is to preserve ordering: any hole instantiation within 1369 * __block_write_full_page -> ext3_get_block() should be journalled 1370 * along with the data so we don't crash and then get metadata which 1371 * refers to old data. 1372 * 1373 * In all journalling modes block_write_full_page() will start the I/O. 1374 * 1375 * Problem: 1376 * 1377 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 1378 * ext3_writepage() 1379 * 1380 * Similar for: 1381 * 1382 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... 1383 * 1384 * Same applies to ext3_get_block(). We will deadlock on various things like 1385 * lock_journal and i_truncate_mutex. 
1386 * 1387 * Setting PF_MEMALLOC here doesn't work - too many internal memory 1388 * allocations fail. 1389 * 1390 * 16May01: If we're reentered then journal_current_handle() will be 1391 * non-zero. We simply *return*. 1392 * 1393 * 1 July 2001: @@@ FIXME: 1394 * In journalled data mode, a data buffer may be metadata against the 1395 * current transaction. But the same file is part of a shared mapping 1396 * and someone does a writepage() on it. 1397 * 1398 * We will move the buffer onto the async_data list, but *after* it has 1399 * been dirtied. So there's a small window where we have dirty data on 1400 * BJ_Metadata. 1401 * 1402 * Note that this only applies to the last partial page in the file. The 1403 * bit which block_write_full_page() uses prepare/commit for. (That's 1404 * broken code anyway: it's wrong for msync()). 1405 * 1406 * It's a rare case: affects the final partial page, for journalled data 1407 * where the file is subject to bith write() and writepage() in the same 1408 * transction. To fix it we'll need a custom block_write_full_page(). 1409 * We'll probably need that anyway for journalling writepage() output. 1410 * 1411 * We don't honour synchronous mounts for writepage(). That would be 1412 * disastrous. Any write() or metadata operation will sync the fs for 1413 * us. 1414 * 1415 * AKPM2: if all the page's buffers are mapped to disk and !data=journal, 1416 * we don't need to open a transaction here. 1417 */ 1418static int ext3_ordered_writepage(struct page *page, 1419 struct writeback_control *wbc) 1420{ 1421 struct inode *inode = page->mapping->host; 1422 struct buffer_head *page_bufs; 1423 handle_t *handle = NULL; 1424 int ret = 0; 1425 int err; 1426 1427 J_ASSERT(PageLocked(page)); 1428 1429 /* 1430 * We give up here if we're reentered, because it might be for a 1431 * different filesystem. 
1432 */ 1433 if (ext3_journal_current_handle()) 1434 goto out_fail; 1435 1436 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1437 1438 if (IS_ERR(handle)) { 1439 ret = PTR_ERR(handle); 1440 goto out_fail; 1441 } 1442 1443 if (!page_has_buffers(page)) { 1444 create_empty_buffers(page, inode->i_sb->s_blocksize, 1445 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1446 } 1447 page_bufs = page_buffers(page); 1448 walk_page_buffers(handle, page_bufs, 0, 1449 PAGE_CACHE_SIZE, NULL, bget_one); 1450 1451 ret = block_write_full_page(page, ext3_get_block, wbc); 1452 1453 /* 1454 * The page can become unlocked at any point now, and 1455 * truncate can then come in and change things. So we 1456 * can't touch *page from now on. But *page_bufs is 1457 * safe due to elevated refcount. 1458 */ 1459 1460 /* 1461 * And attach them to the current transaction. But only if 1462 * block_write_full_page() succeeded. Otherwise they are unmapped, 1463 * and generally junk. 1464 */ 1465 if (ret == 0) { 1466 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1467 NULL, journal_dirty_data_fn); 1468 if (!ret) 1469 ret = err; 1470 } 1471 walk_page_buffers(handle, page_bufs, 0, 1472 PAGE_CACHE_SIZE, NULL, bput_one); 1473 err = ext3_journal_stop(handle); 1474 if (!ret) 1475 ret = err; 1476 return ret; 1477 1478out_fail: 1479 redirty_page_for_writepage(wbc, page); 1480 unlock_page(page); 1481 return ret; 1482} 1483 1484static int ext3_writeback_writepage(struct page *page, 1485 struct writeback_control *wbc) 1486{ 1487 struct inode *inode = page->mapping->host; 1488 handle_t *handle = NULL; 1489 int ret = 0; 1490 int err; 1491 1492 if (ext3_journal_current_handle()) 1493 goto out_fail; 1494 1495 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1496 if (IS_ERR(handle)) { 1497 ret = PTR_ERR(handle); 1498 goto out_fail; 1499 } 1500 1501 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1502 ret = nobh_writepage(page, ext3_get_block, 
wbc); 1503 else 1504 ret = block_write_full_page(page, ext3_get_block, wbc); 1505 1506 err = ext3_journal_stop(handle); 1507 if (!ret) 1508 ret = err; 1509 return ret; 1510 1511out_fail: 1512 redirty_page_for_writepage(wbc, page); 1513 unlock_page(page); 1514 return ret; 1515} 1516 1517static int ext3_journalled_writepage(struct page *page, 1518 struct writeback_control *wbc) 1519{ 1520 struct inode *inode = page->mapping->host; 1521 handle_t *handle = NULL; 1522 int ret = 0; 1523 int err; 1524 1525 if (ext3_journal_current_handle()) 1526 goto no_write; 1527 1528 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1529 if (IS_ERR(handle)) { 1530 ret = PTR_ERR(handle); 1531 goto no_write; 1532 } 1533 1534 if (!page_has_buffers(page) || PageChecked(page)) { 1535 /* 1536 * It's mmapped pagecache. Add buffers and journal it. There 1537 * doesn't seem much point in redirtying the page here. 1538 */ 1539 ClearPageChecked(page); 1540 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1541 ext3_get_block); 1542 if (ret != 0) { 1543 ext3_journal_stop(handle); 1544 goto out_unlock; 1545 } 1546 ret = walk_page_buffers(handle, page_buffers(page), 0, 1547 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 1548 1549 err = walk_page_buffers(handle, page_buffers(page), 0, 1550 PAGE_CACHE_SIZE, NULL, commit_write_fn); 1551 if (ret == 0) 1552 ret = err; 1553 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1554 unlock_page(page); 1555 } else { 1556 /* 1557 * It may be a page full of checkpoint-mode buffers. We don't 1558 * really know unless we go poke around in the buffer_heads. 1559 * But block_write_full_page will do the right thing. 
1560 */ 1561 ret = block_write_full_page(page, ext3_get_block, wbc); 1562 } 1563 err = ext3_journal_stop(handle); 1564 if (!ret) 1565 ret = err; 1566out: 1567 return ret; 1568 1569no_write: 1570 redirty_page_for_writepage(wbc, page); 1571out_unlock: 1572 unlock_page(page); 1573 goto out; 1574} 1575 1576static int ext3_readpage(struct file *file, struct page *page) 1577{ 1578 return mpage_readpage(page, ext3_get_block); 1579} 1580 1581static int 1582ext3_readpages(struct file *file, struct address_space *mapping, 1583 struct list_head *pages, unsigned nr_pages) 1584{ 1585 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); 1586} 1587 1588static void ext3_invalidatepage(struct page *page, unsigned long offset) 1589{ 1590 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1591 1592 /* 1593 * If it's a full truncate we just forget about the pending dirtying 1594 */ 1595 if (offset == 0) 1596 ClearPageChecked(page); 1597 1598 journal_invalidatepage(journal, page, offset); 1599} 1600 1601static int ext3_releasepage(struct page *page, gfp_t wait) 1602{ 1603 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1604 1605 WARN_ON(PageChecked(page)); 1606 if (!page_has_buffers(page)) 1607 return 0; 1608 return journal_try_to_free_buffers(journal, page, wait); 1609} 1610 1611/* 1612 * If the O_DIRECT write will extend the file then add this inode to the 1613 * orphan list. So recovery will truncate it back to the original size 1614 * if the machine crashes during the write. 1615 * 1616 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1617 * crashes then stale disk data _may_ be exposed inside the file. 
1618 */ 1619static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, 1620 const struct iovec *iov, loff_t offset, 1621 unsigned long nr_segs) 1622{ 1623 struct file *file = iocb->ki_filp; 1624 struct inode *inode = file->f_mapping->host; 1625 struct ext3_inode_info *ei = EXT3_I(inode); 1626 handle_t *handle = NULL; 1627 ssize_t ret; 1628 int orphan = 0; 1629 size_t count = iov_length(iov, nr_segs); 1630 1631 if (rw == WRITE) { 1632 loff_t final_size = offset + count; 1633 1634 handle = ext3_journal_start(inode, DIO_CREDITS); 1635 if (IS_ERR(handle)) { 1636 ret = PTR_ERR(handle); 1637 goto out; 1638 } 1639 if (final_size > inode->i_size) { 1640 ret = ext3_orphan_add(handle, inode); 1641 if (ret) 1642 goto out_stop; 1643 orphan = 1; 1644 ei->i_disksize = inode->i_size; 1645 } 1646 } 1647 1648 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1649 offset, nr_segs, 1650 ext3_get_block, NULL); 1651 1652 /* 1653 * Reacquire the handle: ext3_get_block() can restart the transaction 1654 */ 1655 handle = journal_current_handle(); 1656 1657out_stop: 1658 if (handle) { 1659 int err; 1660 1661 if (orphan && inode->i_nlink) 1662 ext3_orphan_del(handle, inode); 1663 if (orphan && ret > 0) { 1664 loff_t end = offset + ret; 1665 if (end > inode->i_size) { 1666 ei->i_disksize = end; 1667 i_size_write(inode, end); 1668 /* 1669 * We're going to return a positive `ret' 1670 * here due to non-zero-length I/O, so there's 1671 * no way of reporting error returns from 1672 * ext3_mark_inode_dirty() to userspace. So 1673 * ignore it. 1674 */ 1675 ext3_mark_inode_dirty(handle, inode); 1676 } 1677 } 1678 err = ext3_journal_stop(handle); 1679 if (ret == 0) 1680 ret = err; 1681 } 1682out: 1683 return ret; 1684} 1685 1686/* 1687 * Pages can be marked dirty completely asynchronously from ext3's journalling 1688 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 1689 * much here because ->set_page_dirty is called under VFS locks. 
The page is 1690 * not necessarily locked. 1691 * 1692 * We cannot just dirty the page and leave attached buffers clean, because the 1693 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 1694 * or jbddirty because all the journalling code will explode. 1695 * 1696 * So what we do is to mark the page "pending dirty" and next time writepage 1697 * is called, propagate that into the buffers appropriately. 1698 */ 1699static int ext3_journalled_set_page_dirty(struct page *page) 1700{ 1701 SetPageChecked(page); 1702 return __set_page_dirty_nobuffers(page); 1703} 1704 1705static const struct address_space_operations ext3_ordered_aops = { 1706 .readpage = ext3_readpage, 1707 .readpages = ext3_readpages, 1708 .writepage = ext3_ordered_writepage, 1709 .sync_page = block_sync_page, 1710 .prepare_write = ext3_prepare_write, 1711 .commit_write = ext3_ordered_commit_write, 1712 .bmap = ext3_bmap, 1713 .invalidatepage = ext3_invalidatepage, 1714 .releasepage = ext3_releasepage, 1715 .direct_IO = ext3_direct_IO, 1716 .migratepage = buffer_migrate_page, 1717}; 1718 1719static const struct address_space_operations ext3_writeback_aops = { 1720 .readpage = ext3_readpage, 1721 .readpages = ext3_readpages, 1722 .writepage = ext3_writeback_writepage, 1723 .sync_page = block_sync_page, 1724 .prepare_write = ext3_prepare_write, 1725 .commit_write = ext3_writeback_commit_write, 1726 .bmap = ext3_bmap, 1727 .invalidatepage = ext3_invalidatepage, 1728 .releasepage = ext3_releasepage, 1729 .direct_IO = ext3_direct_IO, 1730 .migratepage = buffer_migrate_page, 1731}; 1732 1733static const struct address_space_operations ext3_journalled_aops = { 1734 .readpage = ext3_readpage, 1735 .readpages = ext3_readpages, 1736 .writepage = ext3_journalled_writepage, 1737 .sync_page = block_sync_page, 1738 .prepare_write = ext3_prepare_write, 1739 .commit_write = ext3_journalled_commit_write, 1740 .set_page_dirty = ext3_journalled_set_page_dirty, 1741 .bmap = ext3_bmap, 1742 
.invalidatepage = ext3_invalidatepage, 1743 .releasepage = ext3_releasepage, 1744}; 1745 1746void ext3_set_aops(struct inode *inode) 1747{ 1748 if (ext3_should_order_data(inode)) 1749 inode->i_mapping->a_ops = &ext3_ordered_aops; 1750 else if (ext3_should_writeback_data(inode)) 1751 inode->i_mapping->a_ops = &ext3_writeback_aops; 1752 else 1753 inode->i_mapping->a_ops = &ext3_journalled_aops; 1754} 1755 1756/* 1757 * ext3_block_truncate_page() zeroes out a mapping from file offset `from' 1758 * up to the end of the block which corresponds to `from'. 1759 * This required during truncate. We need to physically zero the tail end 1760 * of that block so it doesn't yield old data if the file is later grown. 1761 */ 1762static int ext3_block_truncate_page(handle_t *handle, struct page *page, 1763 struct address_space *mapping, loff_t from) 1764{ 1765 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1766 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1767 unsigned blocksize, iblock, length, pos; 1768 struct inode *inode = mapping->host; 1769 struct buffer_head *bh; 1770 int err = 0; 1771 void *kaddr; 1772 1773 blocksize = inode->i_sb->s_blocksize; 1774 length = blocksize - (offset & (blocksize - 1)); 1775 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1776 1777 /* 1778 * For "nobh" option, we can only work if we don't need to 1779 * read-in the page - otherwise we create buffers to do the IO. 
1780 */ 1781 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 1782 ext3_should_writeback_data(inode) && PageUptodate(page)) { 1783 kaddr = kmap_atomic(page, KM_USER0); 1784 memset(kaddr + offset, 0, length); 1785 flush_dcache_page(page); 1786 kunmap_atomic(kaddr, KM_USER0); 1787 set_page_dirty(page); 1788 goto unlock; 1789 } 1790 1791 if (!page_has_buffers(page)) 1792 create_empty_buffers(page, blocksize, 0); 1793 1794 /* Find the buffer that contains "offset" */ 1795 bh = page_buffers(page); 1796 pos = blocksize; 1797 while (offset >= pos) { 1798 bh = bh->b_this_page; 1799 iblock++; 1800 pos += blocksize; 1801 } 1802 1803 err = 0; 1804 if (buffer_freed(bh)) { 1805 BUFFER_TRACE(bh, "freed: skip"); 1806 goto unlock; 1807 } 1808 1809 if (!buffer_mapped(bh)) { 1810 BUFFER_TRACE(bh, "unmapped"); 1811 ext3_get_block(inode, iblock, bh, 0); 1812 /* unmapped? It's a hole - nothing to do */ 1813 if (!buffer_mapped(bh)) { 1814 BUFFER_TRACE(bh, "still unmapped"); 1815 goto unlock; 1816 } 1817 } 1818 1819 /* Ok, it's mapped. Make sure it's up-to-date */ 1820 if (PageUptodate(page)) 1821 set_buffer_uptodate(bh); 1822 1823 if (!buffer_uptodate(bh)) { 1824 err = -EIO; 1825 ll_rw_block(READ, 1, &bh); 1826 wait_on_buffer(bh); 1827 /* Uhhuh. Read error. Complain and punt. 
*/ 1828 if (!buffer_uptodate(bh)) 1829 goto unlock; 1830 } 1831 1832 if (ext3_should_journal_data(inode)) { 1833 BUFFER_TRACE(bh, "get write access"); 1834 err = ext3_journal_get_write_access(handle, bh); 1835 if (err) 1836 goto unlock; 1837 } 1838 1839 kaddr = kmap_atomic(page, KM_USER0); 1840 memset(kaddr + offset, 0, length); 1841 flush_dcache_page(page); 1842 kunmap_atomic(kaddr, KM_USER0); 1843 1844 BUFFER_TRACE(bh, "zeroed end of block"); 1845 1846 err = 0; 1847 if (ext3_should_journal_data(inode)) { 1848 err = ext3_journal_dirty_metadata(handle, bh); 1849 } else { 1850 if (ext3_should_order_data(inode)) 1851 err = ext3_journal_dirty_data(handle, bh); 1852 mark_buffer_dirty(bh); 1853 } 1854 1855unlock: 1856 unlock_page(page); 1857 page_cache_release(page); 1858 return err; 1859} 1860 1861/* 1862 * Probably it should be a library function... search for first non-zero word 1863 * or memcmp with zero_page, whatever is better for particular architecture. 1864 * Linus? 1865 */ 1866static inline int all_zeroes(__le32 *p, __le32 *q) 1867{ 1868 while (p < q) 1869 if (*p++) 1870 return 0; 1871 return 1; 1872} 1873 1874/** 1875 * ext3_find_shared - find the indirect blocks for partial truncation. 1876 * @inode: inode in question 1877 * @depth: depth of the affected branch 1878 * @offsets: offsets of pointers in that branch (see ext3_block_to_path) 1879 * @chain: place to store the pointers to partial indirect blocks 1880 * @top: place to the (detached) top of branch 1881 * 1882 * This is a helper function used by ext3_truncate(). 1883 * 1884 * When we do truncate() we may have to clean the ends of several 1885 * indirect blocks but leave the blocks themselves alive. Block is 1886 * partially truncated if some data below the new i_size is refered 1887 * from it (and it is on the path to the first completely truncated 1888 * data block, indeed). We have to free the top of that path along 1889 * with everything to the right of the path. 
Since no allocation 1890 * past the truncation point is possible until ext3_truncate() 1891 * finishes, we may safely do the latter, but top of branch may 1892 * require special attention - pageout below the truncation point 1893 * might try to populate it. 1894 * 1895 * We atomically detach the top of branch from the tree, store the 1896 * block number of its root in *@top, pointers to buffer_heads of 1897 * partially truncated blocks - in @chain[].bh and pointers to 1898 * their last elements that should not be removed - in 1899 * @chain[].p. Return value is the pointer to last filled element 1900 * of @chain. 1901 * 1902 * The work left to caller to do the actual freeing of subtrees: 1903 * a) free the subtree starting from *@top 1904 * b) free the subtrees whose roots are stored in 1905 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 1906 * c) free the subtrees growing from the inode past the @chain[0]. 1907 * (no partially truncated stuff there). */ 1908 1909static Indirect *ext3_find_shared(struct inode *inode, int depth, 1910 int offsets[4], Indirect chain[4], __le32 *top) 1911{ 1912 Indirect *partial, *p; 1913 int k, err; 1914 1915 *top = 0; 1916 /* Make k index the deepest non-null offest + 1 */ 1917 for (k = depth; k > 1 && !offsets[k-1]; k--) 1918 ; 1919 partial = ext3_get_branch(inode, k, offsets, chain, &err); 1920 /* Writer: pointers */ 1921 if (!partial) 1922 partial = chain + k-1; 1923 /* 1924 * If the branch acquired continuation since we've looked at it - 1925 * fine, it should all survive and (new) top doesn't belong to us. 1926 */ 1927 if (!partial->key && *partial->p) 1928 /* Writer: end */ 1929 goto no_top; 1930 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 1931 ; 1932 /* 1933 * OK, we've found the last block that must survive. The rest of our 1934 * branch should be detached before unlocking. 
However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext3. Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	while(partial > p) {
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}

/*
 * Zero a number of block pointers in either an inode or an indirect block.
 * If we restart the transaction we must again get write access to the
 * indirect block for further modification.
 *
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 *
 * @handle:        journal handle the work is charged to
 * @inode:         inode that owns the pointers
 * @bh:            buffer containing [@first, @last), or NULL when the
 *                 pointers live in the inode itself (ext3_free_data()
 *                 passes its @this_bh straight through)
 * @block_to_free: first block of the contiguous on-disk run
 * @count:         number of blocks in that run
 * @first, @last:  half-open range of little-endian pointers to zero
 */
static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
		struct buffer_head *bh, ext3_fsblk_t block_to_free,
		unsigned long count, __le32 *first, __le32 *last)
{
	__le32 *p;
	/*
	 * NOTE(review): try_to_extend_transaction() is defined outside this
	 * chunk; presumably it returns non-zero when the handle could not be
	 * extended and a restart is needed -- confirm.  In that case the
	 * indirect block and the inode are dirtied before the restart, and
	 * write access to @bh is taken again afterwards.
	 */
	if (try_to_extend_transaction(handle, inode)) {
		if (bh) {
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			ext3_journal_dirty_metadata(handle, bh);
		}
		ext3_mark_inode_dirty(handle, inode);
		ext3_journal_test_restart(handle, inode);
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			ext3_journal_get_write_access(handle, bh);
		}
	}

	/*
	 * Any buffers which are on the journal will be in memory. We find
	 * them on the hash table so journal_revoke() will run journal_forget()
	 * on them. We've already detached each block from the file, so
	 * bforget() in journal_forget() should be safe.
	 *
	 * AKPM: turn on bforget in journal_forget()!!!
	 */
	for (p = first; p < last; p++) {
		u32 nr = le32_to_cpu(*p);
		if (nr) {
			/* Note: this local deliberately hides the @bh parameter. */
			struct buffer_head *bh;

			/* Detach the block from the file before forgetting it. */
			*p = 0;
			bh = sb_find_get_block(inode->i_sb, nr);
			ext3_forget(handle, 0, inode, bh, nr);
		}
	}

	/* One bitmap update for the whole contiguous run. */
	ext3_free_blocks(handle, inode, block_to_free, count);
}

/**
 * ext3_free_data - free a list of data blocks
 * @handle:	handle for this transaction
 * @inode:	inode we are dealing with
 * @this_bh:	indirect buffer_head which contains *@first and *@last
 * @first:	array of block numbers
 * @last:	points immediately past the end of array
 *
 * We are freeing all blocks referred from that array (numbers are stored as
 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free. Conveniently, if these
 * blocks are contiguous then releasing them at one time will only affect one
 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 * actually use a lot of journal space.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext3_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
	unsigned long count = 0;	    /* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
	ext3_fsblk_t nr;		    /* Current block # */
	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
	int err;

	if (this_bh) {				/* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				/* First non-hole seen: open a new run. */
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				/*
				 * Run broken: release what we have (pointer
				 * range [block_to_free_p, p) may include
				 * holes) and start a new run at this block.
				 */
				ext3_clear_blocks(handle, inode, this_bh,
						  block_to_free,
						  count, block_to_free_p, p);
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	/* Flush the final run; here p == last. */
	if (count > 0)
		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				  count, block_to_free_p, p);

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
		ext3_journal_dirty_metadata(handle, this_bh);
	}
}

/**
 * ext3_free_branches - free an array of branches
 * @handle: JBD handle for this transaction
 * @inode:	inode we are dealing with
 * @parent_bh: the buffer_head which contains *@first and *@last
 * @first:	array of block numbers
 * @last:	pointer immediately past the end of array
 * @depth:	depth of the branches to free
 *
 * We are freeing all blocks referred from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 *
 * A @depth of zero means [@first, @last) already holds data block
 * pointers, which are handed to ext3_free_data().  Otherwise every
 * non-zero entry names an indirect block that is read, recursed into
 * with @depth - 1, and then freed itself.  The function returns early,
 * without finishing the array, once the handle is found to be aborted.
 */
static void ext3_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext3_fsblk_t nr;
	__le32 *p;

	if (is_handle_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
		/* Walk the array from its last entry down to the first. */
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 *
			 * NOTE(review): despite the wording above, the slot
			 * is not zeroed on this path; the loop just moves on
			 * to the next entry.
			 */
			if (!bh) {
				ext3_error(inode->i_sb, "ext3_free_branches",
					   "Read failure, inode=%lu, block="E3FSBLK,
					   inode->i_ino, nr);
				continue;
			}

			/* This zaps the entire block. Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext3_free_branches(handle, inode, bh,
					   (__le32*)bh->b_data,
					   (__le32*)bh->b_data + addr_per_block,
					   depth);

			/*
			 * We've probably journalled the indirect block several
			 * times during the truncate. But it's no longer
			 * needed and we now drop it from the transaction via
			 * journal_revoke().
			 *
			 * That's easy if it's exclusively part of this
			 * transaction. But if it's part of the committing
			 * transaction then journal_forget() will simply
			 * brelse() it. That means that if the underlying
			 * block is reallocated in ext3_get_block(),
			 * unmap_underlying_metadata() will find this block
			 * and will try to get rid of it. damn, damn.
			 *
			 * If this block has already been committed to the
			 * journal, a revoke record will be written. And
			 * revoke records must be emitted *before* clearing
			 * this block's bit in the bitmaps.
			 */
			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);

			/*
			 * Everything below this pointer has been
			 * released. Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it. So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (is_handle_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext3_mark_inode_dirty(handle, inode);
				ext3_journal_test_restart(handle, inode);
			}

			ext3_free_blocks(handle, inode, nr, 1);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext3_journal_get_write_access(handle,
								   parent_bh)){
					*p = 0;
					BUFFER_TRACE(parent_bh,
					"call ext3_journal_dirty_metadata");
					ext3_journal_dirty_metadata(handle,
								    parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext3_free_data(handle, inode, parent_bh, first, last);
	}
}

/*
 * ext3_truncate()
 *
 * We block out ext3_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk. We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable. It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).
After a crash, ext3_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext3_truncate() to have another go. So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext3 filesystem. But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext3_truncate() run will find them and release them.
 */
void ext3_truncate(struct inode *inode)
{
	handle_t *handle;
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n;
	long last_block;
	unsigned blocksize = inode->i_sb->s_blocksize;
	struct page *page;

	/* Only regular files, directories and slow symlinks own blocks. */
	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
	    S_ISLNK(inode->i_mode)))
		return;
	if (ext3_inode_is_fast_symlink(inode))
		return;
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return;

	/*
	 * We have to lock the EOF page here, because lock_page() nests
	 * outside journal_start().
	 */
	if ((inode->i_size & (blocksize - 1)) == 0) {
		/* Block boundary? Nothing to do */
		page = NULL;
	} else {
		page = grab_cache_page(mapping,
				inode->i_size >> PAGE_CACHE_SHIFT);
		if (!page)
			return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* No transaction: clean and release the EOF page, then give up. */
		if (page) {
			clear_highpage(page);
			flush_dcache_page(page);
			unlock_page(page);
			page_cache_release(page);
		}
		return;		/* AKPM: return what? */
	}

	/* Index of the first block that lies wholly beyond the new i_size. */
	last_block = (inode->i_size + blocksize-1)
					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);

	if (page)
		ext3_block_truncate_page(handle, page, mapping, inode->i_size);

	/* n is the depth of the path to last_block; 0 signals an error. */
	n = ext3_block_to_path(inode, last_block, offsets, NULL);
	if (n == 0)
		goto out_stop;	/* error */

	/*
	 * OK. This truncate is going to happen. We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers. It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext3_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext3 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	/*
	 * From here we block out all ext3_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	mutex_lock(&ei->truncate_mutex);

	if (n == 1) {		/* direct blocks */
		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT3_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop. No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext3_free_branches(handle, inode, partial->bh,
					partial->p,
					partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse (partial->bh);
		partial--;
	}
do_indirects:
	/*
	 * Kill the remaining (whole) subtrees.  The cases fall through on
	 * purpose: starting at the subtree that contains the truncation
	 * point, every higher-level subtree is released as a whole.
	 */
	switch (offsets[0]) {
	default:
		nr = i_data[EXT3_IND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
			i_data[EXT3_IND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_IND_BLOCK:
		nr = i_data[EXT3_DIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
			i_data[EXT3_DIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_DIND_BLOCK:
		nr = i_data[EXT3_TIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
			i_data[EXT3_TIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_TIND_BLOCK:
		;
	}

	ext3_discard_reservation(inode);

	mutex_unlock(&ei->truncate_mutex);
	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/*
	 * In a multi-transaction truncate, we only make the final transaction
	 * synchronous
	 */
	if (IS_SYNC(inode))
		handle->h_sync = 1;
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext3_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext3_orphan_del(handle, inode);

	ext3_journal_stop(handle);
}

/*
 * ext3_get_inode_block - find the inode-table block that holds inode @ino.
 *
 * Returns the filesystem block number, or 0 when @ino is not a valid inode
 * number, its block group is out of range, or the group descriptor block is
 * not loaded.  On success iloc->block_group is set to the inode's group and
 * iloc->offset to the inode's byte offset inside the returned block.
 * iloc->bh is not touched here.
 */
static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
		unsigned long ino, struct ext3_iloc *iloc)
{
	unsigned long desc, group_desc, block_group;
	unsigned long offset;
	ext3_fsblk_t block;
	struct buffer_head *bh;
	struct ext3_group_desc * gdp;

	if (!ext3_valid_inum(sb, ino)) {
		/*
		 * This error is already checked for in namei.c unless we are
		 * looking at an NFS filehandle, in which case no error
		 * report is needed
		 */
		return 0;
	}

	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
	if (block_group >= EXT3_SB(sb)->s_groups_count) {
		ext3_error(sb,"ext3_get_inode_block","group >= groups count");
		return 0;
	}
	smp_rmb();
	/* Which descriptor block, and which slot inside that block. */
	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
	desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
	bh = EXT3_SB(sb)->s_group_desc[group_desc];
	if (!bh) {
		ext3_error (sb, "ext3_get_inode_block",
			    "Descriptor not loaded");
		return 0;
	}

	gdp = (struct ext3_group_desc *)bh->b_data;
	/*
	 * Figure out the offset within the block group inode table
	 */
	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
		EXT3_INODE_SIZE(sb);
	block = le32_to_cpu(gdp[desc].bg_inode_table) +
		(offset >> EXT3_BLOCK_SIZE_BITS(sb));

	iloc->block_group = block_group;
	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
	return block;
}

/*
 * ext3_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If 'in_mem' is true, we have all
 * data in memory that is needed to recreate the on-disk version of this
 * inode.
2452 */ 2453static int __ext3_get_inode_loc(struct inode *inode, 2454 struct ext3_iloc *iloc, int in_mem) 2455{ 2456 ext3_fsblk_t block; 2457 struct buffer_head *bh; 2458 2459 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); 2460 if (!block) 2461 return -EIO; 2462 2463 bh = sb_getblk(inode->i_sb, block); 2464 if (!bh) { 2465 ext3_error (inode->i_sb, "ext3_get_inode_loc", 2466 "unable to read inode block - " 2467 "inode=%lu, block="E3FSBLK, 2468 inode->i_ino, block); 2469 return -EIO; 2470 } 2471 if (!buffer_uptodate(bh)) { 2472 lock_buffer(bh); 2473 if (buffer_uptodate(bh)) { 2474 /* someone brought it uptodate while we waited */ 2475 unlock_buffer(bh); 2476 goto has_buffer; 2477 } 2478 2479 /* 2480 * If we have all information of the inode in memory and this 2481 * is the only valid inode in the block, we need not read the 2482 * block. 2483 */ 2484 if (in_mem) { 2485 struct buffer_head *bitmap_bh; 2486 struct ext3_group_desc *desc; 2487 int inodes_per_buffer; 2488 int inode_offset, i; 2489 int block_group; 2490 int start; 2491 2492 block_group = (inode->i_ino - 1) / 2493 EXT3_INODES_PER_GROUP(inode->i_sb); 2494 inodes_per_buffer = bh->b_size / 2495 EXT3_INODE_SIZE(inode->i_sb); 2496 inode_offset = ((inode->i_ino - 1) % 2497 EXT3_INODES_PER_GROUP(inode->i_sb)); 2498 start = inode_offset & ~(inodes_per_buffer - 1); 2499 2500 /* Is the inode bitmap in cache? */ 2501 desc = ext3_get_group_desc(inode->i_sb, 2502 block_group, NULL); 2503 if (!desc) 2504 goto make_io; 2505 2506 bitmap_bh = sb_getblk(inode->i_sb, 2507 le32_to_cpu(desc->bg_inode_bitmap)); 2508 if (!bitmap_bh) 2509 goto make_io; 2510 2511 /* 2512 * If the inode bitmap isn't in cache then the 2513 * optimisation may end up performing two reads instead 2514 * of one, so skip it. 
2515 */ 2516 if (!buffer_uptodate(bitmap_bh)) { 2517 brelse(bitmap_bh); 2518 goto make_io; 2519 } 2520 for (i = start; i < start + inodes_per_buffer; i++) { 2521 if (i == inode_offset) 2522 continue; 2523 if (ext3_test_bit(i, bitmap_bh->b_data)) 2524 break; 2525 } 2526 brelse(bitmap_bh); 2527 if (i == start + inodes_per_buffer) { 2528 /* all other inodes are free, so skip I/O */ 2529 memset(bh->b_data, 0, bh->b_size); 2530 set_buffer_uptodate(bh); 2531 unlock_buffer(bh); 2532 goto has_buffer; 2533 } 2534 } 2535 2536make_io: 2537 /* 2538 * There are other valid inodes in the buffer, this inode 2539 * has in-inode xattrs, or we don't have this inode in memory. 2540 * Read the block from disk. 2541 */ 2542 get_bh(bh); 2543 bh->b_end_io = end_buffer_read_sync; 2544 submit_bh(READ_META, bh); 2545 wait_on_buffer(bh); 2546 if (!buffer_uptodate(bh)) { 2547 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2548 "unable to read inode block - " 2549 "inode=%lu, block="E3FSBLK, 2550 inode->i_ino, block); 2551 brelse(bh); 2552 return -EIO; 2553 } 2554 } 2555has_buffer: 2556 iloc->bh = bh; 2557 return 0; 2558} 2559 2560int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) 2561{ 2562 /* We have all inode data except xattrs in memory here. 
*/ 2563 return __ext3_get_inode_loc(inode, iloc, 2564 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); 2565} 2566 2567void ext3_set_inode_flags(struct inode *inode) 2568{ 2569 unsigned int flags = EXT3_I(inode)->i_flags; 2570 2571 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 2572 if (flags & EXT3_SYNC_FL) 2573 inode->i_flags |= S_SYNC; 2574 if (flags & EXT3_APPEND_FL) 2575 inode->i_flags |= S_APPEND; 2576 if (flags & EXT3_IMMUTABLE_FL) 2577 inode->i_flags |= S_IMMUTABLE; 2578 if (flags & EXT3_NOATIME_FL) 2579 inode->i_flags |= S_NOATIME; 2580 if (flags & EXT3_DIRSYNC_FL) 2581 inode->i_flags |= S_DIRSYNC; 2582} 2583 2584void ext3_read_inode(struct inode * inode) 2585{ 2586 struct ext3_iloc iloc; 2587 struct ext3_inode *raw_inode; 2588 struct ext3_inode_info *ei = EXT3_I(inode); 2589 struct buffer_head *bh; 2590 int block; 2591 2592#ifdef CONFIG_EXT3_FS_POSIX_ACL 2593 ei->i_acl = EXT3_ACL_NOT_CACHED; 2594 ei->i_default_acl = EXT3_ACL_NOT_CACHED; 2595#endif 2596 ei->i_block_alloc_info = NULL; 2597 2598 if (__ext3_get_inode_loc(inode, &iloc, 0)) 2599 goto bad_inode; 2600 bh = iloc.bh; 2601 raw_inode = ext3_raw_inode(&iloc); 2602 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 2603 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 2604 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 2605 if(!(test_opt (inode->i_sb, NO_UID32))) { 2606 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2607 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2608 } 2609 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2610 inode->i_size = le32_to_cpu(raw_inode->i_size); 2611 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); 2612 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); 2613 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); 2614 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2615 2616 ei->i_state = 0; 2617 ei->i_dir_start_lookup = 0; 2618 ei->i_dtime = 
le32_to_cpu(raw_inode->i_dtime); 2619 /* We now have enough fields to check if the inode was active or not. 2620 * This is needed because nfsd might try to access dead inodes 2621 * the test is that same one that e2fsck uses 2622 * NeilBrown 1999oct15 2623 */ 2624 if (inode->i_nlink == 0) { 2625 if (inode->i_mode == 0 || 2626 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { 2627 /* this inode is deleted */ 2628 brelse (bh); 2629 goto bad_inode; 2630 } 2631 /* The only unlinked inodes we let through here have 2632 * valid i_mode and are being read by the orphan 2633 * recovery code: that's fine, we're about to complete 2634 * the process of deleting those. */ 2635 } 2636 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 2637 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2638#ifdef EXT3_FRAGMENTS 2639 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 2640 ei->i_frag_no = raw_inode->i_frag; 2641 ei->i_frag_size = raw_inode->i_fsize; 2642#endif 2643 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2644 if (!S_ISREG(inode->i_mode)) { 2645 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 2646 } else { 2647 inode->i_size |= 2648 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 2649 } 2650 ei->i_disksize = inode->i_size; 2651 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2652 ei->i_block_group = iloc.block_group; 2653 /* 2654 * NOTE! The in-memory inode i_data array is in little-endian order 2655 * even on big-endian machines: we do NOT byteswap the block numbers! 2656 */ 2657 for (block = 0; block < EXT3_N_BLOCKS; block++) 2658 ei->i_data[block] = raw_inode->i_block[block]; 2659 INIT_LIST_HEAD(&ei->i_orphan); 2660 2661 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 2662 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { 2663 /* 2664 * When mke2fs creates big inodes it does not zero out 2665 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, 2666 * so ignore those first few inodes. 
2667 */ 2668 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2669 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2670 EXT3_INODE_SIZE(inode->i_sb)) 2671 goto bad_inode; 2672 if (ei->i_extra_isize == 0) { 2673 /* The extra space is currently unused. Use it. */ 2674 ei->i_extra_isize = sizeof(struct ext3_inode) - 2675 EXT3_GOOD_OLD_INODE_SIZE; 2676 } else { 2677 __le32 *magic = (void *)raw_inode + 2678 EXT3_GOOD_OLD_INODE_SIZE + 2679 ei->i_extra_isize; 2680 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2681 ei->i_state |= EXT3_STATE_XATTR; 2682 } 2683 } else 2684 ei->i_extra_isize = 0; 2685 2686 if (S_ISREG(inode->i_mode)) { 2687 inode->i_op = &ext3_file_inode_operations; 2688 inode->i_fop = &ext3_file_operations; 2689 ext3_set_aops(inode); 2690 } else if (S_ISDIR(inode->i_mode)) { 2691 inode->i_op = &ext3_dir_inode_operations; 2692 inode->i_fop = &ext3_dir_operations; 2693 } else if (S_ISLNK(inode->i_mode)) { 2694 if (ext3_inode_is_fast_symlink(inode)) 2695 inode->i_op = &ext3_fast_symlink_inode_operations; 2696 else { 2697 inode->i_op = &ext3_symlink_inode_operations; 2698 ext3_set_aops(inode); 2699 } 2700 } else { 2701 inode->i_op = &ext3_special_inode_operations; 2702 if (raw_inode->i_block[0]) 2703 init_special_inode(inode, inode->i_mode, 2704 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 2705 else 2706 init_special_inode(inode, inode->i_mode, 2707 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 2708 } 2709 brelse (iloc.bh); 2710 ext3_set_inode_flags(inode); 2711 return; 2712 2713bad_inode: 2714 make_bad_inode(inode); 2715 return; 2716} 2717 2718/* 2719 * Post the struct inode info into an on-disk inode location in the 2720 * buffer-cache. This gobbles the caller's reference to the 2721 * buffer_head in the inode location struct. 2722 * 2723 * The caller must have write access to iloc->bh. 
2724 */ 2725static int ext3_do_update_inode(handle_t *handle, 2726 struct inode *inode, 2727 struct ext3_iloc *iloc) 2728{ 2729 struct ext3_inode *raw_inode = ext3_raw_inode(iloc); 2730 struct ext3_inode_info *ei = EXT3_I(inode); 2731 struct buffer_head *bh = iloc->bh; 2732 int err = 0, rc, block; 2733 2734 /* For fields not not tracking in the in-memory inode, 2735 * initialise them to zero for new inodes. */ 2736 if (ei->i_state & EXT3_STATE_NEW) 2737 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 2738 2739 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 2740 if(!(test_opt(inode->i_sb, NO_UID32))) { 2741 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 2742 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 2743/* 2744 * Fix up interoperability with old kernels. Otherwise, old inodes get 2745 * re-used with the upper 16 bits of the uid/gid intact 2746 */ 2747 if(!ei->i_dtime) { 2748 raw_inode->i_uid_high = 2749 cpu_to_le16(high_16_bits(inode->i_uid)); 2750 raw_inode->i_gid_high = 2751 cpu_to_le16(high_16_bits(inode->i_gid)); 2752 } else { 2753 raw_inode->i_uid_high = 0; 2754 raw_inode->i_gid_high = 0; 2755 } 2756 } else { 2757 raw_inode->i_uid_low = 2758 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 2759 raw_inode->i_gid_low = 2760 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 2761 raw_inode->i_uid_high = 0; 2762 raw_inode->i_gid_high = 0; 2763 } 2764 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 2765 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 2766 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 2767 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 2768 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 2769 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 2770 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 2771 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 2772#ifdef EXT3_FRAGMENTS 2773 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 2774 raw_inode->i_frag = ei->i_frag_no; 2775 
raw_inode->i_fsize = ei->i_frag_size; 2776#endif 2777 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 2778 if (!S_ISREG(inode->i_mode)) { 2779 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 2780 } else { 2781 raw_inode->i_size_high = 2782 cpu_to_le32(ei->i_disksize >> 32); 2783 if (ei->i_disksize > 0x7fffffffULL) { 2784 struct super_block *sb = inode->i_sb; 2785 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 2786 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || 2787 EXT3_SB(sb)->s_es->s_rev_level == 2788 cpu_to_le32(EXT3_GOOD_OLD_REV)) { 2789 /* If this is the first large file 2790 * created, add a flag to the superblock. 2791 */ 2792 err = ext3_journal_get_write_access(handle, 2793 EXT3_SB(sb)->s_sbh); 2794 if (err) 2795 goto out_brelse; 2796 ext3_update_dynamic_rev(sb); 2797 EXT3_SET_RO_COMPAT_FEATURE(sb, 2798 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 2799 sb->s_dirt = 1; 2800 handle->h_sync = 1; 2801 err = ext3_journal_dirty_metadata(handle, 2802 EXT3_SB(sb)->s_sbh); 2803 } 2804 } 2805 } 2806 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 2807 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 2808 if (old_valid_dev(inode->i_rdev)) { 2809 raw_inode->i_block[0] = 2810 cpu_to_le32(old_encode_dev(inode->i_rdev)); 2811 raw_inode->i_block[1] = 0; 2812 } else { 2813 raw_inode->i_block[0] = 0; 2814 raw_inode->i_block[1] = 2815 cpu_to_le32(new_encode_dev(inode->i_rdev)); 2816 raw_inode->i_block[2] = 0; 2817 } 2818 } else for (block = 0; block < EXT3_N_BLOCKS; block++) 2819 raw_inode->i_block[block] = ei->i_data[block]; 2820 2821 if (ei->i_extra_isize) 2822 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 2823 2824 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2825 rc = ext3_journal_dirty_metadata(handle, bh); 2826 if (!err) 2827 err = rc; 2828 ei->i_state &= ~EXT3_STATE_NEW; 2829 2830out_brelse: 2831 brelse (bh); 2832 ext3_std_error(inode->i_sb, err); 2833 return err; 2834} 2835 2836/* 2837 * ext3_write_inode() 2838 * 2839 * We are called from a few 
places: 2840 * 2841 * - Within generic_file_write() for O_SYNC files. 2842 * Here, there will be no transaction running. We wait for any running 2843 * trasnaction to commit. 2844 * 2845 * - Within sys_sync(), kupdate and such. 2846 * We wait on commit, if tol to. 2847 * 2848 * - Within prune_icache() (PF_MEMALLOC == true) 2849 * Here we simply return. We can't afford to block kswapd on the 2850 * journal commit. 2851 * 2852 * In all cases it is actually safe for us to return without doing anything, 2853 * because the inode has been copied into a raw inode buffer in 2854 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 2855 * knfsd. 2856 * 2857 * Note that we are absolutely dependent upon all inode dirtiers doing the 2858 * right thing: they *must* call mark_inode_dirty() after dirtying info in 2859 * which we are interested. 2860 * 2861 * It would be a bug for them to not do this. The code: 2862 * 2863 * mark_inode_dirty(inode) 2864 * stuff(); 2865 * inode->i_size = expr; 2866 * 2867 * is in error because a kswapd-driven write_inode() could occur while 2868 * `stuff()' is running, and the new i_size will be lost. Plus the inode 2869 * will no longer be on the superblock's dirty inode list. 2870 */ 2871int ext3_write_inode(struct inode *inode, int wait) 2872{ 2873 if (current->flags & PF_MEMALLOC) 2874 return 0; 2875 2876 if (ext3_journal_current_handle()) { 2877 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); 2878 dump_stack(); 2879 return -EIO; 2880 } 2881 2882 if (!wait) 2883 return 0; 2884 2885 return ext3_force_commit(inode->i_sb); 2886} 2887 2888/* 2889 * ext3_setattr() 2890 * 2891 * Called from notify_change. 2892 * 2893 * We want to trap VFS attempts to truncate the file as soon as 2894 * possible. 
In particular, we want to make sure that when the VFS 2895 * shrinks i_size, we put the inode on the orphan list and modify 2896 * i_disksize immediately, so that during the subsequent flushing of 2897 * dirty pages and freeing of disk blocks, we can guarantee that any 2898 * commit will leave the blocks being flushed in an unused state on 2899 * disk. (On recovery, the inode will get truncated and the blocks will 2900 * be freed, so we have a strong guarantee that no future commit will 2901 * leave these blocks visible to the user.) 2902 * 2903 * Called with inode->sem down. 2904 */ 2905int ext3_setattr(struct dentry *dentry, struct iattr *attr) 2906{ 2907 struct inode *inode = dentry->d_inode; 2908 int error, rc = 0; 2909 const unsigned int ia_valid = attr->ia_valid; 2910 2911 error = inode_change_ok(inode, attr); 2912 if (error) 2913 return error; 2914 2915 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 2916 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 2917 handle_t *handle; 2918 2919 /* (user+group)*(old+new) structure, inode write (sb, 2920 * inode block, ? - but truncate inode update has it) */ 2921 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ 2922 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 2923 if (IS_ERR(handle)) { 2924 error = PTR_ERR(handle); 2925 goto err_out; 2926 } 2927 error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; 2928 if (error) { 2929 ext3_journal_stop(handle); 2930 return error; 2931 } 2932 /* Update corresponding info in inode so that everything is in 2933 * one transaction */ 2934 if (attr->ia_valid & ATTR_UID) 2935 inode->i_uid = attr->ia_uid; 2936 if (attr->ia_valid & ATTR_GID) 2937 inode->i_gid = attr->ia_gid; 2938 error = ext3_mark_inode_dirty(handle, inode); 2939 ext3_journal_stop(handle); 2940 } 2941 2942 if (S_ISREG(inode->i_mode) && 2943 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 2944 handle_t *handle; 2945 2946 handle = ext3_journal_start(inode, 3); 2947 if (IS_ERR(handle)) { 2948 error = PTR_ERR(handle); 2949 goto err_out; 2950 } 2951 2952 error = ext3_orphan_add(handle, inode); 2953 EXT3_I(inode)->i_disksize = attr->ia_size; 2954 rc = ext3_mark_inode_dirty(handle, inode); 2955 if (!error) 2956 error = rc; 2957 ext3_journal_stop(handle); 2958 } 2959 2960 rc = inode_setattr(inode, attr); 2961 2962 /* If inode_setattr's call to ext3_truncate failed to get a 2963 * transaction handle at all, we need to clean up the in-core 2964 * orphan list manually. */ 2965 if (inode->i_nlink) 2966 ext3_orphan_del(NULL, inode); 2967 2968 if (!rc && (ia_valid & ATTR_MODE)) 2969 rc = ext3_acl_chmod(inode); 2970 2971err_out: 2972 ext3_std_error(inode->i_sb, error); 2973 if (!error) 2974 error = rc; 2975 return error; 2976} 2977 2978 2979/* 2980 * How many blocks doth make a writepage()? 2981 * 2982 * With N blocks per page, it may be: 2983 * N data blocks 2984 * 2 indirect block 2985 * 2 dindirect 2986 * 1 tindirect 2987 * N+5 bitmap blocks (from the above) 2988 * N+5 group descriptor summary blocks 2989 * 1 inode block 2990 * 1 superblock. 2991 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files 2992 * 2993 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS 2994 * 2995 * With ordered or writeback data it's the same, less the N data blocks. 
2996 * 2997 * If the inode's direct blocks can hold an integral number of pages then a 2998 * page cannot straddle two indirect blocks, and we can only touch one indirect 2999 * and dindirect block, and the "5" above becomes "3". 3000 * 3001 * This still overestimates under most circumstances. If we were to pass the 3002 * start and end offsets in here as well we could do block_to_path() on each 3003 * block and work out the exact number of indirects which are touched. Pah. 3004 */ 3005 3006static int ext3_writepage_trans_blocks(struct inode *inode) 3007{ 3008 int bpp = ext3_journal_blocks_per_page(inode); 3009 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; 3010 int ret; 3011 3012 if (ext3_should_journal_data(inode)) 3013 ret = 3 * (bpp + indirects) + 2; 3014 else 3015 ret = 2 * (bpp + indirects) + 2; 3016 3017#ifdef CONFIG_QUOTA 3018 /* We know that structure was already allocated during DQUOT_INIT so 3019 * we will be updating only the data blocks + inodes */ 3020 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); 3021#endif 3022 3023 return ret; 3024} 3025 3026/* 3027 * The caller must have previously called ext3_reserve_inode_write(). 3028 * Give this, we know that the caller already has write access to iloc->bh. 3029 */ 3030int ext3_mark_iloc_dirty(handle_t *handle, 3031 struct inode *inode, struct ext3_iloc *iloc) 3032{ 3033 int err = 0; 3034 3035 /* the do_update_inode consumes one bh->b_count */ 3036 get_bh(iloc->bh); 3037 3038 /* ext3_do_update_inode() does journal_dirty_metadata */ 3039 err = ext3_do_update_inode(handle, inode, iloc); 3040 put_bh(iloc->bh); 3041 return err; 3042} 3043 3044/* 3045 * On success, We end up with an outstanding reference count against 3046 * iloc->bh. This _must_ be cleaned up later. 
3047 */ 3048 3049int 3050ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 3051 struct ext3_iloc *iloc) 3052{ 3053 int err = 0; 3054 if (handle) { 3055 err = ext3_get_inode_loc(inode, iloc); 3056 if (!err) { 3057 BUFFER_TRACE(iloc->bh, "get_write_access"); 3058 err = ext3_journal_get_write_access(handle, iloc->bh); 3059 if (err) { 3060 brelse(iloc->bh); 3061 iloc->bh = NULL; 3062 } 3063 } 3064 } 3065 ext3_std_error(inode->i_sb, err); 3066 return err; 3067} 3068 3069/* 3070 * What we do here is to mark the in-core inode as clean with respect to inode 3071 * dirtiness (it may still be data-dirty). 3072 * This means that the in-core inode may be reaped by prune_icache 3073 * without having to perform any I/O. This is a very good thing, 3074 * because *any* task may call prune_icache - even ones which 3075 * have a transaction open against a different journal. 3076 * 3077 * Is this cheating? Not really. Sure, we haven't written the 3078 * inode out, but prune_icache isn't a user-visible syncing function. 3079 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3080 * we start and wait on commits. 3081 * 3082 * Is this efficient/effective? Well, we're being nice to the system 3083 * by cleaning up our inodes proactively so they can be reaped 3084 * without I/O. But we are potentially leaving up to five seconds' 3085 * worth of inodes floating about which prune_icache wants us to 3086 * write out. One way to fix that would be to get prune_icache() 3087 * to do a write_super() to free up some memory. It has the desired 3088 * effect. 
3089 */ 3090int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) 3091{ 3092 struct ext3_iloc iloc; 3093 int err; 3094 3095 might_sleep(); 3096 err = ext3_reserve_inode_write(handle, inode, &iloc); 3097 if (!err) 3098 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3099 return err; 3100} 3101 3102/* 3103 * ext3_dirty_inode() is called from __mark_inode_dirty() 3104 * 3105 * We're really interested in the case where a file is being extended. 3106 * i_size has been changed by generic_commit_write() and we thus need 3107 * to include the updated inode in the current transaction. 3108 * 3109 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 3110 * are allocated to the file. 3111 * 3112 * If the inode is marked synchronous, we don't honour that here - doing 3113 * so would cause a commit on atime updates, which we don't bother doing. 3114 * We handle synchronous inodes at the highest possible level. 3115 */ 3116void ext3_dirty_inode(struct inode *inode) 3117{ 3118 handle_t *current_handle = ext3_journal_current_handle(); 3119 handle_t *handle; 3120 3121 handle = ext3_journal_start(inode, 2); 3122 if (IS_ERR(handle)) 3123 goto out; 3124 if (current_handle && 3125 current_handle->h_transaction != handle->h_transaction) { 3126 /* This task has a transaction open against a different fs */ 3127 printk(KERN_EMERG "%s: transactions do not match!\n", 3128 __FUNCTION__); 3129 } else { 3130 jbd_debug(5, "marking dirty. outer handle=%p\n", 3131 current_handle); 3132 ext3_mark_inode_dirty(handle, inode); 3133 } 3134 ext3_journal_stop(handle); 3135out: 3136 return; 3137} 3138 3139#if 0 3140/* 3141 * Bind an inode's backing buffer_head into this transaction, to prevent 3142 * it from being flushed to disk early. Unlike 3143 * ext3_reserve_inode_write, this leaves behind no bh reference and 3144 * returns no iloc structure, so the caller needs to repeat the iloc 3145 * lookup to mark the inode dirty later. 
3146 */ 3147static int ext3_pin_inode(handle_t *handle, struct inode *inode) 3148{ 3149 struct ext3_iloc iloc; 3150 3151 int err = 0; 3152 if (handle) { 3153 err = ext3_get_inode_loc(inode, &iloc); 3154 if (!err) { 3155 BUFFER_TRACE(iloc.bh, "get_write_access"); 3156 err = journal_get_write_access(handle, iloc.bh); 3157 if (!err) 3158 err = ext3_journal_dirty_metadata(handle, 3159 iloc.bh); 3160 brelse(iloc.bh); 3161 } 3162 } 3163 ext3_std_error(inode->i_sb, err); 3164 return err; 3165} 3166#endif 3167 3168int ext3_change_inode_journal_flag(struct inode *inode, int val) 3169{ 3170 journal_t *journal; 3171 handle_t *handle; 3172 int err; 3173 3174 /* 3175 * We have to be very careful here: changing a data block's 3176 * journaling status dynamically is dangerous. If we write a 3177 * data block to the journal, change the status and then delete 3178 * that block, we risk forgetting to revoke the old log record 3179 * from the journal and so a subsequent replay can corrupt data. 3180 * So, first we make sure that the journal is empty and that 3181 * nobody is changing anything. 3182 */ 3183 3184 journal = EXT3_JOURNAL(inode); 3185 if (is_journal_aborted(journal) || IS_RDONLY(inode)) 3186 return -EROFS; 3187 3188 journal_lock_updates(journal); 3189 journal_flush(journal); 3190 3191 /* 3192 * OK, there are no updates running now, and all cached data is 3193 * synced to disk. We are now in a completely consistent state 3194 * which doesn't have anything in the journal, and we know that 3195 * no filesystem updates are running, so it is safe to modify 3196 * the inode's in-core data-journaling state flag now. 3197 */ 3198 3199 if (val) 3200 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; 3201 else 3202 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; 3203 ext3_set_aops(inode); 3204 3205 journal_unlock_updates(journal); 3206 3207 /* Finally we can mark the inode as dirty. 
*/ 3208 3209 handle = ext3_journal_start(inode, 1); 3210 if (IS_ERR(handle)) 3211 return PTR_ERR(handle); 3212 3213 err = ext3_mark_inode_dirty(handle, inode); 3214 handle->h_sync = 1; 3215 ext3_journal_stop(handle); 3216 ext3_std_error(inode->i_sb, err); 3217 3218 return err; 3219}