/*
 * linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/inode.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Goal-directed block allocation by Stephen Tweedie
 * (sct@redhat.com), 1993, 1998
 * Big-endian to little-endian byte-swapping/bitmaps by
 * David S. Miller (davem@caip.rutgers.edu), 1995
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 * (jj@sunsite.ms.mff.cuni.cz)
 *
 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */

#include <linux/highuid.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"

static int ext3_writepage_trans_blocks(struct inode *inode);
static int ext3_block_truncate_page(struct inode *inode, loff_t from);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext3_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT3_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */
int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
			struct buffer_head *bh, ext3_fsblk_t blocknr)
{
	int err;

	might_sleep();

	trace_ext3_forget(inode, is_metadata, blocknr);
	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			return ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		ext3_abort(inode->i_sb, __func__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
	unsigned long needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext3 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT3_MAX_TRANS_DATA)
		needed = EXT3_MAX_TRANS_DATA;

	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
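/*
 * Illustrative arithmetic for the sizing above (example numbers only):
 * on a 4KB-block filesystem (s_blocksize_bits == 12), an inode whose
 * i_blocks counts 1024 512-byte sectors spans 1024 >> (12 - 9) == 128
 * filesystem blocks, so the handle is sized at
 * EXT3_DATA_TRANS_BLOCKS(sb) + 128 credits, clamped to at least 2 and
 * at most EXT3_MAX_TRANS_DATA.
 */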
/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
	int ret;

	jbd_debug(2, "restarting handle %p\n", handle);
	/*
	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
	 * At this moment, get_block can be called only for blocks inside
	 * i_size since page cache has been already dropped and writes are
	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
	 */
	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
	mutex_lock(&EXT3_I(inode)->truncate_mutex);
	return ret;
}
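/*
 * A minimal sketch of how the truncate path is expected to use the two
 * helpers above (illustrative only; the real loop lives further down in
 * this file):
 *
 *	if (try_to_extend_transaction(handle, inode))
 *		truncate_restart_transaction(handle, inode);
 *
 * i.e. keep going while credits last, and commit and restart the handle
 * at a safe checkpoint once ext3_journal_extend() can no longer make
 * room.
 */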
/*
 * Called at inode eviction from icache
 */
void ext3_evict_inode(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct ext3_block_alloc_info *rsv;
	handle_t *handle;
	int want_delete = 0;

	trace_ext3_evict_inode(inode);
	if (!inode->i_nlink && !is_bad_inode(inode)) {
		dquot_initialize(inode);
		want_delete = 1;
	}

	/*
	 * When journalling data, dirty buffers are tracked only in the
	 * journal. So although mm thinks everything is clean and ready for
	 * reaping, the inode might still have some pages to write in the
	 * running transaction or waiting to be checkpointed. Thus calling
	 * journal_invalidatepage() (via truncate_inode_pages()) to discard
	 * these buffers can cause data loss. Also even if we did not discard
	 * these buffers, we would have no way to find them after the inode
	 * is reaped, and thus the user could see stale data if they try to
	 * read them before the transaction is checkpointed. So be careful
	 * and force everything to disk here... We use ei->i_datasync_tid to
	 * store the newest transaction containing inode's data.
	 *
	 * Note that directories do not have this problem because they don't
	 * use page cache.
	 *
	 * The s_journal check handles the case when ext3_get_journal() fails
	 * and puts the journal inode.
	 */
	if (inode->i_nlink && ext3_should_journal_data(inode) &&
	    EXT3_SB(inode->i_sb)->s_journal &&
	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
		tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
		journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;

		log_start_commit(journal, commit_tid);
		log_wait_commit(journal, commit_tid);
		filemap_write_and_wait(&inode->i_data);
	}
	truncate_inode_pages(&inode->i_data, 0);

	ext3_discard_reservation(inode);
	rsv = ei->i_block_alloc_info;
	ei->i_block_alloc_info = NULL;
	if (unlikely(rsv))
		kfree(rsv);

	if (!want_delete)
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record created when the inode lost the last
	 * link.  Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - ext3_truncate() could
	 * have removed the record.
	 */
	ext3_orphan_del(handle, inode);
	ei->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode)) {
		/* If that failed, just dquot_drop() and be done with that */
		dquot_drop(inode);
		clear_inode(inode);
	} else {
		ext3_xattr_delete_inode(handle, inode);
		dquot_free_inode(inode);
		dquot_drop(inode);
		clear_inode(inode);
		ext3_free_inode(handle, inode);
	}
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);
	dquot_drop(inode);
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}
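/*
 * To illustrate (a sketch, not part of the original source): after a
 * successful depth-2 walk for a singly-indirect block, the chain built
 * by ext3_get_branch() below looks like
 *
 *	chain[0] = { .p = &EXT3_I(inode)->i_data[EXT3_IND_BLOCK],
 *		     .key = *chain[0].p, .bh = NULL }
 *	chain[1] = { .p = (__le32 *)bh->b_data + offset,
 *		     .key = *chain[1].p, .bh = bh of the indirect block }
 *
 * verify_chain() simply re-checks that every cached key still matches
 * the pointer it was read from, detecting concurrent truncates.
 */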
/**
 * ext3_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of file's data ext3 uses a data structure common
 * for UNIX filesystems - tree of pointers anchored in the inode, with
 * data blocks at leaves and indirect blocks in intermediate nodes.
 * This function translates the block number into path in that tree -
 * return value is the path length and @offsets[n] is the offset of
 * pointer to (n+1)th node in the nth one. If @i_block is out of range
 * (negative or too large) a warning is printed and zero is returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed. All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
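/*
 * Worked example (illustrative, assuming 1KB blocks, so ptrs == 256 and
 * EXT3_NDIR_BLOCKS == 12): logical block 5 maps to offsets[] = {5} with
 * depth 1; logical block 300 becomes 300 - 12 = 288, which exceeds the
 * 256 singly-indirect slots, so 288 - 256 = 32 lands in the double
 * indirect tree as offsets[] = {EXT3_DIND_BLOCK, 32 >> 8, 32 & 255} =
 * {EXT3_DIND_BLOCK, 0, 32} with depth 3.
 */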
/**
 * ext3_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise. Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0. In other words, it holds the block
 * numbers of the chain, addresses they were taken from (and where we can
 * verify that chain did not change) and buffer_heads hosting these
 * numbers.
 *
 * Function stops when it stumbles upon zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it notices that chain had been changed while it was reading
 *	(ditto, *@err == -EAGAIN)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}

/**
 * ext3_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32 *)ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext3_fsblk_t bg_start;
	ext3_grpblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put
	 * it into the same cylinder group then.
	 */
	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
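/*
 * For example (illustrative numbers): with 32768 blocks per group, a
 * process with pid 4242 gets colour = (4242 % 16) * (32768 / 16) =
 * 2 * 2048 = 4096, i.e. its allocations start 4096 blocks into the
 * inode's block group, keeping concurrent writers from different
 * processes out of each other's way.
 */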
/**
 * ext3_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 */

static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
				   Indirect *partial)
{
	struct ext3_block_alloc_info *block_i;

	block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}

/**
 * ext3_blks_to_allocate - look up the block map and count the number
 *	of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
				 int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: if the [t,d]indirect block(s) have not been allocated
	 * yet, then it's clear that the blocks on that path have not been
	 * allocated either.
	 */
	if (k > 0) {
		/* right now we don't handle cross boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
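/*
 * Two illustrative cases (numbers are hypothetical): if an indirect
 * block is still missing (k > 0), a request for blks = 8 with
 * blocks_to_boundary = 3 is clipped to 3 + 1 = 4 blocks so the
 * allocation never crosses the indirect-block boundary. If the branch
 * is complete (k == 0), the loop above instead counts consecutive
 * pointer slots that are still zero, so a fully unmapped run likewise
 * yields min(blks, blocks_to_boundary + 1).
 */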
/**
 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: owner
 * @goal: preferred place for allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 * @blks: the number of blocks that need to be allocated for direct blocks
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 * @err: here we store the error value
 *
 * return the number of direct blocks allocated
 */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, int indirect_blks, int blks,
			ext3_fsblk_t new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;
	ext3_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the minimum
	 * number of blocks we need to allocate (required).
	 */
	target = blks + indirect_blks;

	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}
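/*
 * Tracing the loop above with hypothetical numbers: for indirect_blks = 2
 * and blks = 4, target starts at 6. Suppose ext3_new_blocks() returns a
 * run of 3 blocks starting at X: the first two become the new indirect
 * blocks (new_blocks[0] = X, new_blocks[1] = X + 1), one block is left
 * over, so the loop breaks with new_blocks[2] = X + 2 as the first (and
 * here only) direct block and the function returns 1 - the caller may
 * get fewer direct blocks than requested but always a complete branch.
 */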
/**
 * ext3_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode. It stores the information about that chain in the branch[], in
 * the same format as ext3_get_branch() would do. We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key). Upon the exit we have the same
 * picture as after the successful ext3_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 * as described above and return 0.
 */
static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			int indirect_blks, int *blks, ext3_fsblk_t goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext3_fsblk_t new_blocks[4];
	ext3_fsblk_t current_block;

	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		if (unlikely(!bh)) {
			err = -ENOMEM;
			goto failed;
		}
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext3_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if (n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the newly allocated
			 * data block numbers
			 */
			for (i = 1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < indirect_blks; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);

	ext3_free_blocks(handle, inode, new_blocks[i], num);

	return err;
}
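/*
 * Sketch of the "disconnected" state described above (illustrative):
 * after ext3_alloc_branch() returns 0 for a branch of depth 2,
 *
 *	*branch[0].p == 0                 <- the gap, filled by splice
 *	 branch[0].key == new indirect block number
 *	*branch[1].p == branch[1].key     <- already linked inside the
 *	                                     freshly zeroed indirect block
 *
 * so until ext3_splice_branch() stores branch[0].key into *branch[0].p,
 * truncate cannot reach any of the new blocks.
 */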
/**
 * ext3_splice_branch - splice the allocated branch onto inode.
 * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @where: location of missing link
 * @num: number of indirect blocks we are adding
 * @blks: number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.). In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext3_splice_branch(handle_t *handle, struct inode *inode,
			long block, Indirect *where, int num, int blks)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i;
	ext3_fsblk_t current_block;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct timespec now;

	block_i = ei->i_block_alloc_info;
	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the just-allocated
	 * direct blocks
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/*
	 * update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block + blks - 1;
		block_i->last_alloc_physical_block =
				le32_to_cpu(where[num].key) + blks - 1;
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */
	now = CURRENT_TIME_SEC;
	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
		inode->i_ctime = now;
		ext3_mark_inode_dirty(handle, inode);
	}
	/* ext3_mark_inode_dirty already updated i_sync_tid */
	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
		ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
	}
	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);

	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 */
int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
		sector_t iblock, unsigned long maxblocks,
		struct buffer_head *bh_result,
		int create)
{
	int err = -EIO;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext3_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	struct ext3_inode_info *ei = EXT3_I(inode);
	int count = 0;
	ext3_fsblk_t first_block = 0;


	trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
	J_ASSERT(handle != NULL || create == 0);
	depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext3_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/* map more blocks */
		while (count < maxblocks && count <= blocks_to_boundary) {
			ext3_fsblk_t blk;

			if (!verify_chain(chain, chain + depth - 1)) {
				/*
				 * Indirect block might be removed by
				 * truncate while we were reading it.
				 * Handling of that case: forget what we've
				 * got now. Flag the err as EAGAIN, so it
				 * will reread.
				 */
				err = -EAGAIN;
				count = 0;
				break;
			}
			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		if (err != -EAGAIN)
			goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	/*
	 * Block out ext3_truncate while we alter the tree
	 */
	mutex_lock(&ei->truncate_mutex);

	/*
	 * If the indirect block is missing while we are reading
	 * the chain (ext3_get_branch() returns the -EAGAIN err), or
	 * if the chain has been changed after we grab the semaphore
	 * (either because another process truncated this branch, or
	 * another get_block allocated this branch), re-grab the chain to
	 * see if the requested block has been allocated or not.
	 *
	 * Since we already block the truncate/other get_block
	 * at this point, we will have the current copy of the chain when we
	 * splice the branch into the tree.
	 */
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		while (partial > chain) {
			brelse(partial->bh);
			partial--;
		}
		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
		if (!partial) {
			count++;
			mutex_unlock(&ei->truncate_mutex);
			if (err)
				goto cleanup;
			clear_buffer_new(bh_result);
			goto got_it;
		}
	}

	/*
	 * Okay, we need to do block allocation.  Lazily initialize the block
	 * allocation info here if necessary
	 */
	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
		ext3_init_block_alloc_info(inode);

	goal = ext3_find_goal(inode, iblock, partial);

	/* the number of blocks we need to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

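	/*
	 * For instance (illustrative): for a depth-3 (double indirect)
	 * lookup where the double indirect block exists but the indirect
	 * block below it does not, ext3_get_branch() stops at
	 * partial == chain + 1, so indirect_blks ==
	 * (chain + 3) - (chain + 1) - 1 == 1: exactly one missing indirect
	 * block must be allocated in front of the data blocks.
	 */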
	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext3_blks_to_allocate(partial, indirect_blks,
					maxblocks, blocks_to_boundary);
	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext3_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case. --sct
	 */
	if (!err)
		err = ext3_splice_branch(handle, inode, iblock,
					 partial, indirect_blks, count);
	mutex_unlock(&ei->truncate_mutex);
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (count > blocks_to_boundary)
		set_buffer_boundary(bh_result);
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	trace_ext3_get_blocks_exit(inode, iblock,
				   depth ? le32_to_cpu(chain[depth-1].key) : 0,
				   count, err);
	return err;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
/*
 * Number of credits we need for writing DIO_MAX_BLOCKS:
 * We need sb + group descriptor + bitmap + inode -> 4
 * For B blocks with A block pointers per block we need:
 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
 */
#define DIO_CREDITS 25
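/*
 * Spelling the arithmetic above out (same numbers, just expanded):
 * with B = 4096 and A = 256, B/A/A = 0 and B/A = 16, so the credit
 * count is 4 + 1 + (0 + 2) + (16 + 2) = 25.
 */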
static int ext3_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create)
{
	handle_t *handle = ext3_journal_current_handle();
	int ret = 0, started = 0;
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

	if (create && !handle) {	/* Direct IO write... */
		if (max_blocks > DIO_MAX_BLOCKS)
			max_blocks = DIO_MAX_BLOCKS;
		handle = ext3_journal_start(inode, DIO_CREDITS +
				EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		started = 1;
	}

	ret = ext3_get_blocks_handle(handle, inode, iblock,
					max_blocks, bh_result, create);
	if (ret > 0) {
		bh_result->b_size = (ret << inode->i_blkbits);
		ret = 0;
	}
	if (started)
		ext3_journal_stop(handle);
out:
	return ret;
}

int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len,
				    ext3_get_block);
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
				long block, int create, int *errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	err = ext3_get_blocks_handle(handle, inode, block, 1,
					&dummy, create);
	/*
	 * ext3_get_blocks_handle() returns the number of blocks
	 * mapped. 0 in case of a HOLE.
	 */
	if (err > 0) {
		WARN_ON(err > 1);
		err = 0;
	}
	*errp = err;
	if (!err && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (unlikely(!bh)) {
			*errp = -ENOMEM;
			goto err;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != NULL);

			/*
			 * Now that we do not always journal data, we should
			 * keep in mind whether this should always journal the
			 * new buffer as metadata.  For now, regular file
			 * writes use ext3_get_block instead, so it's not a
			 * problem.
			 */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext3_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
err:
	return NULL;
}

struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
			       int block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext3_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (bh_uptodate_or_lock(bh))
		return bh;
	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ | REQ_META | REQ_PRIO, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
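/*
 * Usage note (illustrative): callers below iterate a locked page's
 * buffer ring with walk_page_buffers(), passing per-buffer callbacks
 * such as do_journal_get_write_access or journal_dirty_data_fn, e.g.
 *
 *	walk_page_buffers(handle, page_buffers(page), from, to,
 *			  NULL, do_journal_get_write_access);
 *
 * Buffers outside [from, to) are skipped (optionally noting partial
 * coverage via *partial), and the walk stops at the first callback
 * error and returns it.
 */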
/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext3_get_block()
 * and the commit_write().  So doing the journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext3_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext3_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext3 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
					struct buffer_head *bh)
{
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_prepare_write() could have dirtied some buffers. Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues. Setting of the dirty bit
	 * by __block_prepare_write() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	ret = ext3_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext3_journal_dirty_metadata(handle, bh);
	return ret;
}

/*
 * Truncate blocks that were not used by write. We have to truncate the
 * pagecache as well so that corresponding buffers get properly unmapped.
 */
static void ext3_truncate_failed_write(struct inode *inode)
{
	truncate_inode_pages(inode->i_mapping, inode->i_size);
	ext3_truncate(inode);
}

/*
 * Truncate blocks that were not used by direct IO write. We have to zero out
 * the last file block as well because direct IO might have written to it.
 */
static void ext3_truncate_failed_direct_write(struct inode *inode)
{
	ext3_block_truncate_page(inode, inode->i_size);
	ext3_truncate(inode);
}
static int ext3_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;
	/* Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason */
	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;

	trace_ext3_write_begin(inode, pos, len, flags);

	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

retry:
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	handle = ext3_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		unlock_page(page);
		page_cache_release(page);
		ret = PTR_ERR(handle);
		goto out;
	}
	ret = __block_write_begin(page, pos, len, ext3_get_block);
	if (ret)
		goto write_begin_failed;

	if (ext3_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}
write_begin_failed:
	if (ret) {
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 *
		 * Add inode to orphan list in case we crash before truncate
		 * finishes. Do this only if ext3_can_truncate() agrees so
		 * that orphan processing code is happy.
		 */
		if (pos + len > inode->i_size && ext3_can_truncate(inode))
			ext3_orphan_add(handle, inode);
		ext3_journal_stop(handle);
		unlock_page(page);
		page_cache_release(page);
		if (pos + len > inode->i_size)
			ext3_truncate_failed_write(inode);
	}
	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}


int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	int err = journal_dirty_data(handle, bh);
	if (err)
		ext3_journal_abort_handle(__func__, __func__,
						bh, handle, err);
	return err;
}

/* For ordered writepage and write_end functions */
static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	/*
	 * Write could have mapped the buffer but it didn't copy the data in
	 * yet. So avoid filing such buffer into a transaction.
	 */
	if (buffer_mapped(bh) && buffer_uptodate(bh))
		return ext3_journal_dirty_data(handle, bh);
	return 0;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext3_journal_dirty_metadata(handle, bh);
}

/*
 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
 * for the whole page but later we failed to copy the data in. Update inode
 * size according to what we managed to copy. The rest is going to be
 * truncated in write_end function.
 */
static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
{
	/* What matters to us is i_disksize. We don't write i_size anywhere */
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);
	if (pos + copied > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = pos + copied;
		mark_inode_dirty(inode);
	}
}
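/*
 * Worked example (hypothetical numbers): a write at pos = 4000 asked for
 * len = 200 bytes but only copied = 120 made it in. update_file_sizes()
 * advances i_size/i_disksize to 4000 + 120 = 4120; the blocks that were
 * allocated for the remaining 80 bytes beyond 4120 are trimmed by the
 * ext3_truncate_failed_write() calls in the write_end paths below.
 */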
/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext3 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */
static int ext3_ordered_write_end(struct file *file,
				struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = file->f_mapping->host;
	unsigned from, to;
	int ret = 0, ret2;

	trace_ext3_ordered_write_end(inode, pos, len, copied);
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + copied;
	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, journal_dirty_data_fn);

	if (ret == 0)
		update_file_sizes(inode, pos, copied);
	/*
	 * There may be allocated blocks outside of i_size because
	 * we failed to copy some data. Prepare for truncate.
	 */
	if (pos + len > inode->i_size && ext3_can_truncate(inode))
		ext3_orphan_add(handle, inode);
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		ext3_truncate_failed_write(inode);
	return ret ? ret : copied;
}

static int ext3_writeback_write_end(struct file *file,
				struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = file->f_mapping->host;
	int ret;

	trace_ext3_writeback_write_end(inode, pos, len, copied);
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	update_file_sizes(inode, pos, copied);
	/*
	 * There may be allocated blocks outside of i_size because
	 * we failed to copy some data. Prepare for truncate.
	 */
	if (pos + len > inode->i_size && ext3_can_truncate(inode))
		ext3_orphan_add(handle, inode);
	ret = ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		ext3_truncate_failed_write(inode);
	return ret ? ret : copied;
}

static int ext3_journalled_write_end(struct file *file,
				struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = mapping->host;
	struct ext3_inode_info *ei = EXT3_I(inode);
	int ret = 0, ret2;
	int partial = 0;
	unsigned from, to;

	trace_ext3_journalled_write_end(inode, pos, len, copied);
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	if (copied < len) {
		if (!PageUptodate(page))
			copied = 0;
		page_zero_new_buffers(page, from + copied, to);
		to = from + copied;
	}

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, write_end_fn);
	if (!partial)
		SetPageUptodate(page);

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);
	/*
	 * There may be allocated blocks outside of i_size because
	 * we failed to copy some data. Prepare for truncate.
	 */
	if (pos + len > inode->i_size && ext3_can_truncate(inode))
		ext3_orphan_add(handle, inode);
	ext3_set_inode_state(inode, EXT3_STATE_JDATA);
	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
	if (inode->i_size > ei->i_disksize) {
		ei->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}

	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		ext3_truncate_failed_write(inode);
	return ret ? ret : copied;
}
/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext3 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DoS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext3_get_block);
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
{
	return !buffer_mapped(bh);
}
/*
 * Note that we always start a transaction even if we're not journalling
 * data.  This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext3_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext3_writepage()
 *
 * Similar for:
 *
 *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext3_get_block().  We will deadlock on various things like
 * lock_journal and i_truncate_mutex.
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
 * we don't need to open a transaction here.
 */
static int ext3_ordered_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));
	/*
	 * We don't want to warn for emergency remount. The condition is
	 * ordered to avoid dereferencing inode->i_sb in non-error case to
	 * avoid slow-downs.
	 */
	WARN_ON_ONCE(IS_RDONLY(inode) &&
		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));

	/*
	 * We give up here if we're reentered, because it might be for a
	 * different filesystem.
	 */
	if (ext3_journal_current_handle())
		goto out_fail;

	trace_ext3_ordered_writepage(page);
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, inode->i_sb->s_blocksize,
				(1 << BH_Dirty)|(1 << BH_Uptodate));
		page_bufs = page_buffers(page);
	} else {
		page_bufs = page_buffers(page);
		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
				       NULL, buffer_unmapped)) {
			/* Provide NULL get_block() to catch bugs if buffers
			 * weren't really mapped */
			return block_write_full_page(page, NULL, wbc);
		}
	}
	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bget_one);

	ret = block_write_full_page(page, ext3_get_block, wbc);

	/*
	 * The page can become unlocked at any point now, and
	 * truncate can then come in and change things.  So we
	 * can't touch *page from now on.  But *page_bufs is
	 * safe due to elevated refcount.
	 */

	/*
	 * And attach them to the current transaction.  But only if
	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
	 * and generally junk.
	 */
	if (ret == 0) {
		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
					NULL, journal_dirty_data_fn);
		if (!ret)
			ret = err;
	}
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bput_one);
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}

static int ext3_writeback_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));
	/*
	 * We don't want to warn for emergency remount. The condition is
	 * ordered to avoid dereferencing inode->i_sb in non-error case to
	 * avoid slow-downs.
	 */
	WARN_ON_ONCE(IS_RDONLY(inode) &&
		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));

	if (ext3_journal_current_handle())
		goto out_fail;

	trace_ext3_writeback_writepage(page);
	if (page_has_buffers(page)) {
		if (!walk_page_buffers(NULL, page_buffers(page), 0,
				       PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
			/* Provide NULL get_block() to catch bugs if buffers
			 * weren't really mapped */
			return block_write_full_page(page, NULL, wbc);
		}
	}

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	ret = block_write_full_page(page, ext3_get_block, wbc);

	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}
We don't
		 * really know unless we go poke around in the buffer_heads.
		 * But block_write_full_page will do the right thing.
		 */
		ret = block_write_full_page(page, ext3_get_block, wbc);
	}
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
out:
	return ret;

no_write:
	redirty_page_for_writepage(wbc, page);
out_unlock:
	unlock_page(page);
	goto out;
}

static int ext3_readpage(struct file *file, struct page *page)
{
	trace_ext3_readpage(page);
	return mpage_readpage(page, ext3_get_block);
}

static int
ext3_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}

static void ext3_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	trace_ext3_invalidatepage(page, offset);

	/*
	 * If it's a full truncate we just forget about the pending dirtying
	 */
	if (offset == 0)
		ClearPageChecked(page);

	journal_invalidatepage(journal, page, offset);
}

static int ext3_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	trace_ext3_releasepage(page);
	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;
	return journal_try_to_free_buffers(journal, page, wait);
}

/*
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list. So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back to the buffered path in that case so we are safe.
 */
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext3_inode_info *ei = EXT3_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);
	int retries = 0;

	trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext3_journal_start(inode, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext3_orphan_add(handle, inode);
			if (ret) {
				ext3_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext3_journal_stop(handle);
		}
	}

retry:
	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
				 ext3_get_block);
	/*
	 * In case of error extending write may have instantiated a few
	 * blocks outside i_size. Trim these off again.
1903 */ 1904 if (unlikely((rw & WRITE) && ret < 0)) { 1905 loff_t isize = i_size_read(inode); 1906 loff_t end = offset + iov_length(iov, nr_segs); 1907 1908 if (end > isize) 1909 ext3_truncate_failed_direct_write(inode); 1910 } 1911 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1912 goto retry; 1913 1914 if (orphan) { 1915 int err; 1916 1917 /* Credits for sb + inode write */ 1918 handle = ext3_journal_start(inode, 2); 1919 if (IS_ERR(handle)) { 1920 /* This is really bad luck. We've written the data 1921 * but cannot extend i_size. Truncate allocated blocks 1922 * and pretend the write failed... */ 1923 ext3_truncate_failed_direct_write(inode); 1924 ret = PTR_ERR(handle); 1925 goto out; 1926 } 1927 if (inode->i_nlink) 1928 ext3_orphan_del(handle, inode); 1929 if (ret > 0) { 1930 loff_t end = offset + ret; 1931 if (end > inode->i_size) { 1932 ei->i_disksize = end; 1933 i_size_write(inode, end); 1934 /* 1935 * We're going to return a positive `ret' 1936 * here due to non-zero-length I/O, so there's 1937 * no way of reporting error returns from 1938 * ext3_mark_inode_dirty() to userspace. So 1939 * ignore it. 1940 */ 1941 ext3_mark_inode_dirty(handle, inode); 1942 } 1943 } 1944 err = ext3_journal_stop(handle); 1945 if (ret == 0) 1946 ret = err; 1947 } 1948out: 1949 trace_ext3_direct_IO_exit(inode, offset, 1950 iov_length(iov, nr_segs), rw, ret); 1951 return ret; 1952} 1953 1954/* 1955 * Pages can be marked dirty completely asynchronously from ext3's journalling 1956 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 1957 * much here because ->set_page_dirty is called under VFS locks. The page is 1958 * not necessarily locked. 1959 * 1960 * We cannot just dirty the page and leave attached buffers clean, because the 1961 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 1962 * or jbddirty because all the journalling code will explode. 1963 * 1964 * So what we do is to mark the page "pending dirty" and next time writepage 1965 * is called, propagate that into the buffers appropriately. 
1966 */ 1967static int ext3_journalled_set_page_dirty(struct page *page) 1968{ 1969 SetPageChecked(page); 1970 return __set_page_dirty_nobuffers(page); 1971} 1972 1973static const struct address_space_operations ext3_ordered_aops = { 1974 .readpage = ext3_readpage, 1975 .readpages = ext3_readpages, 1976 .writepage = ext3_ordered_writepage, 1977 .write_begin = ext3_write_begin, 1978 .write_end = ext3_ordered_write_end, 1979 .bmap = ext3_bmap, 1980 .invalidatepage = ext3_invalidatepage, 1981 .releasepage = ext3_releasepage, 1982 .direct_IO = ext3_direct_IO, 1983 .migratepage = buffer_migrate_page, 1984 .is_partially_uptodate = block_is_partially_uptodate, 1985 .error_remove_page = generic_error_remove_page, 1986}; 1987 1988static const struct address_space_operations ext3_writeback_aops = { 1989 .readpage = ext3_readpage, 1990 .readpages = ext3_readpages, 1991 .writepage = ext3_writeback_writepage, 1992 .write_begin = ext3_write_begin, 1993 .write_end = ext3_writeback_write_end, 1994 .bmap = ext3_bmap, 1995 .invalidatepage = ext3_invalidatepage, 1996 .releasepage = ext3_releasepage, 1997 .direct_IO = ext3_direct_IO, 1998 .migratepage = buffer_migrate_page, 1999 .is_partially_uptodate = block_is_partially_uptodate, 2000 .error_remove_page = generic_error_remove_page, 2001}; 2002 2003static const struct address_space_operations ext3_journalled_aops = { 2004 .readpage = ext3_readpage, 2005 .readpages = ext3_readpages, 2006 .writepage = ext3_journalled_writepage, 2007 .write_begin = ext3_write_begin, 2008 .write_end = ext3_journalled_write_end, 2009 .set_page_dirty = ext3_journalled_set_page_dirty, 2010 .bmap = ext3_bmap, 2011 .invalidatepage = ext3_invalidatepage, 2012 .releasepage = ext3_releasepage, 2013 .is_partially_uptodate = block_is_partially_uptodate, 2014 .error_remove_page = generic_error_remove_page, 2015}; 2016 2017void ext3_set_aops(struct inode *inode) 2018{ 2019 if (ext3_should_order_data(inode)) 2020 inode->i_mapping->a_ops = &ext3_ordered_aops; 2021 else if (ext3_should_writeback_data(inode)) 2022 inode->i_mapping->a_ops = &ext3_writeback_aops; 2023 else 2024 inode->i_mapping->a_ops = &ext3_journalled_aops; 2025} 2026 2027/* 2028 * ext3_block_truncate_page() zeroes out a mapping from file offset `from' 2029 * up to the end of the block which corresponds to `from'. 2030 * This required during truncate. We need to physically zero the tail end 2031 * of that block so it doesn't yield old data if the file is later grown. 
2032 */ 2033static int ext3_block_truncate_page(struct inode *inode, loff_t from) 2034{ 2035 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2036 unsigned offset = from & (PAGE_CACHE_SIZE - 1); 2037 unsigned blocksize, iblock, length, pos; 2038 struct page *page; 2039 handle_t *handle = NULL; 2040 struct buffer_head *bh; 2041 int err = 0; 2042 2043 /* Truncated on block boundary - nothing to do */ 2044 blocksize = inode->i_sb->s_blocksize; 2045 if ((from & (blocksize - 1)) == 0) 2046 return 0; 2047 2048 page = grab_cache_page(inode->i_mapping, index); 2049 if (!page) 2050 return -ENOMEM; 2051 length = blocksize - (offset & (blocksize - 1)); 2052 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2053 2054 if (!page_has_buffers(page)) 2055 create_empty_buffers(page, blocksize, 0); 2056 2057 /* Find the buffer that contains "offset" */ 2058 bh = page_buffers(page); 2059 pos = blocksize; 2060 while (offset >= pos) { 2061 bh = bh->b_this_page; 2062 iblock++; 2063 pos += blocksize; 2064 } 2065 2066 err = 0; 2067 if (buffer_freed(bh)) { 2068 BUFFER_TRACE(bh, "freed: skip"); 2069 goto unlock; 2070 } 2071 2072 if (!buffer_mapped(bh)) { 2073 BUFFER_TRACE(bh, "unmapped"); 2074 ext3_get_block(inode, iblock, bh, 0); 2075 /* unmapped? It's a hole - nothing to do */ 2076 if (!buffer_mapped(bh)) { 2077 BUFFER_TRACE(bh, "still unmapped"); 2078 goto unlock; 2079 } 2080 } 2081 2082 /* Ok, it's mapped. Make sure it's up-to-date */ 2083 if (PageUptodate(page)) 2084 set_buffer_uptodate(bh); 2085 2086 if (!bh_uptodate_or_lock(bh)) { 2087 err = bh_submit_read(bh); 2088 /* Uhhuh. Read error. Complain and punt. */ 2089 if (err) 2090 goto unlock; 2091 } 2092 2093 /* data=writeback mode doesn't need transaction to zero-out data */ 2094 if (!ext3_should_writeback_data(inode)) { 2095 /* We journal at most one block */ 2096 handle = ext3_journal_start(inode, 1); 2097 if (IS_ERR(handle)) { 2098 clear_highpage(page); 2099 flush_dcache_page(page); 2100 err = PTR_ERR(handle); 2101 goto unlock; 2102 } 2103 } 2104 2105 if (ext3_should_journal_data(inode)) { 2106 BUFFER_TRACE(bh, "get write access"); 2107 err = ext3_journal_get_write_access(handle, bh); 2108 if (err) 2109 goto stop; 2110 } 2111 2112 zero_user(page, offset, length); 2113 BUFFER_TRACE(bh, "zeroed end of block"); 2114 2115 err = 0; 2116 if (ext3_should_journal_data(inode)) { 2117 err = ext3_journal_dirty_metadata(handle, bh); 2118 } else { 2119 if (ext3_should_order_data(inode)) 2120 err = ext3_journal_dirty_data(handle, bh); 2121 mark_buffer_dirty(bh); 2122 } 2123stop: 2124 if (handle) 2125 ext3_journal_stop(handle); 2126 2127unlock: 2128 unlock_page(page); 2129 page_cache_release(page); 2130 return err; 2131} 2132 2133/* 2134 * Probably it should be a library function... search for first non-zero word 2135 * or memcmp with zero_page, whatever is better for particular architecture. 2136 * Linus? 2137 */ 2138static inline int all_zeroes(__le32 *p, __le32 *q) 2139{ 2140 while (p < q) 2141 if (*p++) 2142 return 0; 2143 return 1; 2144} 2145 2146/** 2147 * ext3_find_shared - find the indirect blocks for partial truncation. 2148 * @inode: inode in question 2149 * @depth: depth of the affected branch 2150 * @offsets: offsets of pointers in that branch (see ext3_block_to_path) 2151 * @chain: place to store the pointers to partial indirect blocks 2152 * @top: place to the (detached) top of branch 2153 * 2154 * This is a helper function used by ext3_truncate(). 
2155 * 2156 * When we do truncate() we may have to clean the ends of several 2157 * indirect blocks but leave the blocks themselves alive. Block is 2158 * partially truncated if some data below the new i_size is referred 2159 * from it (and it is on the path to the first completely truncated 2160 * data block, indeed). We have to free the top of that path along 2161 * with everything to the right of the path. Since no allocation 2162 * past the truncation point is possible until ext3_truncate() 2163 * finishes, we may safely do the latter, but top of branch may 2164 * require special attention - pageout below the truncation point 2165 * might try to populate it. 2166 * 2167 * We atomically detach the top of branch from the tree, store the 2168 * block number of its root in *@top, pointers to buffer_heads of 2169 * partially truncated blocks - in @chain[].bh and pointers to 2170 * their last elements that should not be removed - in 2171 * @chain[].p. Return value is the pointer to last filled element 2172 * of @chain. 2173 * 2174 * The work left to caller to do the actual freeing of subtrees: 2175 * a) free the subtree starting from *@top 2176 * b) free the subtrees whose roots are stored in 2177 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 2178 * c) free the subtrees growing from the inode past the @chain[0]. 2179 * (no partially truncated stuff there). */ 2180 2181static Indirect *ext3_find_shared(struct inode *inode, int depth, 2182 int offsets[4], Indirect chain[4], __le32 *top) 2183{ 2184 Indirect *partial, *p; 2185 int k, err; 2186 2187 *top = 0; 2188 /* Make k index the deepest non-null offset + 1 */ 2189 for (k = depth; k > 1 && !offsets[k-1]; k--) 2190 ; 2191 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2192 /* Writer: pointers */ 2193 if (!partial) 2194 partial = chain + k-1; 2195 /* 2196 * If the branch acquired continuation since we've looked at it - 2197 * fine, it should all survive and (new) top doesn't belong to us. 2198 */ 2199 if (!partial->key && *partial->p) 2200 /* Writer: end */ 2201 goto no_top; 2202 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 2203 ; 2204 /* 2205 * OK, we've found the last block that must survive. The rest of our 2206 * branch should be detached before unlocking. However, if that rest 2207 * of branch is all ours and does not grow immediately from the inode 2208 * it's easier to cheat and just decrement partial->p. 2209 */ 2210 if (p == chain + k - 1 && p > chain) { 2211 p->p--; 2212 } else { 2213 *top = *p->p; 2214 /* Nope, don't do this in ext3. Must leave the tree intact */ 2215#if 0 2216 *p->p = 0; 2217#endif 2218 } 2219 /* Writer: end */ 2220 2221 while(partial > p) { 2222 brelse(partial->bh); 2223 partial--; 2224 } 2225no_top: 2226 return partial; 2227} 2228 2229/* 2230 * Zero a number of block pointers in either an inode or an indirect block. 2231 * If we restart the transaction we must again get write access to the 2232 * indirect block for further modification. 2233 * 2234 * We release `count' blocks on disk, but (last - first) may be greater 2235 * than `count' because there can be holes in there. 
2236 */ 2237static void ext3_clear_blocks(handle_t *handle, struct inode *inode, 2238 struct buffer_head *bh, ext3_fsblk_t block_to_free, 2239 unsigned long count, __le32 *first, __le32 *last) 2240{ 2241 __le32 *p; 2242 if (try_to_extend_transaction(handle, inode)) { 2243 if (bh) { 2244 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2245 if (ext3_journal_dirty_metadata(handle, bh)) 2246 return; 2247 } 2248 ext3_mark_inode_dirty(handle, inode); 2249 truncate_restart_transaction(handle, inode); 2250 if (bh) { 2251 BUFFER_TRACE(bh, "retaking write access"); 2252 if (ext3_journal_get_write_access(handle, bh)) 2253 return; 2254 } 2255 } 2256 2257 /* 2258 * Any buffers which are on the journal will be in memory. We find 2259 * them on the hash table so journal_revoke() will run journal_forget() 2260 * on them. We've already detached each block from the file, so 2261 * bforget() in journal_forget() should be safe. 2262 * 2263 * AKPM: turn on bforget in journal_forget()!!! 2264 */ 2265 for (p = first; p < last; p++) { 2266 u32 nr = le32_to_cpu(*p); 2267 if (nr) { 2268 struct buffer_head *bh; 2269 2270 *p = 0; 2271 bh = sb_find_get_block(inode->i_sb, nr); 2272 ext3_forget(handle, 0, inode, bh, nr); 2273 } 2274 } 2275 2276 ext3_free_blocks(handle, inode, block_to_free, count); 2277} 2278 2279/** 2280 * ext3_free_data - free a list of data blocks 2281 * @handle: handle for this transaction 2282 * @inode: inode we are dealing with 2283 * @this_bh: indirect buffer_head which contains *@first and *@last 2284 * @first: array of block numbers 2285 * @last: points immediately past the end of array 2286 * 2287 * We are freeing all blocks referred from that array (numbers are stored as 2288 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2289 * 2290 * We accumulate contiguous runs of blocks to free. Conveniently, if these 2291 * blocks are contiguous then releasing them at one time will only affect one 2292 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 2293 * actually use a lot of journal space. 2294 * 2295 * @this_bh will be %NULL if @first and @last point into the inode's direct 2296 * block pointers. 2297 */ 2298static void ext3_free_data(handle_t *handle, struct inode *inode, 2299 struct buffer_head *this_bh, 2300 __le32 *first, __le32 *last) 2301{ 2302 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ 2303 unsigned long count = 0; /* Number of blocks in the run */ 2304 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 2305 corresponding to 2306 block_to_free */ 2307 ext3_fsblk_t nr; /* Current block # */ 2308 __le32 *p; /* Pointer into inode/ind 2309 for current block */ 2310 int err; 2311 2312 if (this_bh) { /* For indirect block */ 2313 BUFFER_TRACE(this_bh, "get_write_access"); 2314 err = ext3_journal_get_write_access(handle, this_bh); 2315 /* Important: if we can't update the indirect pointers 2316 * to the blocks, we can't free them. 
 */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				ext3_clear_blocks(handle, inode, this_bh,
						  block_to_free,
						  count, block_to_free_p, p);
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	if (count > 0)
		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				  count, block_to_free_p, p);

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");

		/*
		 * The buffer head should have an attached journal head at this
		 * point. However, if the data is corrupted and an indirect
		 * block pointed to itself, it would have been detached when
		 * the block was cleared. Check for this instead of OOPSing.
		 */
		if (bh2jh(this_bh))
			ext3_journal_dirty_metadata(handle, this_bh);
		else
			ext3_error(inode->i_sb, "ext3_free_data",
				   "circular indirect block detected, "
				   "inode=%lu, block=%llu",
				   inode->i_ino,
				   (unsigned long long)this_bh->b_blocknr);
	}
}

/**
 * ext3_free_branches - free an array of branches
 * @handle: JBD handle for this transaction
 * @inode: inode we are dealing with
 * @parent_bh: the buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: pointer immediately past the end of array
 * @depth: depth of the branches to free
 *
 * We are freeing all blocks referred from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 */
static void ext3_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext3_fsblk_t nr;
	__le32 *p;

	if (is_handle_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 */
			if (!bh) {
				ext3_error(inode->i_sb, "ext3_free_branches",
					   "Read failure, inode=%lu, block="E3FSBLK,
					   inode->i_ino, nr);
				continue;
			}

			/* This zaps the entire block. Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext3_free_branches(handle, inode, bh,
					   (__le32*)bh->b_data,
					   (__le32*)bh->b_data + addr_per_block,
					   depth);

			/*
			 * Everything below this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
2434 */ 2435 if (is_handle_aborted(handle)) 2436 return; 2437 if (try_to_extend_transaction(handle, inode)) { 2438 ext3_mark_inode_dirty(handle, inode); 2439 truncate_restart_transaction(handle, inode); 2440 } 2441 2442 /* 2443 * We've probably journalled the indirect block several 2444 * times during the truncate. But it's no longer 2445 * needed and we now drop it from the transaction via 2446 * journal_revoke(). 2447 * 2448 * That's easy if it's exclusively part of this 2449 * transaction. But if it's part of the committing 2450 * transaction then journal_forget() will simply 2451 * brelse() it. That means that if the underlying 2452 * block is reallocated in ext3_get_block(), 2453 * unmap_underlying_metadata() will find this block 2454 * and will try to get rid of it. damn, damn. Thus 2455 * we don't allow a block to be reallocated until 2456 * a transaction freeing it has fully committed. 2457 * 2458 * We also have to make sure journal replay after a 2459 * crash does not overwrite non-journaled data blocks 2460 * with old metadata when the block got reallocated for 2461 * data. Thus we have to store a revoke record for a 2462 * block in the same transaction in which we free the 2463 * block. 2464 */ 2465 ext3_forget(handle, 1, inode, bh, bh->b_blocknr); 2466 2467 ext3_free_blocks(handle, inode, nr, 1); 2468 2469 if (parent_bh) { 2470 /* 2471 * The block which we have just freed is 2472 * pointed to by an indirect block: journal it 2473 */ 2474 BUFFER_TRACE(parent_bh, "get_write_access"); 2475 if (!ext3_journal_get_write_access(handle, 2476 parent_bh)){ 2477 *p = 0; 2478 BUFFER_TRACE(parent_bh, 2479 "call ext3_journal_dirty_metadata"); 2480 ext3_journal_dirty_metadata(handle, 2481 parent_bh); 2482 } 2483 } 2484 } 2485 } else { 2486 /* We have reached the bottom of the tree. */ 2487 BUFFER_TRACE(parent_bh, "free data blocks"); 2488 ext3_free_data(handle, inode, parent_bh, first, last); 2489 } 2490} 2491 2492int ext3_can_truncate(struct inode *inode) 2493{ 2494 if (S_ISREG(inode->i_mode)) 2495 return 1; 2496 if (S_ISDIR(inode->i_mode)) 2497 return 1; 2498 if (S_ISLNK(inode->i_mode)) 2499 return !ext3_inode_is_fast_symlink(inode); 2500 return 0; 2501} 2502 2503/* 2504 * ext3_truncate() 2505 * 2506 * We block out ext3_get_block() block instantiations across the entire 2507 * transaction, and VFS/VM ensures that ext3_truncate() cannot run 2508 * simultaneously on behalf of the same inode. 2509 * 2510 * As we work through the truncate and commit bits of it to the journal there 2511 * is one core, guiding principle: the file's tree must always be consistent on 2512 * disk. We must be able to restart the truncate after a crash. 2513 * 2514 * The file's tree may be transiently inconsistent in memory (although it 2515 * probably isn't), but whenever we close off and commit a journal transaction, 2516 * the contents of (the filesystem + the journal) must be consistent and 2517 * restartable. It's pretty simple, really: bottom up, right to left (although 2518 * left-to-right works OK too). 2519 * 2520 * Note that at recovery time, journal replay occurs *before* the restart of 2521 * truncate against the orphan inode list. 2522 * 2523 * The committed inode has the new, desired i_size (which is the same as 2524 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see 2525 * that this inode's truncate did not complete and it will again call 2526 * ext3_truncate() to have another go. 
So there will be instantiated blocks 2527 * to the right of the truncation point in a crashed ext3 filesystem. But 2528 * that's fine - as long as they are linked from the inode, the post-crash 2529 * ext3_truncate() run will find them and release them. 2530 */ 2531void ext3_truncate(struct inode *inode) 2532{ 2533 handle_t *handle; 2534 struct ext3_inode_info *ei = EXT3_I(inode); 2535 __le32 *i_data = ei->i_data; 2536 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2537 int offsets[4]; 2538 Indirect chain[4]; 2539 Indirect *partial; 2540 __le32 nr = 0; 2541 int n; 2542 long last_block; 2543 unsigned blocksize = inode->i_sb->s_blocksize; 2544 2545 trace_ext3_truncate_enter(inode); 2546 2547 if (!ext3_can_truncate(inode)) 2548 goto out_notrans; 2549 2550 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2551 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); 2552 2553 handle = start_transaction(inode); 2554 if (IS_ERR(handle)) 2555 goto out_notrans; 2556 2557 last_block = (inode->i_size + blocksize-1) 2558 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); 2559 n = ext3_block_to_path(inode, last_block, offsets, NULL); 2560 if (n == 0) 2561 goto out_stop; /* error */ 2562 2563 /* 2564 * OK. This truncate is going to happen. We add the inode to the 2565 * orphan list, so that if this truncate spans multiple transactions, 2566 * and we crash, we will resume the truncate when the filesystem 2567 * recovers. It also marks the inode dirty, to catch the new size. 2568 * 2569 * Implication: the file must always be in a sane, consistent 2570 * truncatable state while each transaction commits. 2571 */ 2572 if (ext3_orphan_add(handle, inode)) 2573 goto out_stop; 2574 2575 /* 2576 * The orphan list entry will now protect us from any crash which 2577 * occurs before the truncate completes, so it is now safe to propagate 2578 * the new, shorter inode size (held for now in i_size) into the 2579 * on-disk inode. We do this via i_disksize, which is the value which 2580 * ext3 *really* writes onto the disk inode. 2581 */ 2582 ei->i_disksize = inode->i_size; 2583 2584 /* 2585 * From here we block out all ext3_get_block() callers who want to 2586 * modify the block allocation tree. 2587 */ 2588 mutex_lock(&ei->truncate_mutex); 2589 2590 if (n == 1) { /* direct blocks */ 2591 ext3_free_data(handle, inode, NULL, i_data+offsets[0], 2592 i_data + EXT3_NDIR_BLOCKS); 2593 goto do_indirects; 2594 } 2595 2596 partial = ext3_find_shared(inode, n, offsets, chain, &nr); 2597 /* Kill the top of shared branch (not detached) */ 2598 if (nr) { 2599 if (partial == chain) { 2600 /* Shared branch grows from the inode */ 2601 ext3_free_branches(handle, inode, NULL, 2602 &nr, &nr+1, (chain+n-1) - partial); 2603 *partial->p = 0; 2604 /* 2605 * We mark the inode dirty prior to restart, 2606 * and prior to stop. No need for it here. 
2607 */ 2608 } else { 2609 /* Shared branch grows from an indirect block */ 2610 ext3_free_branches(handle, inode, partial->bh, 2611 partial->p, 2612 partial->p+1, (chain+n-1) - partial); 2613 } 2614 } 2615 /* Clear the ends of indirect blocks on the shared branch */ 2616 while (partial > chain) { 2617 ext3_free_branches(handle, inode, partial->bh, partial->p + 1, 2618 (__le32*)partial->bh->b_data+addr_per_block, 2619 (chain+n-1) - partial); 2620 BUFFER_TRACE(partial->bh, "call brelse"); 2621 brelse (partial->bh); 2622 partial--; 2623 } 2624do_indirects: 2625 /* Kill the remaining (whole) subtrees */ 2626 switch (offsets[0]) { 2627 default: 2628 nr = i_data[EXT3_IND_BLOCK]; 2629 if (nr) { 2630 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 2631 i_data[EXT3_IND_BLOCK] = 0; 2632 } 2633 case EXT3_IND_BLOCK: 2634 nr = i_data[EXT3_DIND_BLOCK]; 2635 if (nr) { 2636 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 2637 i_data[EXT3_DIND_BLOCK] = 0; 2638 } 2639 case EXT3_DIND_BLOCK: 2640 nr = i_data[EXT3_TIND_BLOCK]; 2641 if (nr) { 2642 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 2643 i_data[EXT3_TIND_BLOCK] = 0; 2644 } 2645 case EXT3_TIND_BLOCK: 2646 ; 2647 } 2648 2649 ext3_discard_reservation(inode); 2650 2651 mutex_unlock(&ei->truncate_mutex); 2652 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 2653 ext3_mark_inode_dirty(handle, inode); 2654 2655 /* 2656 * In a multi-transaction truncate, we only make the final transaction 2657 * synchronous 2658 */ 2659 if (IS_SYNC(inode)) 2660 handle->h_sync = 1; 2661out_stop: 2662 /* 2663 * If this was a simple ftruncate(), and the file will remain alive 2664 * then we need to clear up the orphan record which we created above. 2665 * However, if this was a real unlink then we were called by 2666 * ext3_evict_inode(), and we allow that function to clean up the 2667 * orphan info for us. 2668 */ 2669 if (inode->i_nlink) 2670 ext3_orphan_del(handle, inode); 2671 2672 ext3_journal_stop(handle); 2673 trace_ext3_truncate_exit(inode); 2674 return; 2675out_notrans: 2676 /* 2677 * Delete the inode from orphan list so that it doesn't stay there 2678 * forever and trigger assertion on umount. 2679 */ 2680 if (inode->i_nlink) 2681 ext3_orphan_del(NULL, inode); 2682 trace_ext3_truncate_exit(inode); 2683} 2684 2685static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, 2686 unsigned long ino, struct ext3_iloc *iloc) 2687{ 2688 unsigned long block_group; 2689 unsigned long offset; 2690 ext3_fsblk_t block; 2691 struct ext3_group_desc *gdp; 2692 2693 if (!ext3_valid_inum(sb, ino)) { 2694 /* 2695 * This error is already checked for in namei.c unless we are 2696 * looking at an NFS filehandle, in which case no error 2697 * report is needed 2698 */ 2699 return 0; 2700 } 2701 2702 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); 2703 gdp = ext3_get_group_desc(sb, block_group, NULL); 2704 if (!gdp) 2705 return 0; 2706 /* 2707 * Figure out the offset within the block group inode table 2708 */ 2709 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * 2710 EXT3_INODE_SIZE(sb); 2711 block = le32_to_cpu(gdp->bg_inode_table) + 2712 (offset >> EXT3_BLOCK_SIZE_BITS(sb)); 2713 2714 iloc->block_group = block_group; 2715 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); 2716 return block; 2717} 2718 2719/* 2720 * ext3_get_inode_loc returns with an extra refcount against the inode's 2721 * underlying buffer_head on success. If 'in_mem' is true, we have all 2722 * data in memory that is needed to recreate the on-disk version of this 2723 * inode. 
2724 */ 2725static int __ext3_get_inode_loc(struct inode *inode, 2726 struct ext3_iloc *iloc, int in_mem) 2727{ 2728 ext3_fsblk_t block; 2729 struct buffer_head *bh; 2730 2731 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); 2732 if (!block) 2733 return -EIO; 2734 2735 bh = sb_getblk(inode->i_sb, block); 2736 if (unlikely(!bh)) { 2737 ext3_error (inode->i_sb, "ext3_get_inode_loc", 2738 "unable to read inode block - " 2739 "inode=%lu, block="E3FSBLK, 2740 inode->i_ino, block); 2741 return -ENOMEM; 2742 } 2743 if (!buffer_uptodate(bh)) { 2744 lock_buffer(bh); 2745 2746 /* 2747 * If the buffer has the write error flag, we have failed 2748 * to write out another inode in the same block. In this 2749 * case, we don't have to read the block because we may 2750 * read the old inode data successfully. 2751 */ 2752 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 2753 set_buffer_uptodate(bh); 2754 2755 if (buffer_uptodate(bh)) { 2756 /* someone brought it uptodate while we waited */ 2757 unlock_buffer(bh); 2758 goto has_buffer; 2759 } 2760 2761 /* 2762 * If we have all information of the inode in memory and this 2763 * is the only valid inode in the block, we need not read the 2764 * block. 2765 */ 2766 if (in_mem) { 2767 struct buffer_head *bitmap_bh; 2768 struct ext3_group_desc *desc; 2769 int inodes_per_buffer; 2770 int inode_offset, i; 2771 int block_group; 2772 int start; 2773 2774 block_group = (inode->i_ino - 1) / 2775 EXT3_INODES_PER_GROUP(inode->i_sb); 2776 inodes_per_buffer = bh->b_size / 2777 EXT3_INODE_SIZE(inode->i_sb); 2778 inode_offset = ((inode->i_ino - 1) % 2779 EXT3_INODES_PER_GROUP(inode->i_sb)); 2780 start = inode_offset & ~(inodes_per_buffer - 1); 2781 2782 /* Is the inode bitmap in cache? */ 2783 desc = ext3_get_group_desc(inode->i_sb, 2784 block_group, NULL); 2785 if (!desc) 2786 goto make_io; 2787 2788 bitmap_bh = sb_getblk(inode->i_sb, 2789 le32_to_cpu(desc->bg_inode_bitmap)); 2790 if (unlikely(!bitmap_bh)) 2791 goto make_io; 2792 2793 /* 2794 * If the inode bitmap isn't in cache then the 2795 * optimisation may end up performing two reads instead 2796 * of one, so skip it. 2797 */ 2798 if (!buffer_uptodate(bitmap_bh)) { 2799 brelse(bitmap_bh); 2800 goto make_io; 2801 } 2802 for (i = start; i < start + inodes_per_buffer; i++) { 2803 if (i == inode_offset) 2804 continue; 2805 if (ext3_test_bit(i, bitmap_bh->b_data)) 2806 break; 2807 } 2808 brelse(bitmap_bh); 2809 if (i == start + inodes_per_buffer) { 2810 /* all other inodes are free, so skip I/O */ 2811 memset(bh->b_data, 0, bh->b_size); 2812 set_buffer_uptodate(bh); 2813 unlock_buffer(bh); 2814 goto has_buffer; 2815 } 2816 } 2817 2818make_io: 2819 /* 2820 * There are other valid inodes in the buffer, this inode 2821 * has in-inode xattrs, or we don't have this inode in memory. 2822 * Read the block from disk. 2823 */ 2824 trace_ext3_load_inode(inode); 2825 get_bh(bh); 2826 bh->b_end_io = end_buffer_read_sync; 2827 submit_bh(READ | REQ_META | REQ_PRIO, bh); 2828 wait_on_buffer(bh); 2829 if (!buffer_uptodate(bh)) { 2830 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2831 "unable to read inode block - " 2832 "inode=%lu, block="E3FSBLK, 2833 inode->i_ino, block); 2834 brelse(bh); 2835 return -EIO; 2836 } 2837 } 2838has_buffer: 2839 iloc->bh = bh; 2840 return 0; 2841} 2842 2843int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) 2844{ 2845 /* We have all inode data except xattrs in memory here. 
*/ 2846 return __ext3_get_inode_loc(inode, iloc, 2847 !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); 2848} 2849 2850void ext3_set_inode_flags(struct inode *inode) 2851{ 2852 unsigned int flags = EXT3_I(inode)->i_flags; 2853 2854 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 2855 if (flags & EXT3_SYNC_FL) 2856 inode->i_flags |= S_SYNC; 2857 if (flags & EXT3_APPEND_FL) 2858 inode->i_flags |= S_APPEND; 2859 if (flags & EXT3_IMMUTABLE_FL) 2860 inode->i_flags |= S_IMMUTABLE; 2861 if (flags & EXT3_NOATIME_FL) 2862 inode->i_flags |= S_NOATIME; 2863 if (flags & EXT3_DIRSYNC_FL) 2864 inode->i_flags |= S_DIRSYNC; 2865} 2866 2867/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ 2868void ext3_get_inode_flags(struct ext3_inode_info *ei) 2869{ 2870 unsigned int flags = ei->vfs_inode.i_flags; 2871 2872 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| 2873 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); 2874 if (flags & S_SYNC) 2875 ei->i_flags |= EXT3_SYNC_FL; 2876 if (flags & S_APPEND) 2877 ei->i_flags |= EXT3_APPEND_FL; 2878 if (flags & S_IMMUTABLE) 2879 ei->i_flags |= EXT3_IMMUTABLE_FL; 2880 if (flags & S_NOATIME) 2881 ei->i_flags |= EXT3_NOATIME_FL; 2882 if (flags & S_DIRSYNC) 2883 ei->i_flags |= EXT3_DIRSYNC_FL; 2884} 2885 2886struct inode *ext3_iget(struct super_block *sb, unsigned long ino) 2887{ 2888 struct ext3_iloc iloc; 2889 struct ext3_inode *raw_inode; 2890 struct ext3_inode_info *ei; 2891 struct buffer_head *bh; 2892 struct inode *inode; 2893 journal_t *journal = EXT3_SB(sb)->s_journal; 2894 transaction_t *transaction; 2895 long ret; 2896 int block; 2897 uid_t i_uid; 2898 gid_t i_gid; 2899 2900 inode = iget_locked(sb, ino); 2901 if (!inode) 2902 return ERR_PTR(-ENOMEM); 2903 if (!(inode->i_state & I_NEW)) 2904 return inode; 2905 2906 ei = EXT3_I(inode); 2907 ei->i_block_alloc_info = NULL; 2908 2909 ret = __ext3_get_inode_loc(inode, &iloc, 0); 2910 if (ret < 0) 2911 goto bad_inode; 2912 bh = iloc.bh; 2913 raw_inode = ext3_raw_inode(&iloc); 2914 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 2915 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 2916 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 2917 if(!(test_opt (inode->i_sb, NO_UID32))) { 2918 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2919 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2920 } 2921 i_uid_write(inode, i_uid); 2922 i_gid_write(inode, i_gid); 2923 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 2924 inode->i_size = le32_to_cpu(raw_inode->i_size); 2925 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 2926 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 2927 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2928 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2929 2930 ei->i_state_flags = 0; 2931 ei->i_dir_start_lookup = 0; 2932 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2933 /* We now have enough fields to check if the inode was active or not. 
2934 * This is needed because nfsd might try to access dead inodes 2935 * the test is that same one that e2fsck uses 2936 * NeilBrown 1999oct15 2937 */ 2938 if (inode->i_nlink == 0) { 2939 if (inode->i_mode == 0 || 2940 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { 2941 /* this inode is deleted */ 2942 brelse (bh); 2943 ret = -ESTALE; 2944 goto bad_inode; 2945 } 2946 /* The only unlinked inodes we let through here have 2947 * valid i_mode and are being read by the orphan 2948 * recovery code: that's fine, we're about to complete 2949 * the process of deleting those. */ 2950 } 2951 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 2952 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2953#ifdef EXT3_FRAGMENTS 2954 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 2955 ei->i_frag_no = raw_inode->i_frag; 2956 ei->i_frag_size = raw_inode->i_fsize; 2957#endif 2958 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2959 if (!S_ISREG(inode->i_mode)) { 2960 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 2961 } else { 2962 inode->i_size |= 2963 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 2964 } 2965 ei->i_disksize = inode->i_size; 2966 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2967 ei->i_block_group = iloc.block_group; 2968 /* 2969 * NOTE! The in-memory inode i_data array is in little-endian order 2970 * even on big-endian machines: we do NOT byteswap the block numbers! 2971 */ 2972 for (block = 0; block < EXT3_N_BLOCKS; block++) 2973 ei->i_data[block] = raw_inode->i_block[block]; 2974 INIT_LIST_HEAD(&ei->i_orphan); 2975 2976 /* 2977 * Set transaction id's of transactions that have to be committed 2978 * to finish f[data]sync. We set them to currently running transaction 2979 * as we cannot be sure that the inode or some of its metadata isn't 2980 * part of the transaction - the inode could have been reclaimed and 2981 * now it is reread from disk. 2982 */ 2983 if (journal) { 2984 tid_t tid; 2985 2986 spin_lock(&journal->j_state_lock); 2987 if (journal->j_running_transaction) 2988 transaction = journal->j_running_transaction; 2989 else 2990 transaction = journal->j_committing_transaction; 2991 if (transaction) 2992 tid = transaction->t_tid; 2993 else 2994 tid = journal->j_commit_sequence; 2995 spin_unlock(&journal->j_state_lock); 2996 atomic_set(&ei->i_sync_tid, tid); 2997 atomic_set(&ei->i_datasync_tid, tid); 2998 } 2999 3000 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 3001 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { 3002 /* 3003 * When mke2fs creates big inodes it does not zero out 3004 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, 3005 * so ignore those first few inodes. 3006 */ 3007 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 3008 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 3009 EXT3_INODE_SIZE(inode->i_sb)) { 3010 brelse (bh); 3011 ret = -EIO; 3012 goto bad_inode; 3013 } 3014 if (ei->i_extra_isize == 0) { 3015 /* The extra space is currently unused. Use it. 
*/
			ei->i_extra_isize = sizeof(struct ext3_inode) -
					    EXT3_GOOD_OLD_INODE_SIZE;
		} else {
			__le32 *magic = (void *)raw_inode +
					EXT3_GOOD_OLD_INODE_SIZE +
					ei->i_extra_isize;
			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				ext3_set_inode_state(inode, EXT3_STATE_XATTR);
		}
	} else
		ei->i_extra_isize = 0;

	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		if (ext3_inode_is_fast_symlink(inode)) {
			inode->i_op = &ext3_fast_symlink_inode_operations;
			nd_terminate_link(ei->i_data, inode->i_size,
				sizeof(ei->i_data) - 1);
		} else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		inode->i_op = &ext3_special_inode_operations;
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	brelse(iloc.bh);
	ext3_set_inode_flags(inode);
	unlock_new_inode(inode);
	return inode;

bad_inode:
	iget_failed(inode);
	return ERR_PTR(ret);
}

/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 */
static int ext3_do_update_inode(handle_t *handle,
				struct inode *inode,
				struct ext3_iloc *iloc)
{
	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh = iloc->bh;
	int err = 0, rc, block;
	int need_datasync = 0;
	__le32 disksize;
	uid_t i_uid;
	gid_t i_gid;

again:
	/* we can't allow multiple procs in here at once, it's a bit racy */
	lock_buffer(bh);

	/* For fields not tracked in the in-memory inode,
	 * initialise them to zero for new inodes. */
	if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);

	ext3_get_inode_flags(ei);
	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	i_uid = i_uid_read(inode);
	i_gid = i_gid_read(inode);
	if (!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
 * Fix up interoperability with old kernels.
Otherwise, old inodes get 3101 * re-used with the upper 16 bits of the uid/gid intact 3102 */ 3103 if(!ei->i_dtime) { 3104 raw_inode->i_uid_high = 3105 cpu_to_le16(high_16_bits(i_uid)); 3106 raw_inode->i_gid_high = 3107 cpu_to_le16(high_16_bits(i_gid)); 3108 } else { 3109 raw_inode->i_uid_high = 0; 3110 raw_inode->i_gid_high = 0; 3111 } 3112 } else { 3113 raw_inode->i_uid_low = 3114 cpu_to_le16(fs_high2lowuid(i_uid)); 3115 raw_inode->i_gid_low = 3116 cpu_to_le16(fs_high2lowgid(i_gid)); 3117 raw_inode->i_uid_high = 0; 3118 raw_inode->i_gid_high = 0; 3119 } 3120 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 3121 disksize = cpu_to_le32(ei->i_disksize); 3122 if (disksize != raw_inode->i_size) { 3123 need_datasync = 1; 3124 raw_inode->i_size = disksize; 3125 } 3126 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 3127 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 3128 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 3129 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 3130 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 3131 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 3132#ifdef EXT3_FRAGMENTS 3133 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 3134 raw_inode->i_frag = ei->i_frag_no; 3135 raw_inode->i_fsize = ei->i_frag_size; 3136#endif 3137 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 3138 if (!S_ISREG(inode->i_mode)) { 3139 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 3140 } else { 3141 disksize = cpu_to_le32(ei->i_disksize >> 32); 3142 if (disksize != raw_inode->i_size_high) { 3143 raw_inode->i_size_high = disksize; 3144 need_datasync = 1; 3145 } 3146 if (ei->i_disksize > 0x7fffffffULL) { 3147 struct super_block *sb = inode->i_sb; 3148 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 3149 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || 3150 EXT3_SB(sb)->s_es->s_rev_level == 3151 cpu_to_le32(EXT3_GOOD_OLD_REV)) { 3152 /* If this is the first large file 3153 * created, add a flag to the superblock. 
3154 */ 3155 unlock_buffer(bh); 3156 err = ext3_journal_get_write_access(handle, 3157 EXT3_SB(sb)->s_sbh); 3158 if (err) 3159 goto out_brelse; 3160 3161 ext3_update_dynamic_rev(sb); 3162 EXT3_SET_RO_COMPAT_FEATURE(sb, 3163 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 3164 handle->h_sync = 1; 3165 err = ext3_journal_dirty_metadata(handle, 3166 EXT3_SB(sb)->s_sbh); 3167 /* get our lock and start over */ 3168 goto again; 3169 } 3170 } 3171 } 3172 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3173 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 3174 if (old_valid_dev(inode->i_rdev)) { 3175 raw_inode->i_block[0] = 3176 cpu_to_le32(old_encode_dev(inode->i_rdev)); 3177 raw_inode->i_block[1] = 0; 3178 } else { 3179 raw_inode->i_block[0] = 0; 3180 raw_inode->i_block[1] = 3181 cpu_to_le32(new_encode_dev(inode->i_rdev)); 3182 raw_inode->i_block[2] = 0; 3183 } 3184 } else for (block = 0; block < EXT3_N_BLOCKS; block++) 3185 raw_inode->i_block[block] = ei->i_data[block]; 3186 3187 if (ei->i_extra_isize) 3188 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3189 3190 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3191 unlock_buffer(bh); 3192 rc = ext3_journal_dirty_metadata(handle, bh); 3193 if (!err) 3194 err = rc; 3195 ext3_clear_inode_state(inode, EXT3_STATE_NEW); 3196 3197 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3198 if (need_datasync) 3199 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); 3200out_brelse: 3201 brelse (bh); 3202 ext3_std_error(inode->i_sb, err); 3203 return err; 3204} 3205 3206/* 3207 * ext3_write_inode() 3208 * 3209 * We are called from a few places: 3210 * 3211 * - Within generic_file_write() for O_SYNC files. 3212 * Here, there will be no transaction running. We wait for any running 3213 * transaction to commit. 3214 * 3215 * - Within sys_sync(), kupdate and such. 3216 * We wait on commit, if tol to. 3217 * 3218 * - Within prune_icache() (PF_MEMALLOC == true) 3219 * Here we simply return. We can't afford to block kswapd on the 3220 * journal commit. 3221 * 3222 * In all cases it is actually safe for us to return without doing anything, 3223 * because the inode has been copied into a raw inode buffer in 3224 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3225 * knfsd. 3226 * 3227 * Note that we are absolutely dependent upon all inode dirtiers doing the 3228 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3229 * which we are interested. 3230 * 3231 * It would be a bug for them to not do this. The code: 3232 * 3233 * mark_inode_dirty(inode) 3234 * stuff(); 3235 * inode->i_size = expr; 3236 * 3237 * is in error because a kswapd-driven write_inode() could occur while 3238 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3239 * will no longer be on the superblock's dirty inode list. 3240 */ 3241int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3242{ 3243 if (current->flags & PF_MEMALLOC) 3244 return 0; 3245 3246 if (ext3_journal_current_handle()) { 3247 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 3248 dump_stack(); 3249 return -EIO; 3250 } 3251 3252 if (wbc->sync_mode != WB_SYNC_ALL) 3253 return 0; 3254 3255 return ext3_force_commit(inode->i_sb); 3256} 3257 3258/* 3259 * ext3_setattr() 3260 * 3261 * Called from notify_change. 3262 * 3263 * We want to trap VFS attempts to truncate the file as soon as 3264 * possible. 
In particular, we want to make sure that when the VFS 3265 * shrinks i_size, we put the inode on the orphan list and modify 3266 * i_disksize immediately, so that during the subsequent flushing of 3267 * dirty pages and freeing of disk blocks, we can guarantee that any 3268 * commit will leave the blocks being flushed in an unused state on 3269 * disk. (On recovery, the inode will get truncated and the blocks will 3270 * be freed, so we have a strong guarantee that no future commit will 3271 * leave these blocks visible to the user.) 3272 * 3273 * Called with inode->sem down. 3274 */ 3275int ext3_setattr(struct dentry *dentry, struct iattr *attr) 3276{ 3277 struct inode *inode = dentry->d_inode; 3278 int error, rc = 0; 3279 const unsigned int ia_valid = attr->ia_valid; 3280 3281 error = inode_change_ok(inode, attr); 3282 if (error) 3283 return error; 3284 3285 if (is_quota_modification(inode, attr)) 3286 dquot_initialize(inode); 3287 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || 3288 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { 3289 handle_t *handle; 3290 3291 /* (user+group)*(old+new) structure, inode write (sb, 3292 * inode block, ? - but truncate inode update has it) */ 3293 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 3294 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); 3295 if (IS_ERR(handle)) { 3296 error = PTR_ERR(handle); 3297 goto err_out; 3298 } 3299 error = dquot_transfer(inode, attr); 3300 if (error) { 3301 ext3_journal_stop(handle); 3302 return error; 3303 } 3304 /* Update corresponding info in inode so that everything is in 3305 * one transaction */ 3306 if (attr->ia_valid & ATTR_UID) 3307 inode->i_uid = attr->ia_uid; 3308 if (attr->ia_valid & ATTR_GID) 3309 inode->i_gid = attr->ia_gid; 3310 error = ext3_mark_inode_dirty(handle, inode); 3311 ext3_journal_stop(handle); 3312 } 3313 3314 if (attr->ia_valid & ATTR_SIZE) 3315 inode_dio_wait(inode); 3316 3317 if (S_ISREG(inode->i_mode) && 3318 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3319 handle_t *handle; 3320 3321 handle = ext3_journal_start(inode, 3); 3322 if (IS_ERR(handle)) { 3323 error = PTR_ERR(handle); 3324 goto err_out; 3325 } 3326 3327 error = ext3_orphan_add(handle, inode); 3328 if (error) { 3329 ext3_journal_stop(handle); 3330 goto err_out; 3331 } 3332 EXT3_I(inode)->i_disksize = attr->ia_size; 3333 error = ext3_mark_inode_dirty(handle, inode); 3334 ext3_journal_stop(handle); 3335 if (error) { 3336 /* Some hard fs error must have happened. Bail out. */ 3337 ext3_orphan_del(NULL, inode); 3338 goto err_out; 3339 } 3340 rc = ext3_block_truncate_page(inode, attr->ia_size); 3341 if (rc) { 3342 /* Cleanup orphan list and exit */ 3343 handle = ext3_journal_start(inode, 3); 3344 if (IS_ERR(handle)) { 3345 ext3_orphan_del(NULL, inode); 3346 goto err_out; 3347 } 3348 ext3_orphan_del(handle, inode); 3349 ext3_journal_stop(handle); 3350 goto err_out; 3351 } 3352 } 3353 3354 if ((attr->ia_valid & ATTR_SIZE) && 3355 attr->ia_size != i_size_read(inode)) { 3356 truncate_setsize(inode, attr->ia_size); 3357 ext3_truncate(inode); 3358 } 3359 3360 setattr_copy(inode, attr); 3361 mark_inode_dirty(inode); 3362 3363 if (ia_valid & ATTR_MODE) 3364 rc = ext3_acl_chmod(inode); 3365 3366err_out: 3367 ext3_std_error(inode->i_sb, error); 3368 if (!error) 3369 error = rc; 3370 return error; 3371} 3372 3373 3374/* 3375 * How many blocks doth make a writepage()? 
3376 * 3377 * With N blocks per page, it may be: 3378 * N data blocks 3379 * 2 indirect block 3380 * 2 dindirect 3381 * 1 tindirect 3382 * N+5 bitmap blocks (from the above) 3383 * N+5 group descriptor summary blocks 3384 * 1 inode block 3385 * 1 superblock. 3386 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files 3387 * 3388 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS 3389 * 3390 * With ordered or writeback data it's the same, less the N data blocks. 3391 * 3392 * If the inode's direct blocks can hold an integral number of pages then a 3393 * page cannot straddle two indirect blocks, and we can only touch one indirect 3394 * and dindirect block, and the "5" above becomes "3". 3395 * 3396 * This still overestimates under most circumstances. If we were to pass the 3397 * start and end offsets in here as well we could do block_to_path() on each 3398 * block and work out the exact number of indirects which are touched. Pah. 3399 */ 3400 3401static int ext3_writepage_trans_blocks(struct inode *inode) 3402{ 3403 int bpp = ext3_journal_blocks_per_page(inode); 3404 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; 3405 int ret; 3406 3407 if (ext3_should_journal_data(inode)) 3408 ret = 3 * (bpp + indirects) + 2; 3409 else 3410 ret = 2 * (bpp + indirects) + indirects + 2; 3411 3412#ifdef CONFIG_QUOTA 3413 /* We know that structure was already allocated during dquot_initialize so 3414 * we will be updating only the data blocks + inodes */ 3415 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 3416#endif 3417 3418 return ret; 3419} 3420 3421/* 3422 * The caller must have previously called ext3_reserve_inode_write(). 3423 * Give this, we know that the caller already has write access to iloc->bh. 3424 */ 3425int ext3_mark_iloc_dirty(handle_t *handle, 3426 struct inode *inode, struct ext3_iloc *iloc) 3427{ 3428 int err = 0; 3429 3430 /* the do_update_inode consumes one bh->b_count */ 3431 get_bh(iloc->bh); 3432 3433 /* ext3_do_update_inode() does journal_dirty_metadata */ 3434 err = ext3_do_update_inode(handle, inode, iloc); 3435 put_bh(iloc->bh); 3436 return err; 3437} 3438 3439/* 3440 * On success, We end up with an outstanding reference count against 3441 * iloc->bh. This _must_ be cleaned up later. 3442 */ 3443 3444int 3445ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 3446 struct ext3_iloc *iloc) 3447{ 3448 int err = 0; 3449 if (handle) { 3450 err = ext3_get_inode_loc(inode, iloc); 3451 if (!err) { 3452 BUFFER_TRACE(iloc->bh, "get_write_access"); 3453 err = ext3_journal_get_write_access(handle, iloc->bh); 3454 if (err) { 3455 brelse(iloc->bh); 3456 iloc->bh = NULL; 3457 } 3458 } 3459 } 3460 ext3_std_error(inode->i_sb, err); 3461 return err; 3462} 3463 3464/* 3465 * What we do here is to mark the in-core inode as clean with respect to inode 3466 * dirtiness (it may still be data-dirty). 3467 * This means that the in-core inode may be reaped by prune_icache 3468 * without having to perform any I/O. This is a very good thing, 3469 * because *any* task may call prune_icache - even ones which 3470 * have a transaction open against a different journal. 3471 * 3472 * Is this cheating? Not really. Sure, we haven't written the 3473 * inode out, but prune_icache isn't a user-visible syncing function. 3474 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3475 * we start and wait on commits. 
3476 */ 3477int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) 3478{ 3479 struct ext3_iloc iloc; 3480 int err; 3481 3482 might_sleep(); 3483 trace_ext3_mark_inode_dirty(inode, _RET_IP_); 3484 err = ext3_reserve_inode_write(handle, inode, &iloc); 3485 if (!err) 3486 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3487 return err; 3488} 3489 3490/* 3491 * ext3_dirty_inode() is called from __mark_inode_dirty() 3492 * 3493 * We're really interested in the case where a file is being extended. 3494 * i_size has been changed by generic_commit_write() and we thus need 3495 * to include the updated inode in the current transaction. 3496 * 3497 * Also, dquot_alloc_space() will always dirty the inode when blocks 3498 * are allocated to the file. 3499 * 3500 * If the inode is marked synchronous, we don't honour that here - doing 3501 * so would cause a commit on atime updates, which we don't bother doing. 3502 * We handle synchronous inodes at the highest possible level. 3503 */ 3504void ext3_dirty_inode(struct inode *inode, int flags) 3505{ 3506 handle_t *current_handle = ext3_journal_current_handle(); 3507 handle_t *handle; 3508 3509 handle = ext3_journal_start(inode, 2); 3510 if (IS_ERR(handle)) 3511 goto out; 3512 if (current_handle && 3513 current_handle->h_transaction != handle->h_transaction) { 3514 /* This task has a transaction open against a different fs */ 3515 printk(KERN_EMERG "%s: transactions do not match!\n", 3516 __func__); 3517 } else { 3518 jbd_debug(5, "marking dirty. outer handle=%p\n", 3519 current_handle); 3520 ext3_mark_inode_dirty(handle, inode); 3521 } 3522 ext3_journal_stop(handle); 3523out: 3524 return; 3525} 3526 3527#if 0 3528/* 3529 * Bind an inode's backing buffer_head into this transaction, to prevent 3530 * it from being flushed to disk early. Unlike 3531 * ext3_reserve_inode_write, this leaves behind no bh reference and 3532 * returns no iloc structure, so the caller needs to repeat the iloc 3533 * lookup to mark the inode dirty later. 3534 */ 3535static int ext3_pin_inode(handle_t *handle, struct inode *inode) 3536{ 3537 struct ext3_iloc iloc; 3538 3539 int err = 0; 3540 if (handle) { 3541 err = ext3_get_inode_loc(inode, &iloc); 3542 if (!err) { 3543 BUFFER_TRACE(iloc.bh, "get_write_access"); 3544 err = journal_get_write_access(handle, iloc.bh); 3545 if (!err) 3546 err = ext3_journal_dirty_metadata(handle, 3547 iloc.bh); 3548 brelse(iloc.bh); 3549 } 3550 } 3551 ext3_std_error(inode->i_sb, err); 3552 return err; 3553} 3554#endif 3555 3556int ext3_change_inode_journal_flag(struct inode *inode, int val) 3557{ 3558 journal_t *journal; 3559 handle_t *handle; 3560 int err; 3561 3562 /* 3563 * We have to be very careful here: changing a data block's 3564 * journaling status dynamically is dangerous. If we write a 3565 * data block to the journal, change the status and then delete 3566 * that block, we risk forgetting to revoke the old log record 3567 * from the journal and so a subsequent replay can corrupt data. 3568 * So, first we make sure that the journal is empty and that 3569 * nobody is changing anything. 3570 */ 3571 3572 journal = EXT3_JOURNAL(inode); 3573 if (is_journal_aborted(journal)) 3574 return -EROFS; 3575 3576 journal_lock_updates(journal); 3577 journal_flush(journal); 3578 3579 /* 3580 * OK, there are no updates running now, and all cached data is 3581 * synced to disk. 
We are now in a completely consistent state 3582 * which doesn't have anything in the journal, and we know that 3583 * no filesystem updates are running, so it is safe to modify 3584 * the inode's in-core data-journaling state flag now. 3585 */ 3586 3587 if (val) 3588 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; 3589 else 3590 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; 3591 ext3_set_aops(inode); 3592 3593 journal_unlock_updates(journal); 3594 3595 /* Finally we can mark the inode as dirty. */ 3596 3597 handle = ext3_journal_start(inode, 1); 3598 if (IS_ERR(handle)) 3599 return PTR_ERR(handle); 3600 3601 err = ext3_mark_inode_dirty(handle, inode); 3602 handle->h_sync = 1; 3603 ext3_journal_stop(handle); 3604 ext3_std_error(inode->i_sb, err); 3605 3606 return err; 3607}
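
#if 0
/*
 * Illustrative sketch (a hypothetical helper, not part of this file): the
 * per-inode data-journaling toggle above is how chattr +j takes effect; the
 * EXT3_IOC_SETFLAGS ioctl path invokes it roughly like this when
 * EXT3_JOURNAL_DATA_FL changes.
 */
static int ext3_example_toggle_journal_data(struct inode *inode, int on)
{
	/* Flushes the journal, swaps in the matching aops, commits inode. */
	return ext3_change_inode_journal_flag(inode, on);
}
#endif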