Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6:
  jbd: fix race between write_metadata_buffer and get_write_access
  ext3: Get rid of extenddisksize parameter of ext3_get_blocks_handle()
  jbd: Fix a race between checkpointing code and journal_get_write_access()
  ext3: Fix truncation of symlinks after failed write
  jbd: Fail to load a journal if it is too short

+67 -64
+1 -2
fs/ext3/dir.c
@@ -130,8 +130,7 @@
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
-					     &map_bh, 0, 0);
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
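(Worked example of the index calculation above, assuming the common
4 KiB page / 1 KiB block configuration: PAGE_CACHE_SHIFT is 12 and
inode->i_blkbits is 10, so the shift is by 2 and logical block 25 lands
in page index 25 >> 2 = 6, i.e. four blocks per page.)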
+13 -19
fs/ext3/inode.c
@@ -788,7 +788,7 @@
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		sector_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+		int create)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -911,13 +911,6 @@
 	if (!err)
 		err = ext3_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by truncate_mutex. Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext3_get_block() -bzzz
-	 */
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
 	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
@@ -965,7 +972,7 @@
 	}
 
 	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -998,7 +1005,7 @@
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create);
 	/*
 	 * ext3_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
@@ -1186,15 +1193,16 @@
 		 * i_size_read because we hold i_mutex.
 		 *
 		 * Add inode to orphan list in case we crash before truncate
-		 * finishes.
+		 * finishes. Do this only if ext3_can_truncate() agrees so
+		 * that orphan processing code is happy.
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext3_can_truncate(inode))
 			ext3_orphan_add(handle, inode);
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext3_truncate(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1281,7 +1287,7 @@
 	 * There may be allocated blocks outside of i_size because
	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
@@ -1290,6 +1296,6 @@
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
@@ -1309,13 +1315,13 @@
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
@@ -1352,7 +1358,7 @@
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1369,7 +1375,7 @@
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
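All four write-path tails touched above share one pattern; the following
is an illustrative condensation, not verbatim kernel code:

	/* A short copy may have left allocated blocks beyond i_size. */
	if (pos + len > inode->i_size && ext3_can_truncate(inode))
		/* Orphan-list the inode only when it is actually
		 * truncatable, so that orphan processing after a crash
		 * does not trip over it. */
		ext3_orphan_add(handle, inode);
	ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);
	if (pos + len > inode->i_size)
		ext3_truncate(inode);	/* was vmtruncate() */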
+17 -9
fs/jbd/journal.c
@@ -287,6 +287,7 @@
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
+	journal_t *journal = transaction->t_journal;
 
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
@@ -301,6 +300,11 @@
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
 	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	/* keep subsequent assertions sane */
+	new_bh->b_state = 0;
+	init_buffer(new_bh, NULL, NULL);
+	atomic_set(&new_bh->b_count, 1);
+	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
 
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
@@ -367,14 +361,6 @@
 		kunmap_atomic(mapped_data, KM_USER0);
 	}
 
-	/* keep subsequent assertions sane */
-	new_bh->b_state = 0;
-	init_buffer(new_bh, NULL, NULL);
-	atomic_set(&new_bh->b_count, 1);
-	jbd_unlock_bh_state(bh_in);
-
-	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
-
 	set_bh_page(new_bh, new_page, new_offset);
 	new_jh->b_transaction = NULL;
 	new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -383,6 +385,10 @@
 	 * copying is moved to the transaction's shadow queue.
 	 */
 	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-	journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_lock(&journal->j_list_lock);
+	__journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh_in);
+
 	JBUFFER_TRACE(new_jh, "file as BJ_IO");
 	journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -850,6 +848,12 @@
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
 
 	journal->j_first = first;
 	journal->j_last = last;
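Two independent fixes land in this file. First, the copy-out buffer is
now fully initialised and given its journal_head before the data copy,
and bh_in's state lock is held until jh_in has been filed as BJ_Shadow
(under j_list_lock, via __journal_file_buffer()), closing the race with
journal_get_write_access(). Second, journal load now rejects a journal
that is too short: the check is equivalent to requiring
last - first + 1 >= JFS_MIN_JOURNAL_BLOCKS (1024 in jbd), so e.g. with
s_first == 1 the load fails cleanly unless s_maxlen is at least 1024,
instead of oopsing later on a corrupted journal. A minimal sketch of
the equivalent formulation:

	/* Equivalent formulation of the added check: the journal must
	 * provide at least JFS_MIN_JOURNAL_BLOCKS usable blocks. */
	unsigned long usable = last - first + 1;

	if (usable < JFS_MIN_JOURNAL_BLOCKS)
		return -EINVAL;	/* after journal_fail_superblock() */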
+35 -33
fs/jbd/transaction.c
@@ -489,34 +489,15 @@
 	wake_up(&journal->j_wait_transaction_locked);
 }
 
-/*
- * Report any unexpected dirty buffers which turn up. Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible. #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+static void warn_dirty_buffer(struct buffer_head *bh)
 {
-	int jlist;
+	char b[BDEVNAME_SIZE];
 
-	/* If this buffer is one which might reasonably be dirty
-	 * --- ie. data, or not part of this journal --- then
-	 * we're OK to leave it alone, but otherwise we need to
-	 * move the dirty bit to the journal's own internal
-	 * JBDDirty bit. */
-	jlist = jh->b_jlist;
-
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		struct buffer_head *bh = jh2bh(jh);
-
-		if (test_clear_buffer_dirty(bh))
-			set_buffer_jbddirty(bh);
-	}
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
 /*
@@ -564,14 +583,16 @@
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
+			warn_dirty_buffer(bh);
 		}
 		/*
 		 * In any case we need to clean the dirty flag and we must
 		 * do it under the buffer lock to be sure we don't race
 		 * with running write-out.
 		 */
-		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-		jbd_unexpected_dirty_buffer(jh);
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
 	}
 
 	unlock_buffer(bh);
@@ -809,6 +826,15 @@
 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
 
 	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
 		jh->b_transaction = transaction;
 
 		/* first access by this transaction */
@@ -1774,8 +1782,13 @@
 
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
 		__journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
@@ -2038,12 +2041,17 @@
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
 
-	/* The following list of buffer states needs to be consistent
-	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-	 * state. */
-
 	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
 			was_dirty = 1;
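The net effect, per the new comments: metadata under journal control
tracks its dirtiness exclusively in the JBDDirty bit. A sketch of the
discipline (names from the diffs above; annotations are illustrative):

	/*
	 * do_get_write_access:    clear_buffer_dirty(bh);
	 *                         set_buffer_jbddirty(bh);   journal owns it
	 * commit finished:        buffer filed for checkpointing, redirtied
	 * create access:          clear_buffer_dirty(bh);    the freeing
	 *                         transaction must have committed
	 * __journal_file_buffer:  buffer_dirty() here means someone (e.g.
	 *                         tune2fs on a mounted fs) wrote behind the
	 *                         journal's back; warn loudly, since such
	 *                         writes risk corruption after a crash
	 */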
+1 -1
include/linux/ext3_fs.h
@@ -874,7 +874,7 @@
 struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
-	int create, int extend_disksize);
+	int create);
 
 extern struct inode *ext3_iget(struct super_block *, unsigned long);
 extern int ext3_write_inode (struct inode *, int);