fs: protect inode->i_state with inode->i_lock

Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.
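
For illustration, the lock nesting this establishes looks roughly like the
following minimal sketch (the function name is invented, and it assumes the
usual struct inode / inode_lock declarations from the fs headers):

  /*
   * i_state is now only modified under inode->i_lock, which nests
   * inside inode_lock for as long as inode_lock still exists.
   */
  static void example_set_state_bit(struct inode *inode, unsigned long bit)
  {
          spin_lock(&inode_lock);
          spin_lock(&inode->i_lock);
          inode->i_state |= bit;
          spin_unlock(&inode->i_lock);
          spin_unlock(&inode_lock);
  }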

This requires that __iget() is done atomically with i_state checks
during list traversals so that we don't race with another thread
marking the inode I_FREEING between the state check and grabbing the
reference.
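
The converted walk pattern looks like this sketch (modelled on the
drop_caches, quota and fs-writeback hunks below; the wrapper function is
invented for illustration):

  static void example_walk_sb_inodes(struct super_block *sb)
  {
          struct inode *inode;

          spin_lock(&inode_lock);
          list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                  spin_lock(&inode->i_lock);
                  if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                          spin_unlock(&inode->i_lock);
                          continue;
                  }
                  /* state check and reference grab under one i_lock hold */
                  __iget(inode);
                  spin_unlock(&inode->i_lock);
                  /* now safe to drop inode_lock and use the pinned inode */
          }
          spin_unlock(&inode_lock);
  }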

Also remove the unlock_new_inode() memory barrier optimisation that
was required to avoid taking the inode_lock when clearing I_NEW.
Simplify the code by taking the inode->i_lock around the state change
and wakeup. Because the wakeup is no longer tricky, remove the
wake_up_inode() function and open code the wakeup where necessary.
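
The I_NEW clearing in unlock_new_inode() (see the fs/inode.c hunk below)
thus reduces to:

          spin_lock(&inode->i_lock);
          WARN_ON(!(inode->i_state & I_NEW));
          inode->i_state &= ~I_NEW;
          wake_up_bit(&inode->i_state, __I_NEW);
          spin_unlock(&inode->i_lock);

Taking i_lock around both the flag clear and the wakeup is what makes the
open-coded smp_mb() unnecessary.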

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

+176 -76
+2
fs/block_dev.c
··· 56 struct backing_dev_info *dst) 57 { 58 spin_lock(&inode_lock); 59 inode->i_data.backing_dev_info = dst; 60 if (inode->i_state & I_DIRTY) 61 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 62 spin_unlock(&inode_lock); 63 } 64
··· 56 struct backing_dev_info *dst) 57 { 58 spin_lock(&inode_lock); 59 + spin_lock(&inode->i_lock); 60 inode->i_data.backing_dev_info = dst; 61 if (inode->i_state & I_DIRTY) 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 63 + spin_unlock(&inode->i_lock); 64 spin_unlock(&inode_lock); 65 } 66
+1 -1
fs/buffer.c
··· 1144 * inode list. 1145 * 1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1147 - * mapping->tree_lock and the global inode_lock. 1148 */ 1149 void mark_buffer_dirty(struct buffer_head *bh) 1150 {
··· 1144 * inode list. 1145 * 1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1147 + * mapping->tree_lock and mapping->host->i_lock. 1148 */ 1149 void mark_buffer_dirty(struct buffer_head *bh) 1150 {
+6 -3
fs/drop_caches.c
··· 18 19 spin_lock(&inode_lock); 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 22 continue; 23 - if (inode->i_mapping->nrpages == 0) 24 - continue; 25 __iget(inode); 26 spin_unlock(&inode_lock); 27 invalidate_mapping_pages(inode->i_mapping, 0, -1); 28 iput(toput_inode);
··· 18 19 spin_lock(&inode_lock); 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 + spin_lock(&inode->i_lock); 22 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 23 + (inode->i_mapping->nrpages == 0)) { 24 + spin_unlock(&inode->i_lock); 25 continue; 26 + } 27 __iget(inode); 28 + spin_unlock(&inode->i_lock); 29 spin_unlock(&inode_lock); 30 invalidate_mapping_pages(inode->i_mapping, 0, -1); 31 iput(toput_inode);
+34 -10
fs/fs-writeback.c
··· 306 wait_queue_head_t *wqh; 307 308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 309 - while (inode->i_state & I_SYNC) { 310 spin_unlock(&inode_lock); 311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 312 spin_lock(&inode_lock); 313 } 314 } 315 ··· 335 unsigned dirty; 336 int ret; 337 338 if (!atomic_read(&inode->i_count)) 339 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 340 else ··· 351 * completed a full scan of b_io. 352 */ 353 if (wbc->sync_mode != WB_SYNC_ALL) { 354 requeue_io(inode); 355 return 0; 356 } ··· 367 /* Set I_SYNC, reset I_DIRTY_PAGES */ 368 inode->i_state |= I_SYNC; 369 inode->i_state &= ~I_DIRTY_PAGES; 370 spin_unlock(&inode_lock); 371 372 ret = do_writepages(mapping, wbc); ··· 389 * write_inode() 390 */ 391 spin_lock(&inode_lock); 392 dirty = inode->i_state & I_DIRTY; 393 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 394 spin_unlock(&inode_lock); 395 /* Don't write the inode if only I_DIRTY_PAGES was set */ 396 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { ··· 402 } 403 404 spin_lock(&inode_lock); 405 inode->i_state &= ~I_SYNC; 406 if (!(inode->i_state & I_FREEING)) { 407 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ··· 444 } 445 } 446 inode_sync_complete(inode); 447 return ret; 448 } 449 ··· 515 * kind does not need peridic writeout yet, and for the latter 516 * kind writeout is handled by the freer. 517 */ 518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 519 requeue_io(inode); 520 continue; 521 } ··· 526 * Was this inode dirtied after sync_sb_inodes was called? 527 * This keeps sync from extra jobs and livelock. 528 */ 529 - if (inode_dirtied_after(inode, wbc->wb_start)) 530 return 1; 531 532 __iget(inode); 533 pages_skipped = wbc->pages_skipped; 534 writeback_single_inode(inode, wbc); 535 if (wbc->pages_skipped != pages_skipped) { ··· 739 if (!list_empty(&wb->b_more_io)) { 740 inode = wb_inode(wb->b_more_io.prev); 741 trace_wbc_writeback_wait(&wbc, wb->bdi); 742 inode_wait_for_writeback(inode); 743 } 744 spin_unlock(&inode_lock); 745 } ··· 1034 block_dump___mark_inode_dirty(inode); 1035 1036 spin_lock(&inode_lock); 1037 if ((inode->i_state & flags) != flags) { 1038 const int was_dirty = inode->i_state & I_DIRTY; 1039 ··· 1046 * superblock list, based upon its state. 1047 */ 1048 if (inode->i_state & I_SYNC) 1049 - goto out; 1050 1051 /* 1052 * Only add valid (hashed) inodes to the superblock's ··· 1054 */ 1055 if (!S_ISBLK(inode->i_mode)) { 1056 if (inode_unhashed(inode)) 1057 - goto out; 1058 } 1059 if (inode->i_state & I_FREEING) 1060 - goto out; 1061 1062 /* 1063 * If the inode was already on b_dirty/b_io/b_more_io, don't 1064 * reposition it (that would break b_dirty time-ordering). ··· 1084 inode->dirtied_when = jiffies; 1085 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1086 } 1087 } 1088 out: 1089 spin_unlock(&inode_lock); 1090 ··· 1133 * we still have to wait for that writeout. 1134 */ 1135 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1136 - struct address_space *mapping; 1137 1138 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1139 continue; 1140 - mapping = inode->i_mapping; 1141 - if (mapping->nrpages == 0) 1142 - continue; 1143 __iget(inode); 1144 spin_unlock(&inode_lock); 1145 /* 1146 * We hold a reference to 'inode' so it couldn't have
··· 306 wait_queue_head_t *wqh; 307 308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 309 + while (inode->i_state & I_SYNC) { 310 + spin_unlock(&inode->i_lock); 311 spin_unlock(&inode_lock); 312 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 313 spin_lock(&inode_lock); 314 + spin_lock(&inode->i_lock); 315 } 316 } 317 ··· 333 unsigned dirty; 334 int ret; 335 336 + spin_lock(&inode->i_lock); 337 if (!atomic_read(&inode->i_count)) 338 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 339 else ··· 348 * completed a full scan of b_io. 349 */ 350 if (wbc->sync_mode != WB_SYNC_ALL) { 351 + spin_unlock(&inode->i_lock); 352 requeue_io(inode); 353 return 0; 354 } ··· 363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 364 inode->i_state |= I_SYNC; 365 inode->i_state &= ~I_DIRTY_PAGES; 366 + spin_unlock(&inode->i_lock); 367 spin_unlock(&inode_lock); 368 369 ret = do_writepages(mapping, wbc); ··· 384 * write_inode() 385 */ 386 spin_lock(&inode_lock); 387 + spin_lock(&inode->i_lock); 388 dirty = inode->i_state & I_DIRTY; 389 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 390 + spin_unlock(&inode->i_lock); 391 spin_unlock(&inode_lock); 392 /* Don't write the inode if only I_DIRTY_PAGES was set */ 393 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { ··· 395 } 396 397 spin_lock(&inode_lock); 398 + spin_lock(&inode->i_lock); 399 inode->i_state &= ~I_SYNC; 400 if (!(inode->i_state & I_FREEING)) { 401 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ··· 436 } 437 } 438 inode_sync_complete(inode); 439 + spin_unlock(&inode->i_lock); 440 return ret; 441 } 442 ··· 506 * kind does not need peridic writeout yet, and for the latter 507 * kind writeout is handled by the freer. 508 */ 509 + spin_lock(&inode->i_lock); 510 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 511 + spin_unlock(&inode->i_lock); 512 requeue_io(inode); 513 continue; 514 } ··· 515 * Was this inode dirtied after sync_sb_inodes was called? 516 * This keeps sync from extra jobs and livelock. 517 */ 518 + if (inode_dirtied_after(inode, wbc->wb_start)) { 519 + spin_unlock(&inode->i_lock); 520 return 1; 521 + } 522 523 __iget(inode); 524 + spin_unlock(&inode->i_lock); 525 + 526 pages_skipped = wbc->pages_skipped; 527 writeback_single_inode(inode, wbc); 528 if (wbc->pages_skipped != pages_skipped) { ··· 724 if (!list_empty(&wb->b_more_io)) { 725 inode = wb_inode(wb->b_more_io.prev); 726 trace_wbc_writeback_wait(&wbc, wb->bdi); 727 + spin_lock(&inode->i_lock); 728 inode_wait_for_writeback(inode); 729 + spin_unlock(&inode->i_lock); 730 } 731 spin_unlock(&inode_lock); 732 } ··· 1017 block_dump___mark_inode_dirty(inode); 1018 1019 spin_lock(&inode_lock); 1020 + spin_lock(&inode->i_lock); 1021 if ((inode->i_state & flags) != flags) { 1022 const int was_dirty = inode->i_state & I_DIRTY; 1023 ··· 1028 * superblock list, based upon its state. 1029 */ 1030 if (inode->i_state & I_SYNC) 1031 + goto out_unlock_inode; 1032 1033 /* 1034 * Only add valid (hashed) inodes to the superblock's ··· 1036 */ 1037 if (!S_ISBLK(inode->i_mode)) { 1038 if (inode_unhashed(inode)) 1039 + goto out_unlock_inode; 1040 } 1041 if (inode->i_state & I_FREEING) 1042 + goto out_unlock_inode; 1043 1044 + spin_unlock(&inode->i_lock); 1045 /* 1046 * If the inode was already on b_dirty/b_io/b_more_io, don't 1047 * reposition it (that would break b_dirty time-ordering). 
··· 1065 inode->dirtied_when = jiffies; 1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1067 } 1068 + goto out; 1069 } 1070 + out_unlock_inode: 1071 + spin_unlock(&inode->i_lock); 1072 out: 1073 spin_unlock(&inode_lock); 1074 ··· 1111 * we still have to wait for that writeout. 1112 */ 1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1114 + struct address_space *mapping = inode->i_mapping; 1115 1116 + spin_lock(&inode->i_lock); 1117 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 1118 + (mapping->nrpages == 0)) { 1119 + spin_unlock(&inode->i_lock); 1120 continue; 1121 + } 1122 __iget(inode); 1123 + spin_unlock(&inode->i_lock); 1124 spin_unlock(&inode_lock); 1125 /* 1126 * We hold a reference to 'inode' so it couldn't have
+106 -48
fs/inode.c
··· 28 #include <linux/cred.h> 29 30 /* 31 * This is needed for the following functions: 32 * - inode_has_buffers 33 * - invalidate_bdev ··· 147 return proc_dointvec(table, write, buffer, lenp, ppos); 148 } 149 #endif 150 - 151 - static void wake_up_inode(struct inode *inode) 152 - { 153 - /* 154 - * Prevent speculative execution through spin_unlock(&inode_lock); 155 - */ 156 - smp_mb(); 157 - wake_up_bit(&inode->i_state, __I_NEW); 158 - } 159 160 /** 161 * inode_init_always - perform inode structure intialisation ··· 338 } 339 340 /* 341 - * inode_lock must be held 342 */ 343 void __iget(struct inode *inode) 344 { ··· 415 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 416 417 spin_lock(&inode_lock); 418 hlist_add_head(&inode->i_hash, b); 419 spin_unlock(&inode_lock); 420 } 421 EXPORT_SYMBOL(__insert_inode_hash); ··· 442 void remove_inode_hash(struct inode *inode) 443 { 444 spin_lock(&inode_lock); 445 hlist_del_init(&inode->i_hash); 446 spin_unlock(&inode_lock); 447 } 448 EXPORT_SYMBOL(remove_inode_hash); ··· 501 __inode_sb_list_del(inode); 502 spin_unlock(&inode_lock); 503 504 - wake_up_inode(inode); 505 destroy_inode(inode); 506 } 507 } ··· 526 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 527 if (atomic_read(&inode->i_count)) 528 continue; 529 - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 530 continue; 531 532 inode->i_state |= I_FREEING; 533 534 /* 535 * Move the inode off the IO lists and LRU once I_FREEING is ··· 544 */ 545 list_move(&inode->i_lru, &dispose); 546 list_del_init(&inode->i_wb_list); 547 - if (!(inode->i_state & (I_DIRTY | I_SYNC))) 548 - inodes_stat.nr_unused--; 549 } 550 spin_unlock(&inode_lock); 551 ··· 576 577 spin_lock(&inode_lock); 578 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 579 - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 580 continue; 581 if (inode->i_state & I_DIRTY && !kill_dirty) { 582 busy = 1; 583 continue; 584 } 585 if (atomic_read(&inode->i_count)) { 586 busy = 1; 587 continue; 588 } 589 590 inode->i_state |= I_FREEING; 591 592 /* 593 * Move the inode off the IO lists and LRU once I_FREEING is ··· 603 */ 604 list_move(&inode->i_lru, &dispose); 605 list_del_init(&inode->i_wb_list); 606 - if (!(inode->i_state & (I_DIRTY | I_SYNC))) 607 - inodes_stat.nr_unused--; 608 } 609 spin_unlock(&inode_lock); 610 ··· 660 * Referenced or dirty inodes are still in use. Give them 661 * another pass through the LRU as we canot reclaim them now. 
662 */ 663 if (atomic_read(&inode->i_count) || 664 (inode->i_state & ~I_REFERENCED)) { 665 list_del_init(&inode->i_lru); 666 inodes_stat.nr_unused--; 667 continue; ··· 671 672 /* recently referenced inodes get one more pass */ 673 if (inode->i_state & I_REFERENCED) { 674 - list_move(&inode->i_lru, &inode_lru); 675 inode->i_state &= ~I_REFERENCED; 676 continue; 677 } 678 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 679 __iget(inode); 680 spin_unlock(&inode_lock); 681 if (remove_inode_buffers(inode)) 682 reap += invalidate_mapping_pages(&inode->i_data, ··· 689 if (inode != list_entry(inode_lru.next, 690 struct inode, i_lru)) 691 continue; /* wrong inode or list_empty */ 692 - if (!can_unuse(inode)) 693 continue; 694 } 695 WARN_ON(inode->i_state & I_NEW); 696 inode->i_state |= I_FREEING; 697 698 /* 699 * Move the inode off the IO lists and LRU once I_FREEING is ··· 764 continue; 765 if (!test(inode, data)) 766 continue; 767 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 768 __wait_on_freeing_inode(inode); 769 goto repeat; 770 } 771 __iget(inode); 772 return inode; 773 } 774 return NULL; ··· 792 continue; 793 if (inode->i_sb != sb) 794 continue; 795 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 796 __wait_on_freeing_inode(inode); 797 goto repeat; 798 } 799 __iget(inode); 800 return inode; 801 } 802 return NULL; ··· 863 inode = alloc_inode(sb); 864 if (inode) { 865 spin_lock(&inode_lock); 866 - __inode_sb_list_add(inode); 867 inode->i_state = 0; 868 spin_unlock(&inode_lock); 869 } 870 return inode; 871 } 872 EXPORT_SYMBOL(new_inode); 873 874 void unlock_new_inode(struct inode *inode) 875 { 876 #ifdef CONFIG_DEBUG_LOCK_ALLOC ··· 899 } 900 } 901 #endif 902 - /* 903 - * This is special! We do not need the spinlock when clearing I_NEW, 904 - * because we're guaranteed that nobody else tries to do anything about 905 - * the state of the inode when it is locked, as we just created it (so 906 - * there can be no old holders that haven't tested I_NEW). 907 - * However we must emit the memory barrier so that other CPUs reliably 908 - * see the clearing of I_NEW after the other inode initialisation has 909 - * completed. 910 - */ 911 - smp_mb(); 912 WARN_ON(!(inode->i_state & I_NEW)); 913 inode->i_state &= ~I_NEW; 914 - wake_up_inode(inode); 915 } 916 EXPORT_SYMBOL(unlock_new_inode); 917 ··· 932 if (set(inode, data)) 933 goto set_failed; 934 935 - hlist_add_head(&inode->i_hash, head); 936 - __inode_sb_list_add(inode); 937 inode->i_state = I_NEW; 938 spin_unlock(&inode_lock); 939 940 /* Return the locked inode with I_NEW set, the ··· 981 old = find_inode_fast(sb, head, ino); 982 if (!old) { 983 inode->i_ino = ino; 984 - hlist_add_head(&inode->i_hash, head); 985 - __inode_sb_list_add(inode); 986 inode->i_state = I_NEW; 987 spin_unlock(&inode_lock); 988 989 /* Return the locked inode with I_NEW set, the ··· 1070 struct inode *igrab(struct inode *inode) 1071 { 1072 spin_lock(&inode_lock); 1073 - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1074 __iget(inode); 1075 - else 1076 /* 1077 * Handle the case where s_op->clear_inode is not been 1078 * called yet, and somebody is calling igrab 1079 * while the inode is getting freed. 
1080 */ 1081 inode = NULL; 1082 spin_unlock(&inode_lock); 1083 return inode; 1084 } ··· 1311 ino_t ino = inode->i_ino; 1312 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1313 1314 - inode->i_state |= I_NEW; 1315 while (1) { 1316 struct hlist_node *node; 1317 struct inode *old = NULL; ··· 1320 continue; 1321 if (old->i_sb != sb) 1322 continue; 1323 - if (old->i_state & (I_FREEING|I_WILL_FREE)) 1324 continue; 1325 break; 1326 } 1327 if (likely(!node)) { 1328 hlist_add_head(&inode->i_hash, head); 1329 spin_unlock(&inode_lock); 1330 return 0; 1331 } 1332 __iget(old); 1333 spin_unlock(&inode_lock); 1334 wait_on_inode(old); 1335 if (unlikely(!inode_unhashed(old))) { ··· 1354 struct super_block *sb = inode->i_sb; 1355 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1356 1357 - inode->i_state |= I_NEW; 1358 - 1359 while (1) { 1360 struct hlist_node *node; 1361 struct inode *old = NULL; ··· 1364 continue; 1365 if (!test(old, data)) 1366 continue; 1367 - if (old->i_state & (I_FREEING|I_WILL_FREE)) 1368 continue; 1369 break; 1370 } 1371 if (likely(!node)) { 1372 hlist_add_head(&inode->i_hash, head); 1373 spin_unlock(&inode_lock); 1374 return 0; 1375 } 1376 __iget(old); 1377 spin_unlock(&inode_lock); 1378 wait_on_inode(old); 1379 if (unlikely(!inode_unhashed(old))) { ··· 1426 const struct super_operations *op = inode->i_sb->s_op; 1427 int drop; 1428 1429 if (op && op->drop_inode) 1430 drop = op->drop_inode(inode); 1431 else ··· 1440 if (!(inode->i_state & (I_DIRTY|I_SYNC))) { 1441 inode_lru_list_add(inode); 1442 } 1443 spin_unlock(&inode_lock); 1444 return; 1445 } 1446 - WARN_ON(inode->i_state & I_NEW); 1447 inode->i_state |= I_WILL_FREE; 1448 spin_unlock(&inode_lock); 1449 write_inode_now(inode, 1); 1450 spin_lock(&inode_lock); 1451 WARN_ON(inode->i_state & I_NEW); 1452 inode->i_state &= ~I_WILL_FREE; 1453 __remove_inode_hash(inode); 1454 } 1455 1456 - WARN_ON(inode->i_state & I_NEW); 1457 inode->i_state |= I_FREEING; 1458 1459 /* 1460 * Move the inode off the IO lists and LRU once I_FREEING is ··· 1469 spin_unlock(&inode_lock); 1470 evict(inode); 1471 remove_inode_hash(inode); 1472 - wake_up_inode(inode); 1473 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1474 destroy_inode(inode); 1475 } 1476 ··· 1669 * to recheck inode state. 1670 * 1671 * It doesn't matter if I_NEW is not set initially, a call to 1672 - * wake_up_inode() after removing from the hash list will DTRT. 1673 - * 1674 - * This is called with inode_lock held. 1675 */ 1676 static void __wait_on_freeing_inode(struct inode *inode) 1677 { ··· 1678 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1679 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1680 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1681 spin_unlock(&inode_lock); 1682 schedule(); 1683 finish_wait(wq, &wait.wait);
··· 28 #include <linux/cred.h> 29 30 /* 31 + * inode locking rules. 32 + * 33 + * inode->i_lock protects: 34 + * inode->i_state, inode->i_hash, __iget() 35 + * 36 + * Lock ordering: 37 + * inode_lock 38 + * inode->i_lock 39 + */ 40 + 41 + /* 42 * This is needed for the following functions: 43 * - inode_has_buffers 44 * - invalidate_bdev ··· 136 return proc_dointvec(table, write, buffer, lenp, ppos); 137 } 138 #endif 139 140 /** 141 * inode_init_always - perform inode structure intialisation ··· 336 } 337 338 /* 339 + * inode->i_lock must be held 340 */ 341 void __iget(struct inode *inode) 342 { ··· 413 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 414 415 spin_lock(&inode_lock); 416 + spin_lock(&inode->i_lock); 417 hlist_add_head(&inode->i_hash, b); 418 + spin_unlock(&inode->i_lock); 419 spin_unlock(&inode_lock); 420 } 421 EXPORT_SYMBOL(__insert_inode_hash); ··· 438 void remove_inode_hash(struct inode *inode) 439 { 440 spin_lock(&inode_lock); 441 + spin_lock(&inode->i_lock); 442 hlist_del_init(&inode->i_hash); 443 + spin_unlock(&inode->i_lock); 444 spin_unlock(&inode_lock); 445 } 446 EXPORT_SYMBOL(remove_inode_hash); ··· 495 __inode_sb_list_del(inode); 496 spin_unlock(&inode_lock); 497 498 + spin_lock(&inode->i_lock); 499 + wake_up_bit(&inode->i_state, __I_NEW); 500 + spin_unlock(&inode->i_lock); 501 destroy_inode(inode); 502 } 503 } ··· 518 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 519 if (atomic_read(&inode->i_count)) 520 continue; 521 + 522 + spin_lock(&inode->i_lock); 523 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 524 + spin_unlock(&inode->i_lock); 525 continue; 526 + } 527 528 inode->i_state |= I_FREEING; 529 + if (!(inode->i_state & (I_DIRTY | I_SYNC))) 530 + inodes_stat.nr_unused--; 531 + spin_unlock(&inode->i_lock); 532 533 /* 534 * Move the inode off the IO lists and LRU once I_FREEING is ··· 529 */ 530 list_move(&inode->i_lru, &dispose); 531 list_del_init(&inode->i_wb_list); 532 } 533 spin_unlock(&inode_lock); 534 ··· 563 564 spin_lock(&inode_lock); 565 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 566 + spin_lock(&inode->i_lock); 567 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 + spin_unlock(&inode->i_lock); 569 continue; 570 + } 571 if (inode->i_state & I_DIRTY && !kill_dirty) { 572 + spin_unlock(&inode->i_lock); 573 busy = 1; 574 continue; 575 } 576 if (atomic_read(&inode->i_count)) { 577 + spin_unlock(&inode->i_lock); 578 busy = 1; 579 continue; 580 } 581 582 inode->i_state |= I_FREEING; 583 + if (!(inode->i_state & (I_DIRTY | I_SYNC))) 584 + inodes_stat.nr_unused--; 585 + spin_unlock(&inode->i_lock); 586 587 /* 588 * Move the inode off the IO lists and LRU once I_FREEING is ··· 582 */ 583 list_move(&inode->i_lru, &dispose); 584 list_del_init(&inode->i_wb_list); 585 } 586 spin_unlock(&inode_lock); 587 ··· 641 * Referenced or dirty inodes are still in use. Give them 642 * another pass through the LRU as we canot reclaim them now. 
643 */ 644 + spin_lock(&inode->i_lock); 645 if (atomic_read(&inode->i_count) || 646 (inode->i_state & ~I_REFERENCED)) { 647 + spin_unlock(&inode->i_lock); 648 list_del_init(&inode->i_lru); 649 inodes_stat.nr_unused--; 650 continue; ··· 650 651 /* recently referenced inodes get one more pass */ 652 if (inode->i_state & I_REFERENCED) { 653 inode->i_state &= ~I_REFERENCED; 654 + spin_unlock(&inode->i_lock); 655 + list_move(&inode->i_lru, &inode_lru); 656 continue; 657 } 658 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 659 __iget(inode); 660 + spin_unlock(&inode->i_lock); 661 spin_unlock(&inode_lock); 662 if (remove_inode_buffers(inode)) 663 reap += invalidate_mapping_pages(&inode->i_data, ··· 666 if (inode != list_entry(inode_lru.next, 667 struct inode, i_lru)) 668 continue; /* wrong inode or list_empty */ 669 + spin_lock(&inode->i_lock); 670 + if (!can_unuse(inode)) { 671 + spin_unlock(&inode->i_lock); 672 continue; 673 + } 674 } 675 WARN_ON(inode->i_state & I_NEW); 676 inode->i_state |= I_FREEING; 677 + spin_unlock(&inode->i_lock); 678 679 /* 680 * Move the inode off the IO lists and LRU once I_FREEING is ··· 737 continue; 738 if (!test(inode, data)) 739 continue; 740 + spin_lock(&inode->i_lock); 741 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 742 __wait_on_freeing_inode(inode); 743 goto repeat; 744 } 745 __iget(inode); 746 + spin_unlock(&inode->i_lock); 747 return inode; 748 } 749 return NULL; ··· 763 continue; 764 if (inode->i_sb != sb) 765 continue; 766 + spin_lock(&inode->i_lock); 767 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 768 __wait_on_freeing_inode(inode); 769 goto repeat; 770 } 771 __iget(inode); 772 + spin_unlock(&inode->i_lock); 773 return inode; 774 } 775 return NULL; ··· 832 inode = alloc_inode(sb); 833 if (inode) { 834 spin_lock(&inode_lock); 835 + spin_lock(&inode->i_lock); 836 inode->i_state = 0; 837 + spin_unlock(&inode->i_lock); 838 + __inode_sb_list_add(inode); 839 spin_unlock(&inode_lock); 840 } 841 return inode; 842 } 843 EXPORT_SYMBOL(new_inode); 844 845 + /** 846 + * unlock_new_inode - clear the I_NEW state and wake up any waiters 847 + * @inode: new inode to unlock 848 + * 849 + * Called when the inode is fully initialised to clear the new state of the 850 + * inode and wake up anyone waiting for the inode to finish initialisation. 
851 + */ 852 void unlock_new_inode(struct inode *inode) 853 { 854 #ifdef CONFIG_DEBUG_LOCK_ALLOC ··· 859 } 860 } 861 #endif 862 + spin_lock(&inode->i_lock); 863 WARN_ON(!(inode->i_state & I_NEW)); 864 inode->i_state &= ~I_NEW; 865 + wake_up_bit(&inode->i_state, __I_NEW); 866 + spin_unlock(&inode->i_lock); 867 } 868 EXPORT_SYMBOL(unlock_new_inode); 869 ··· 900 if (set(inode, data)) 901 goto set_failed; 902 903 + spin_lock(&inode->i_lock); 904 inode->i_state = I_NEW; 905 + hlist_add_head(&inode->i_hash, head); 906 + spin_unlock(&inode->i_lock); 907 + __inode_sb_list_add(inode); 908 spin_unlock(&inode_lock); 909 910 /* Return the locked inode with I_NEW set, the ··· 947 old = find_inode_fast(sb, head, ino); 948 if (!old) { 949 inode->i_ino = ino; 950 + spin_lock(&inode->i_lock); 951 inode->i_state = I_NEW; 952 + hlist_add_head(&inode->i_hash, head); 953 + spin_unlock(&inode->i_lock); 954 + __inode_sb_list_add(inode); 955 spin_unlock(&inode_lock); 956 957 /* Return the locked inode with I_NEW set, the ··· 1034 struct inode *igrab(struct inode *inode) 1035 { 1036 spin_lock(&inode_lock); 1037 + spin_lock(&inode->i_lock); 1038 + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { 1039 __iget(inode); 1040 + spin_unlock(&inode->i_lock); 1041 + } else { 1042 + spin_unlock(&inode->i_lock); 1043 /* 1044 * Handle the case where s_op->clear_inode is not been 1045 * called yet, and somebody is calling igrab 1046 * while the inode is getting freed. 1047 */ 1048 inode = NULL; 1049 + } 1050 spin_unlock(&inode_lock); 1051 return inode; 1052 } ··· 1271 ino_t ino = inode->i_ino; 1272 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1273 1274 while (1) { 1275 struct hlist_node *node; 1276 struct inode *old = NULL; ··· 1281 continue; 1282 if (old->i_sb != sb) 1283 continue; 1284 + spin_lock(&old->i_lock); 1285 + if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1286 + spin_unlock(&old->i_lock); 1287 continue; 1288 + } 1289 break; 1290 } 1291 if (likely(!node)) { 1292 + spin_lock(&inode->i_lock); 1293 + inode->i_state |= I_NEW; 1294 hlist_add_head(&inode->i_hash, head); 1295 + spin_unlock(&inode->i_lock); 1296 spin_unlock(&inode_lock); 1297 return 0; 1298 } 1299 __iget(old); 1300 + spin_unlock(&old->i_lock); 1301 spin_unlock(&inode_lock); 1302 wait_on_inode(old); 1303 if (unlikely(!inode_unhashed(old))) { ··· 1308 struct super_block *sb = inode->i_sb; 1309 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1310 1311 while (1) { 1312 struct hlist_node *node; 1313 struct inode *old = NULL; ··· 1320 continue; 1321 if (!test(old, data)) 1322 continue; 1323 + spin_lock(&old->i_lock); 1324 + if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1325 + spin_unlock(&old->i_lock); 1326 continue; 1327 + } 1328 break; 1329 } 1330 if (likely(!node)) { 1331 + spin_lock(&inode->i_lock); 1332 + inode->i_state |= I_NEW; 1333 hlist_add_head(&inode->i_hash, head); 1334 + spin_unlock(&inode->i_lock); 1335 spin_unlock(&inode_lock); 1336 return 0; 1337 } 1338 __iget(old); 1339 + spin_unlock(&old->i_lock); 1340 spin_unlock(&inode_lock); 1341 wait_on_inode(old); 1342 if (unlikely(!inode_unhashed(old))) { ··· 1375 const struct super_operations *op = inode->i_sb->s_op; 1376 int drop; 1377 1378 + spin_lock(&inode->i_lock); 1379 + WARN_ON(inode->i_state & I_NEW); 1380 + 1381 if (op && op->drop_inode) 1382 drop = op->drop_inode(inode); 1383 else ··· 1386 if (!(inode->i_state & (I_DIRTY|I_SYNC))) { 1387 inode_lru_list_add(inode); 1388 } 1389 + spin_unlock(&inode->i_lock); 1390 spin_unlock(&inode_lock); 1391 return; 1392 } 1393 
inode->i_state |= I_WILL_FREE; 1394 + spin_unlock(&inode->i_lock); 1395 spin_unlock(&inode_lock); 1396 write_inode_now(inode, 1); 1397 spin_lock(&inode_lock); 1398 + spin_lock(&inode->i_lock); 1399 WARN_ON(inode->i_state & I_NEW); 1400 inode->i_state &= ~I_WILL_FREE; 1401 __remove_inode_hash(inode); 1402 } 1403 1404 inode->i_state |= I_FREEING; 1405 + spin_unlock(&inode->i_lock); 1406 1407 /* 1408 * Move the inode off the IO lists and LRU once I_FREEING is ··· 1413 spin_unlock(&inode_lock); 1414 evict(inode); 1415 remove_inode_hash(inode); 1416 + spin_lock(&inode->i_lock); 1417 + wake_up_bit(&inode->i_state, __I_NEW); 1418 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1419 + spin_unlock(&inode->i_lock); 1420 destroy_inode(inode); 1421 } 1422 ··· 1611 * to recheck inode state. 1612 * 1613 * It doesn't matter if I_NEW is not set initially, a call to 1614 + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 1615 + * will DTRT. 1616 */ 1617 static void __wait_on_freeing_inode(struct inode *inode) 1618 { ··· 1621 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1622 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1623 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1624 + spin_unlock(&inode->i_lock); 1625 spin_unlock(&inode_lock); 1626 schedule(); 1627 finish_wait(wq, &wait.wait);
+15 -6
fs/notify/inode_mark.c
··· 254 * I_WILL_FREE, or I_NEW which is fine because by that point 255 * the inode cannot have any associated watches. 256 */ 257 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 258 continue; 259 260 /* 261 * If i_count is zero, the inode cannot have any watches and ··· 266 * evict all inodes with zero i_count from icache which is 267 * unnecessarily violent and may in fact be illegal to do. 268 */ 269 - if (!atomic_read(&inode->i_count)) 270 continue; 271 272 need_iput_tmp = need_iput; 273 need_iput = NULL; ··· 279 __iget(inode); 280 else 281 need_iput_tmp = NULL; 282 283 /* In case the dropping of a reference would nuke next_i. */ 284 if ((&next_i->i_sb_list != list) && 285 - atomic_read(&next_i->i_count) && 286 - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 - __iget(next_i); 288 - need_iput = next_i; 289 } 290 291 /*
··· 254 * I_WILL_FREE, or I_NEW which is fine because by that point 255 * the inode cannot have any associated watches. 256 */ 257 + spin_lock(&inode->i_lock); 258 + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { 259 + spin_unlock(&inode->i_lock); 260 continue; 261 + } 262 263 /* 264 * If i_count is zero, the inode cannot have any watches and ··· 263 * evict all inodes with zero i_count from icache which is 264 * unnecessarily violent and may in fact be illegal to do. 265 */ 266 + if (!atomic_read(&inode->i_count)) { 267 + spin_unlock(&inode->i_lock); 268 continue; 269 + } 270 271 need_iput_tmp = need_iput; 272 need_iput = NULL; ··· 274 __iget(inode); 275 else 276 need_iput_tmp = NULL; 277 + spin_unlock(&inode->i_lock); 278 279 /* In case the dropping of a reference would nuke next_i. */ 280 if ((&next_i->i_sb_list != list) && 281 + atomic_read(&next_i->i_count)) { 282 + spin_lock(&next_i->i_lock); 283 + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 284 + __iget(next_i); 285 + need_iput = next_i; 286 + } 287 + spin_unlock(&next_i->i_lock); 288 } 289 290 /*
+7 -6
fs/quota/dquot.c
··· 902 903 spin_lock(&inode_lock); 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 905 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 906 continue; 907 #ifdef CONFIG_QUOTA_DEBUG 908 if (unlikely(inode_get_rsv_space(inode) > 0)) 909 reserved = 1; 910 #endif 911 - if (!atomic_read(&inode->i_writecount)) 912 - continue; 913 - if (!dqinit_needed(inode, type)) 914 - continue; 915 - 916 __iget(inode); 917 spin_unlock(&inode_lock); 918 919 iput(old_inode);
··· 902 903 spin_lock(&inode_lock); 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 905 + spin_lock(&inode->i_lock); 906 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 907 + !atomic_read(&inode->i_writecount) || 908 + !dqinit_needed(inode, type)) { 909 + spin_unlock(&inode->i_lock); 910 continue; 911 + } 912 #ifdef CONFIG_QUOTA_DEBUG 913 if (unlikely(inode_get_rsv_space(inode) > 0)) 914 reserved = 1; 915 #endif 916 __iget(inode); 917 + spin_unlock(&inode->i_lock); 918 spin_unlock(&inode_lock); 919 920 iput(old_inode);
+1 -1
include/linux/fs.h
··· 1647 }; 1648 1649 /* 1650 - * Inode state bits. Protected by inode_lock. 1651 * 1652 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1653 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
··· 1647 }; 1648 1649 /* 1650 + * Inode state bits. Protected by inode->i_lock 1651 * 1652 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1653 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
+1 -1
include/linux/quotaops.h
··· 277 /* 278 * Mark inode fully dirty. Since we are allocating blocks, inode 279 * would become fully dirty soon anyway and it reportedly 280 - * reduces inode_lock contention. 281 */ 282 mark_inode_dirty(inode); 283 }
··· 277 /* 278 * Mark inode fully dirty. Since we are allocating blocks, inode 279 * would become fully dirty soon anyway and it reportedly 280 + * reduces lock contention. 281 */ 282 mark_inode_dirty(inode); 283 }
+2
mm/filemap.c
··· 99 * ->private_lock (page_remove_rmap->set_page_dirty) 100 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->inode_lock (page_remove_rmap->set_page_dirty) 102 * ->inode_lock (zap_pte_range->set_page_dirty) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 104 * 105 * (code doesn't rely on that order, so you could switch it around)
··· 99 * ->private_lock (page_remove_rmap->set_page_dirty) 100 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->inode_lock (page_remove_rmap->set_page_dirty) 102 + * ->inode->i_lock (page_remove_rmap->set_page_dirty) 103 * ->inode_lock (zap_pte_range->set_page_dirty) 104 + * ->inode->i_lock (zap_pte_range->set_page_dirty) 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 106 * 107 * (code doesn't rely on that order, so you could switch it around)
+1
mm/rmap.c
··· 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 * mapping->private_lock (in __set_page_dirty_buffers) 34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * in arch-dependent flush_dcache_mmap_lock,
··· 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 * mapping->private_lock (in __set_page_dirty_buffers) 34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 35 + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 36 * sb_lock (within inode_lock in fs/fs-writeback.c) 37 * mapping->tree_lock (widely used, in set_page_dirty, 38 * in arch-dependent flush_dcache_mmap_lock,