fs: move i_wb_list out from under inode_lock

Protect the inode writeback list with a new global lock
inode_wb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces inode_lock for the list: inodes on the
list can be validity checked while holding inode->i_lock, and hence
inode_lock is no longer needed to protect the list.
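
As a quick illustration (not part of the patch itself), here is a minimal
sketch of the nesting the new lock establishes, modelled on the
bdev_inode_switch_bdi() hunk below; the helper example_move_to_dirty() and
its arguments are hypothetical:

    /*
     * Illustrative sketch only: manipulating inode->i_wb_list now takes
     * inode_wb_list_lock, with inode->i_lock nested inside it to check the
     * inode state.  inode_lock is not needed for the list itself.
     */
    #include <linux/fs.h>
    #include <linux/writeback.h>
    #include <linux/backing-dev.h>

    static void example_move_to_dirty(struct inode *inode,
                                      struct bdi_writeback *wb)
    {
            spin_lock(&inode_wb_list_lock);     /* protects i_wb_list and b_dirty */
            spin_lock(&inode->i_lock);          /* nests inside inode_wb_list_lock */
            if (inode->i_state & I_DIRTY)
                    list_move(&inode->i_wb_list, &wb->b_dirty);
            spin_unlock(&inode->i_lock);
            spin_unlock(&inode_wb_list_lock);
    }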

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by Dave Chinner and committed by Al Viro (a66979ab, 55fa6091)

8 files changed, +70 -48
+2 -2
fs/block_dev.c
···
 static void bdev_inode_switch_bdi(struct inode *inode,
                        struct backing_dev_info *dst)
 {
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         spin_lock(&inode->i_lock);
         inode->i_data.backing_dev_info = dst;
         if (inode->i_state & I_DIRTY)
                 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
         spin_unlock(&inode->i_lock);
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
+44 -32
fs/fs-writeback.c
···
 }
 
 /*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+        spin_lock(&inode_wb_list_lock);
+        list_del_init(&inode->i_wb_list);
+        spin_unlock(&inode_wb_list_lock);
+}
+
+
+/*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
  *
···
 {
         struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+        assert_spin_locked(&inode_wb_list_lock);
         if (!list_empty(&wb->b_dirty)) {
                 struct inode *tail;
 
···
 {
         struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+        assert_spin_locked(&inode_wb_list_lock);
         list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
 {
         /*
-         * Prevent speculative execution through spin_unlock(&inode_lock);
+         * Prevent speculative execution through
+         * spin_unlock(&inode_wb_list_lock);
          */
+
         smp_mb();
         wake_up_bit(&inode->i_state, __I_SYNC);
 }
···
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+        assert_spin_locked(&inode_wb_list_lock);
         list_splice_init(&wb->b_more_io, &wb->b_io);
         move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
···
         wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
         while (inode->i_state & I_SYNC) {
                 spin_unlock(&inode->i_lock);
-                spin_unlock(&inode_lock);
+                spin_unlock(&inode_wb_list_lock);
                 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-                spin_lock(&inode_lock);
+                spin_lock(&inode_wb_list_lock);
                 spin_lock(&inode->i_lock);
         }
 }
 
 /*
- * Write out an inode's dirty pages. Called under inode_lock. Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either
+ * the caller has an active reference on the inode or the inode has I_WILL_FREE
+ * set.
  *
  * If `wait' is set, wait on the writeout.
  *
  * The whole writeout design is quite complex and fragile. We want to avoid
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
- *
- * Called under inode_lock.
  */
 static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
···
         inode->i_state |= I_SYNC;
         inode->i_state &= ~I_DIRTY_PAGES;
         spin_unlock(&inode->i_lock);
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
 
         ret = do_writepages(mapping, wbc);
 
···
          * due to delalloc, clear dirty metadata flags right before
          * write_inode()
          */
-        spin_lock(&inode_lock);
         spin_lock(&inode->i_lock);
         dirty = inode->i_state & I_DIRTY;
         inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
         spin_unlock(&inode->i_lock);
-        spin_unlock(&inode_lock);
         /* Don't write the inode if only I_DIRTY_PAGES was set */
         if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                 int err = write_inode(inode, wbc);
···
                         ret = err;
         }
 
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         spin_lock(&inode->i_lock);
         inode->i_state &= ~I_SYNC;
         if (!(inode->i_state & I_FREEING)) {
···
                          */
                         redirty_tail(inode);
                 }
-                spin_unlock(&inode_lock);
+                spin_unlock(&inode_wb_list_lock);
                 iput(inode);
                 cond_resched();
-                spin_lock(&inode_lock);
+                spin_lock(&inode_wb_list_lock);
                 if (wbc->nr_to_write <= 0) {
                         wbc->more_io = 1;
                         return 1;
···
 
         if (!wbc->wb_start)
                 wbc->wb_start = jiffies; /* livelock avoidance */
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         if (!wbc->for_kupdate || list_empty(&wb->b_io))
                 queue_io(wb, wbc->older_than_this);
 
···
                 if (ret)
                         break;
         }
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
         /* Leave any unwritten inodes on b_io */
 }
 
···
 {
         WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         if (!wbc->for_kupdate || list_empty(&wb->b_io))
                 queue_io(wb, wbc->older_than_this);
         writeback_sb_inodes(sb, wb, wbc, true);
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
 }
 
 /*
···
                  * become available for writeback. Otherwise
                  * we'll just busyloop.
                  */
-                spin_lock(&inode_lock);
+                spin_lock(&inode_wb_list_lock);
                 if (!list_empty(&wb->b_more_io)) {
                         inode = wb_inode(wb->b_more_io.prev);
                         trace_wbc_writeback_wait(&wbc, wb->bdi);
···
                         inode_wait_for_writeback(inode);
                         spin_unlock(&inode->i_lock);
                 }
-                spin_unlock(&inode_lock);
+                spin_unlock(&inode_wb_list_lock);
         }
 
         return wrote;
···
 {
         struct super_block *sb = inode->i_sb;
         struct backing_dev_info *bdi = NULL;
-        bool wakeup_bdi = false;
 
         /*
          * Don't do this for I_DIRTY_PAGES - that doesn't actually
···
         if (unlikely(block_dump))
                 block_dump___mark_inode_dirty(inode);
 
-        spin_lock(&inode_lock);
         spin_lock(&inode->i_lock);
         if ((inode->i_state & flags) != flags) {
                 const int was_dirty = inode->i_state & I_DIRTY;
···
                 if (inode->i_state & I_FREEING)
                         goto out_unlock_inode;
 
-                spin_unlock(&inode->i_lock);
                 /*
                  * If the inode was already on b_dirty/b_io/b_more_io, don't
                  * reposition it (that would break b_dirty time-ordering).
                  */
                 if (!was_dirty) {
+                        bool wakeup_bdi = false;
                         bdi = inode_to_bdi(inode);
 
                         if (bdi_cap_writeback_dirty(bdi)) {
···
                                         wakeup_bdi = true;
                         }
 
+                        spin_unlock(&inode->i_lock);
+                        spin_lock(&inode_wb_list_lock);
                         inode->dirtied_when = jiffies;
                         list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+                        spin_unlock(&inode_wb_list_lock);
+
+                        if (wakeup_bdi)
+                                bdi_wakeup_thread_delayed(bdi);
+                        return;
                 }
-                goto out;
         }
 out_unlock_inode:
         spin_unlock(&inode->i_lock);
-out:
-        spin_unlock(&inode_lock);
 
-        if (wakeup_bdi)
-                bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
···
                 wbc.nr_to_write = 0;
 
         might_sleep();
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         ret = writeback_single_inode(inode, &wbc);
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
         if (sync)
                 inode_sync_wait(inode);
         return ret;
···
 {
         int ret;
 
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         ret = writeback_single_inode(inode, wbc);
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
         return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+8 -4
fs/inode.c
···
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include "internal.h"
 
 /*
  * inode locking rules.
···
  *   inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
  *
  * Lock ordering:
  * inode_lock
···
  * inode_sb_list_lock
  *   inode->i_lock
  *     inode_lru_lock
+ *
+ * inode_wb_list_lock
+ *   inode->i_lock
  */
 
 /*
···
 DEFINE_SPINLOCK(inode_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
···
         BUG_ON(!(inode->i_state & I_FREEING));
         BUG_ON(!list_empty(&inode->i_lru));
 
-        spin_lock(&inode_lock);
-        list_del_init(&inode->i_wb_list);
-        spin_unlock(&inode_lock);
-
+        inode_wb_list_del(inode);
         inode_sb_list_del(inode);
 
         if (op->evict_inode) {
+5
fs/internal.h
···
  */
 extern spinlock_t inode_sb_list_lock;
 
+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
+1
include/linux/writeback.h
···
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
+extern spinlock_t inode_wb_list_lock;
 
 /*
  * fs/fs-writeback.c
+4 -4
mm/backing-dev.c
···
         struct inode *inode;
 
         nr_wb = nr_dirty = nr_io = nr_more_io = 0;
-        spin_lock(&inode_lock);
+        spin_lock(&inode_wb_list_lock);
         list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                 nr_dirty++;
         list_for_each_entry(inode, &wb->b_io, i_wb_list)
                 nr_io++;
         list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                 nr_more_io++;
-        spin_unlock(&inode_lock);
+        spin_unlock(&inode_wb_list_lock);
 
         global_dirty_limits(&background_thresh, &dirty_thresh);
         bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
···
         if (bdi_has_dirty_io(bdi)) {
                 struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-                spin_lock(&inode_lock);
+                spin_lock(&inode_wb_list_lock);
                 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                 list_splice(&bdi->wb.b_io, &dst->b_io);
                 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-                spin_unlock(&inode_lock);
+                spin_unlock(&inode_wb_list_lock);
         }
 
         bdi_unregister(bdi);
+4 -4
mm/filemap.c
···
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
  *
- *  ->inode_lock
- *    ->sb_lock                 (fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock                   (fs/fs-writeback.c)
  *    ->mapping->tree_lock      (__sync_single_inode)
  *
  *  ->i_mmap_lock
···
  *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
  *    ->private_lock            (page_remove_rmap->set_page_dirty)
  *    ->tree_lock               (page_remove_rmap->set_page_dirty)
- *    ->inode_lock              (page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock        (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock           (page_remove_rmap->set_page_dirty)
- *    ->inode_lock              (zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock        (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock           (zap_pte_range->set_page_dirty)
  *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
  *
+2 -2
mm/rmap.c
···
  *        swap_lock             (in swap_duplicate, swap_info_get)
  *          mmlist_lock         (in mmput, drain_mmlist and others)
  *          mapping->private_lock (in __set_page_dirty_buffers)
- *            inode_lock        (in set_page_dirty's __mark_inode_dirty)
  *            inode->i_lock     (in set_page_dirty's __mark_inode_dirty)
+ *            inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
  *              sb_lock         (within inode_lock in fs/fs-writeback.c)
  *              mapping->tree_lock (widely used, in set_page_dirty,
  *                                  in arch-dependent flush_dcache_mmap_lock,
- *                                  within inode_lock in __sync_single_inode)
+ *                                  within inode_wb_list_lock in __sync_single_inode)
  *
  * (code doesn't rely on that order so it could be switched around)
  * ->tasklist_lock
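
For completeness, a similarly hedged sketch of a read-only traversal under
the new lock, mirroring the bdi_debug_stats_show() hunk in mm/backing-dev.c
above; example_count_dirty() is a hypothetical helper, not something this
patch adds:

    /*
     * Illustrative sketch only: walking a bdi's dirty list now needs just
     * inode_wb_list_lock; inode_lock is no longer taken for the traversal.
     */
    #include <linux/fs.h>
    #include <linux/writeback.h>
    #include <linux/backing-dev.h>

    static unsigned long example_count_dirty(struct bdi_writeback *wb)
    {
            struct inode *inode;
            unsigned long nr_dirty = 0;

            spin_lock(&inode_wb_list_lock);
            list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                    nr_dirty++;
            spin_unlock(&inode_wb_list_lock);

            return nr_dirty;
    }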