Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  fs: simplify iget & friends
  fs: pull inode->i_lock up out of writeback_single_inode
  fs: rename inode_lock to inode_hash_lock
  fs: move i_wb_list out from under inode_lock
  fs: move i_sb_list out from under inode_lock
  fs: remove inode_lock from iput_final and prune_icache
  fs: Lock the inode LRU list separately
  fs: factor inode disposal
  fs: protect inode->i_state with inode->i_lock
  autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
  autofs4 - remove autofs4_lock
  autofs4 - fix d_manage() return on rcu-walk
  autofs4 - fix autofs4_expire_indirect() traversal
  autofs4 - fix dentry leak in autofs4_expire_direct()
  autofs4 - reinstate last used update on access
  vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
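
Taken together, the "fs:" patches above retire the single global inode_lock in
favour of finer-grained locking: inode->i_lock protects per-inode state, with
separate inode_hash_lock, inode_sb_list_lock, inode_wb_list_lock and
inode_lru_lock covering the hash, per-sb, writeback and LRU lists (see the
ordering comment added in fs/inode.c below). As a hedged illustration of the
new rule only, not code taken from the series, an igrab()-style check of
i_state changes roughly like this:

	/* before: i_state checks serialised on the global inode_lock */
	spin_lock(&inode_lock);
	if (!(inode->i_state & (I_FREEING | I_WILL_FREE)))
		__iget(inode);
	spin_unlock(&inode_lock);

	/* after: i_state and __iget() are protected by the per-inode lock */
	spin_lock(&inode->i_lock);
	if (!(inode->i_state & (I_FREEING | I_WILL_FREE)))
		__iget(inode);
	spin_unlock(&inode->i_lock);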

+622 -543
+1 -1
Documentation/filesystems/Locking
···
128 128 destroy_inode:
129 129 dirty_inode: (must not sleep)
130 130 write_inode:
131 - drop_inode: !!!inode_lock!!!
131 + drop_inode: !!!inode->i_lock!!!
132 132 evict_inode:
133 133 put_super: write
134 134 write_super: read
+11 -5
Documentation/filesystems/porting
···
298 298 remaining links or not. Caller does *not* evict the pagecache or inode-associated
299 299 metadata buffers; getting rid of those is responsibility of method, as it had
300 300 been for ->delete_inode().
301 - ->drop_inode() returns int now; it's called on final iput() with inode_lock
302 - held and it returns true if filesystems wants the inode to be dropped. As before,
303 - generic_drop_inode() is still the default and it's been updated appropriately.
304 - generic_delete_inode() is also alive and it consists simply of return 1. Note that
305 - all actual eviction work is done by caller after ->drop_inode() returns.
301 +
302 + ->drop_inode() returns int now; it's called on final iput() with
303 + inode->i_lock held and it returns true if filesystems wants the inode to be
304 + dropped. As before, generic_drop_inode() is still the default and it's been
305 + updated appropriately. generic_delete_inode() is also alive and it consists
306 + simply of return 1. Note that all actual eviction work is done by caller after
307 + ->drop_inode() returns.
308 +
306 309 clear_inode() is gone; use end_writeback() instead. As before, it must
307 310 be called exactly once on each call of ->evict_inode() (as it used to be for
308 311 each call of ->delete_inode()). Unlike before, if you are using inode-associated
···
397 394 Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set,
398 395 so the i_size should not change when hole punching, even when puching the end of
399 396 a file off.
397 +
398 + --
399 + [mandatory]
400 400
401 401 --
402 402 [mandatory]
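
As the updated porting notes say, ->drop_inode() is now called on final iput()
with inode->i_lock held, so it must not sleep, and it returns non-zero when the
inode should be evicted rather than kept cached. A sketch of a method following
those rules; example_keep_cached() is a hypothetical predicate used only for
illustration:

	static int example_drop_inode(struct inode *inode)
	{
		/* runs under inode->i_lock on final iput(): no sleeping here */
		if (example_keep_cached(inode))
			return generic_drop_inode(inode);	/* default policy */
		return 1;	/* like generic_delete_inode(): evict immediately */
	}
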
+1 -1
Documentation/filesystems/vfs.txt
···
254 254 should be synchronous or not, not all filesystems check this flag.
255 255
256 256 drop_inode: called when the last access to the inode is dropped,
257 - with the inode_lock spinlock held.
257 + with the inode->i_lock spinlock held.
258 258
259 259 This method should be either NULL (normal UNIX filesystem
260 260 semantics) or "generic_delete_inode" (for filesystems that do not
-2
fs/autofs4/autofs_i.h
···
61 61 current->pid, __func__, ##args); \
62 62 } while (0)
63 63
64 - extern spinlock_t autofs4_lock;
65 -
66 64 /* Unified info structure. This is pointed to by both the dentry and
67 65 inode structures. Each file in the filesystem has an instance of this
68 66 structure. It holds a reference to the dentry, so dentries are never
+4
fs/autofs4/dev-ioctl.c
···
372 372 return -EBUSY;
373 373 } else {
374 374 struct file *pipe = fget(pipefd);
375 + if (!pipe) {
376 + err = -EBADF;
377 + goto out;
378 + }
375 379 if (!pipe->f_op || !pipe->f_op->write) {
376 380 err = -EPIPE;
377 381 fput(pipe);
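
The fix above is the standard fget() pattern: fget() returns NULL for an
invalid descriptor, so the result must be checked before it is dereferenced,
and a successful fget() must be balanced by fput(). A minimal sketch of that
pattern outside the autofs4 context (the error handling shown is illustrative
only):

	struct file *pipe = fget(pipefd);

	if (!pipe)
		return -EBADF;		/* bad fd: no reference was taken */
	if (!pipe->f_op || !pipe->f_op->write) {
		fput(pipe);		/* drop the reference fget() took */
		return -EPIPE;
	}
	/* ... use pipe, then fput(pipe) when the last user is done ... */
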
+63 -21
fs/autofs4/expire.c
··· 87 87 } 88 88 89 89 /* 90 + * Calculate and dget next entry in the subdirs list under root. 91 + */ 92 + static struct dentry *get_next_positive_subdir(struct dentry *prev, 93 + struct dentry *root) 94 + { 95 + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); 96 + struct list_head *next; 97 + struct dentry *p, *q; 98 + 99 + spin_lock(&sbi->lookup_lock); 100 + 101 + if (prev == NULL) { 102 + spin_lock(&root->d_lock); 103 + prev = dget_dlock(root); 104 + next = prev->d_subdirs.next; 105 + p = prev; 106 + goto start; 107 + } 108 + 109 + p = prev; 110 + spin_lock(&p->d_lock); 111 + again: 112 + next = p->d_u.d_child.next; 113 + start: 114 + if (next == &root->d_subdirs) { 115 + spin_unlock(&p->d_lock); 116 + spin_unlock(&sbi->lookup_lock); 117 + dput(prev); 118 + return NULL; 119 + } 120 + 121 + q = list_entry(next, struct dentry, d_u.d_child); 122 + 123 + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); 124 + /* Negative dentry - try next */ 125 + if (!simple_positive(q)) { 126 + spin_unlock(&p->d_lock); 127 + p = q; 128 + goto again; 129 + } 130 + dget_dlock(q); 131 + spin_unlock(&q->d_lock); 132 + spin_unlock(&p->d_lock); 133 + spin_unlock(&sbi->lookup_lock); 134 + 135 + dput(prev); 136 + 137 + return q; 138 + } 139 + 140 + /* 90 141 * Calculate and dget next entry in top down tree traversal. 91 142 */ 92 143 static struct dentry *get_next_positive_dentry(struct dentry *prev, 93 144 struct dentry *root) 94 145 { 146 + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); 95 147 struct list_head *next; 96 148 struct dentry *p, *ret; 97 149 98 150 if (prev == NULL) 99 151 return dget(root); 100 152 101 - spin_lock(&autofs4_lock); 153 + spin_lock(&sbi->lookup_lock); 102 154 relock: 103 155 p = prev; 104 156 spin_lock(&p->d_lock); ··· 162 110 163 111 if (p == root) { 164 112 spin_unlock(&p->d_lock); 165 - spin_unlock(&autofs4_lock); 113 + spin_unlock(&sbi->lookup_lock); 166 114 dput(prev); 167 115 return NULL; 168 116 } ··· 192 140 dget_dlock(ret); 193 141 spin_unlock(&ret->d_lock); 194 142 spin_unlock(&p->d_lock); 195 - spin_unlock(&autofs4_lock); 143 + spin_unlock(&sbi->lookup_lock); 196 144 197 145 dput(prev); 198 146 ··· 342 290 spin_lock(&sbi->fs_lock); 343 291 ino = autofs4_dentry_ino(root); 344 292 /* No point expiring a pending mount */ 345 - if (ino->flags & AUTOFS_INF_PENDING) { 346 - spin_unlock(&sbi->fs_lock); 347 - return NULL; 348 - } 349 - managed_dentry_set_transit(root); 293 + if (ino->flags & AUTOFS_INF_PENDING) 294 + goto out; 350 295 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 351 296 struct autofs_info *ino = autofs4_dentry_ino(root); 352 297 ino->flags |= AUTOFS_INF_EXPIRING; ··· 351 302 spin_unlock(&sbi->fs_lock); 352 303 return root; 353 304 } 354 - managed_dentry_clear_transit(root); 305 + out: 355 306 spin_unlock(&sbi->fs_lock); 356 307 dput(root); 357 308 ··· 385 336 timeout = sbi->exp_timeout; 386 337 387 338 dentry = NULL; 388 - while ((dentry = get_next_positive_dentry(dentry, root))) { 339 + while ((dentry = get_next_positive_subdir(dentry, root))) { 389 340 spin_lock(&sbi->fs_lock); 390 341 ino = autofs4_dentry_ino(dentry); 391 342 /* No point expiring a pending mount */ 392 343 if (ino->flags & AUTOFS_INF_PENDING) 393 - goto cont; 394 - managed_dentry_set_transit(dentry); 344 + goto next; 395 345 396 346 /* 397 347 * Case 1: (i) indirect mount or top level pseudo direct mount ··· 450 402 } 451 403 } 452 404 next: 453 - managed_dentry_clear_transit(dentry); 454 - cont: 455 405 spin_unlock(&sbi->fs_lock); 456 406 } 457 407 return NULL; ··· 
461 415 ino->flags |= AUTOFS_INF_EXPIRING; 462 416 init_completion(&ino->expire_complete); 463 417 spin_unlock(&sbi->fs_lock); 464 - spin_lock(&autofs4_lock); 418 + spin_lock(&sbi->lookup_lock); 465 419 spin_lock(&expired->d_parent->d_lock); 466 420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); 467 421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 468 422 spin_unlock(&expired->d_lock); 469 423 spin_unlock(&expired->d_parent->d_lock); 470 - spin_unlock(&autofs4_lock); 424 + spin_unlock(&sbi->lookup_lock); 471 425 return expired; 472 426 } 473 427 ··· 530 484 spin_lock(&sbi->fs_lock); 531 485 ino = autofs4_dentry_ino(dentry); 532 486 ino->flags &= ~AUTOFS_INF_EXPIRING; 533 - if (!d_unhashed(dentry)) 534 - managed_dentry_clear_transit(dentry); 535 487 complete_all(&ino->expire_complete); 536 488 spin_unlock(&sbi->fs_lock); 537 489 ··· 557 513 spin_lock(&sbi->fs_lock); 558 514 ino->flags &= ~AUTOFS_INF_EXPIRING; 559 515 spin_lock(&dentry->d_lock); 560 - if (ret) 561 - __managed_dentry_clear_transit(dentry); 562 - else { 516 + if (!ret) { 563 517 if ((IS_ROOT(dentry) || 564 518 (autofs_type_indirect(sbi->type) && 565 519 IS_ROOT(dentry->d_parent))) &&
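
Where the expire code above must hold two dentry locks of the same lock class
at once (a dentry and its parent when requeueing an expired entry, or adjacent
entries during the subdir walk), the second d_lock is taken with
spin_lock_nested() and the DENTRY_D_LOCK_NESTED subclass so lockdep does not
report it as recursive locking. A reduced sketch of that idiom, not additional
code from the series:

	spin_lock(&parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	if (simple_positive(dentry))
		dget_dlock(dentry);	/* take a reference while d_lock is held */
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
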
+21 -41
fs/autofs4/root.c
··· 23 23 24 24 #include "autofs_i.h" 25 25 26 - DEFINE_SPINLOCK(autofs4_lock); 27 - 28 26 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 29 27 static int autofs4_dir_unlink(struct inode *,struct dentry *); 30 28 static int autofs4_dir_rmdir(struct inode *,struct dentry *); ··· 123 125 * autofs file system so just let the libfs routines handle 124 126 * it. 125 127 */ 126 - spin_lock(&autofs4_lock); 128 + spin_lock(&sbi->lookup_lock); 127 129 spin_lock(&dentry->d_lock); 128 130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 129 131 spin_unlock(&dentry->d_lock); 130 - spin_unlock(&autofs4_lock); 132 + spin_unlock(&sbi->lookup_lock); 131 133 return -ENOENT; 132 134 } 133 135 spin_unlock(&dentry->d_lock); 134 - spin_unlock(&autofs4_lock); 136 + spin_unlock(&sbi->lookup_lock); 135 137 136 138 out: 137 139 return dcache_dir_open(inode, file); ··· 169 171 const unsigned char *str = name->name; 170 172 struct list_head *p, *head; 171 173 172 - spin_lock(&autofs4_lock); 173 174 spin_lock(&sbi->lookup_lock); 174 175 head = &sbi->active_list; 175 176 list_for_each(p, head) { ··· 201 204 dget_dlock(active); 202 205 spin_unlock(&active->d_lock); 203 206 spin_unlock(&sbi->lookup_lock); 204 - spin_unlock(&autofs4_lock); 205 207 return active; 206 208 } 207 209 next: 208 210 spin_unlock(&active->d_lock); 209 211 } 210 212 spin_unlock(&sbi->lookup_lock); 211 - spin_unlock(&autofs4_lock); 212 213 213 214 return NULL; 214 215 } ··· 221 226 const unsigned char *str = name->name; 222 227 struct list_head *p, *head; 223 228 224 - spin_lock(&autofs4_lock); 225 229 spin_lock(&sbi->lookup_lock); 226 230 head = &sbi->expiring_list; 227 231 list_for_each(p, head) { ··· 253 259 dget_dlock(expiring); 254 260 spin_unlock(&expiring->d_lock); 255 261 spin_unlock(&sbi->lookup_lock); 256 - spin_unlock(&autofs4_lock); 257 262 return expiring; 258 263 } 259 264 next: 260 265 spin_unlock(&expiring->d_lock); 261 266 } 262 267 spin_unlock(&sbi->lookup_lock); 263 - spin_unlock(&autofs4_lock); 264 268 265 269 return NULL; 266 270 } ··· 267 275 { 268 276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 269 277 struct autofs_info *ino = autofs4_dentry_ino(dentry); 270 - int status; 278 + int status = 0; 271 279 272 280 if (ino->flags & AUTOFS_INF_PENDING) { 273 281 DPRINTK("waiting for mount name=%.*s", 274 282 dentry->d_name.len, dentry->d_name.name); 275 283 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 276 284 DPRINTK("mount wait done status=%d", status); 277 - ino->last_used = jiffies; 278 - return status; 279 285 } 280 - return 0; 286 + ino->last_used = jiffies; 287 + return status; 281 288 } 282 289 283 290 static int do_expire_wait(struct dentry *dentry) ··· 310 319 */ 311 320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 312 321 struct dentry *parent = dentry->d_parent; 322 + struct autofs_info *ino; 313 323 struct dentry *new = d_lookup(parent, &dentry->d_name); 314 324 if (!new) 315 325 return NULL; 326 + ino = autofs4_dentry_ino(new); 327 + ino->last_used = jiffies; 316 328 dput(path->dentry); 317 329 path->dentry = new; 318 330 } ··· 331 337 332 338 DPRINTK("dentry=%p %.*s", 333 339 dentry, dentry->d_name.len, dentry->d_name.name); 334 - 335 - /* 336 - * Someone may have manually umounted this or it was a submount 337 - * that has gone away. 
338 - */ 339 - spin_lock(&dentry->d_lock); 340 - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 341 - if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 342 - (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 343 - __managed_dentry_set_transit(path->dentry); 344 - } 345 - spin_unlock(&dentry->d_lock); 346 340 347 341 /* The daemon never triggers a mount. */ 348 342 if (autofs4_oz_mode(sbi)) ··· 400 418 done: 401 419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 402 420 /* 403 - * Any needed mounting has been completed and the path updated 404 - * so turn this into a normal dentry so we don't continually 405 - * call ->d_automount() and ->d_manage(). 406 - */ 407 - spin_lock(&dentry->d_lock); 408 - __managed_dentry_clear_transit(dentry); 409 - /* 421 + * Any needed mounting has been completed and the path 422 + * updated so clear DCACHE_NEED_AUTOMOUNT so we don't 423 + * call ->d_automount() on rootless multi-mounts since 424 + * it can lead to an incorrect ELOOP error return. 425 + * 410 426 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and 411 427 * symlinks as in all other cases the dentry will be covered by 412 428 * an actual mount so ->d_automount() won't be called during 413 429 * the follow. 414 430 */ 431 + spin_lock(&dentry->d_lock); 415 432 if ((!d_mountpoint(dentry) && 416 433 !list_empty(&dentry->d_subdirs)) || 417 434 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) ··· 436 455 437 456 /* The daemon never waits. */ 438 457 if (autofs4_oz_mode(sbi)) { 458 + if (rcu_walk) 459 + return 0; 439 460 if (!d_mountpoint(dentry)) 440 461 return -EISDIR; 441 462 return 0; ··· 595 612 596 613 dir->i_mtime = CURRENT_TIME; 597 614 598 - spin_lock(&autofs4_lock); 599 - autofs4_add_expiring(dentry); 615 + spin_lock(&sbi->lookup_lock); 616 + __autofs4_add_expiring(dentry); 600 617 spin_lock(&dentry->d_lock); 601 618 __d_drop(dentry); 602 619 spin_unlock(&dentry->d_lock); 603 - spin_unlock(&autofs4_lock); 620 + spin_unlock(&sbi->lookup_lock); 604 621 605 622 return 0; 606 623 } ··· 669 686 if (!autofs4_oz_mode(sbi)) 670 687 return -EACCES; 671 688 672 - spin_lock(&autofs4_lock); 673 689 spin_lock(&sbi->lookup_lock); 674 690 spin_lock(&dentry->d_lock); 675 691 if (!list_empty(&dentry->d_subdirs)) { 676 692 spin_unlock(&dentry->d_lock); 677 693 spin_unlock(&sbi->lookup_lock); 678 - spin_unlock(&autofs4_lock); 679 694 return -ENOTEMPTY; 680 695 } 681 696 __autofs4_add_expiring(dentry); 682 - spin_unlock(&sbi->lookup_lock); 683 697 __d_drop(dentry); 684 698 spin_unlock(&dentry->d_lock); 685 - spin_unlock(&autofs4_lock); 699 + spin_unlock(&sbi->lookup_lock); 686 700 687 701 if (sbi->version < 5) 688 702 autofs_clear_leaf_automount_flags(dentry);
+3 -3
fs/autofs4/waitq.c
···
197 197
198 198 seq = read_seqbegin(&rename_lock);
199 199 rcu_read_lock();
200 - spin_lock(&autofs4_lock);
200 + spin_lock(&sbi->fs_lock);
201 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
202 202 len += tmp->d_name.len + 1;
203 203
204 204 if (!len || --len > NAME_MAX) {
205 - spin_unlock(&autofs4_lock);
205 + spin_unlock(&sbi->fs_lock);
206 206 rcu_read_unlock();
207 207 if (read_seqretry(&rename_lock, seq))
208 208 goto rename_retry;
···
218 218 p -= tmp->d_name.len;
219 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
220 220 }
221 - spin_unlock(&autofs4_lock);
221 + spin_unlock(&sbi->fs_lock);
222 222 rcu_read_unlock();
223 223 if (read_seqretry(&rename_lock, seq))
224 224 goto rename_retry;
+4 -2
fs/block_dev.c
···
55 55 static void bdev_inode_switch_bdi(struct inode *inode,
56 56 struct backing_dev_info *dst)
57 57 {
58 - spin_lock(&inode_lock);
58 + spin_lock(&inode_wb_list_lock);
59 + spin_lock(&inode->i_lock);
59 60 inode->i_data.backing_dev_info = dst;
60 61 if (inode->i_state & I_DIRTY)
61 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 - spin_unlock(&inode_lock);
63 + spin_unlock(&inode->i_lock);
64 + spin_unlock(&inode_wb_list_lock);
63 65 }
64 66
65 67 static sector_t max_block(struct block_device *bdev)
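
The hunk above also shows the nesting rule that the series documents in
fs/inode.c: when both are needed, inode_wb_list_lock (protecting the
bdi->wb.b_dirty/b_io/b_more_io lists) is taken outside inode->i_lock
(protecting i_state). Sketched on its own, as an illustration of the ordering
rather than extra code from the commit:

	spin_lock(&inode_wb_list_lock);		/* outer: writeback list lock */
	spin_lock(&inode->i_lock);		/* inner: per-inode state lock */
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
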
+1 -1
fs/buffer.c
···
1138 1138 * inode list.
1139 1139 *
1140 1140 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1141 - * mapping->tree_lock and the global inode_lock.
1141 + * mapping->tree_lock and mapping->host->i_lock.
1142 1142 */
1143 1143 void mark_buffer_dirty(struct buffer_head *bh)
1144 1144 {
+11 -7
fs/drop_caches.c
···
8 8 #include <linux/writeback.h>
9 9 #include <linux/sysctl.h>
10 10 #include <linux/gfp.h>
11 + #include "internal.h"
11 12
12 13 /* A global variable is a bit ugly, but it keeps the code simple */
13 14 int sysctl_drop_caches;
···
17 16 {
18 17 struct inode *inode, *toput_inode = NULL;
19 18
20 - spin_lock(&inode_lock);
19 + spin_lock(&inode_sb_list_lock);
21 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
22 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
21 + spin_lock(&inode->i_lock);
22 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
23 + (inode->i_mapping->nrpages == 0)) {
24 + spin_unlock(&inode->i_lock);
23 25 continue;
24 - if (inode->i_mapping->nrpages == 0)
25 - continue;
26 + }
26 27 __iget(inode);
27 - spin_unlock(&inode_lock);
28 + spin_unlock(&inode->i_lock);
29 + spin_unlock(&inode_sb_list_lock);
28 30 invalidate_mapping_pages(inode->i_mapping, 0, -1);
29 31 iput(toput_inode);
30 32 toput_inode = inode;
31 - spin_lock(&inode_lock);
33 + spin_lock(&inode_sb_list_lock);
32 34 }
33 - spin_unlock(&inode_lock);
35 + spin_unlock(&inode_sb_list_lock);
34 36 iput(toput_inode);
35 37 }
36 38
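
The loop above is the iteration skeleton the series uses whenever a
superblock's inode list is walked (fs-writeback.c below does the same): hold
inode_sb_list_lock to traverse sb->s_inodes, take each inode's i_lock just long
enough to test i_state and grab a reference with __iget(), then drop both locks
before doing real work. The reference keeps the inode on s_inodes while the
list lock is dropped, and iput() of the previous inode is deferred because it
may drop the last reference and cannot be called under inode_sb_list_lock.
A condensed sketch, assuming a given struct super_block *sb:

	struct inode *inode, *old_inode = NULL;

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);		/* pins the inode on s_inodes */
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);

		/* ... work on the inode with no list lock held ... */

		iput(old_inode);	/* never iput() under the list lock */
		old_inode = inode;
		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
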
+91 -50
fs/fs-writeback.c
··· 176 176 } 177 177 178 178 /* 179 + * Remove the inode from the writeback list it is on. 180 + */ 181 + void inode_wb_list_del(struct inode *inode) 182 + { 183 + spin_lock(&inode_wb_list_lock); 184 + list_del_init(&inode->i_wb_list); 185 + spin_unlock(&inode_wb_list_lock); 186 + } 187 + 188 + 189 + /* 179 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 180 191 * furthest end of its superblock's dirty-inode list. 181 192 * ··· 199 188 { 200 189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 201 190 191 + assert_spin_locked(&inode_wb_list_lock); 202 192 if (!list_empty(&wb->b_dirty)) { 203 193 struct inode *tail; 204 194 ··· 217 205 { 218 206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 207 208 + assert_spin_locked(&inode_wb_list_lock); 220 209 list_move(&inode->i_wb_list, &wb->b_more_io); 221 210 } 222 211 223 212 static void inode_sync_complete(struct inode *inode) 224 213 { 225 214 /* 226 - * Prevent speculative execution through spin_unlock(&inode_lock); 215 + * Prevent speculative execution through 216 + * spin_unlock(&inode_wb_list_lock); 227 217 */ 218 + 228 219 smp_mb(); 229 220 wake_up_bit(&inode->i_state, __I_SYNC); 230 221 } ··· 301 286 */ 302 287 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 303 288 { 289 + assert_spin_locked(&inode_wb_list_lock); 304 290 list_splice_init(&wb->b_more_io, &wb->b_io); 305 291 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 292 } ··· 322 306 wait_queue_head_t *wqh; 323 307 324 308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 325 - while (inode->i_state & I_SYNC) { 326 - spin_unlock(&inode_lock); 309 + while (inode->i_state & I_SYNC) { 310 + spin_unlock(&inode->i_lock); 311 + spin_unlock(&inode_wb_list_lock); 327 312 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 - spin_lock(&inode_lock); 313 + spin_lock(&inode_wb_list_lock); 314 + spin_lock(&inode->i_lock); 329 315 } 330 316 } 331 317 332 318 /* 333 - * Write out an inode's dirty pages. Called under inode_lock. Either the 334 - * caller has ref on the inode (either via __iget or via syscall against an fd) 335 - * or the inode has I_WILL_FREE set (via generic_forget_inode) 319 + * Write out an inode's dirty pages. Called under inode_wb_list_lock and 320 + * inode->i_lock. Either the caller has an active reference on the inode or 321 + * the inode has I_WILL_FREE set. 336 322 * 337 323 * If `wait' is set, wait on the writeout. 338 324 * 339 325 * The whole writeout design is quite complex and fragile. We want to avoid 340 326 * starvation of particular inodes when others are being redirtied, prevent 341 327 * livelocks, etc. 342 - * 343 - * Called under inode_lock. 
344 328 */ 345 329 static int 346 330 writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ··· 348 332 struct address_space *mapping = inode->i_mapping; 349 333 unsigned dirty; 350 334 int ret; 335 + 336 + assert_spin_locked(&inode_wb_list_lock); 337 + assert_spin_locked(&inode->i_lock); 351 338 352 339 if (!atomic_read(&inode->i_count)) 353 340 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); ··· 382 363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 383 364 inode->i_state |= I_SYNC; 384 365 inode->i_state &= ~I_DIRTY_PAGES; 385 - spin_unlock(&inode_lock); 366 + spin_unlock(&inode->i_lock); 367 + spin_unlock(&inode_wb_list_lock); 386 368 387 369 ret = do_writepages(mapping, wbc); 388 370 ··· 403 383 * due to delalloc, clear dirty metadata flags right before 404 384 * write_inode() 405 385 */ 406 - spin_lock(&inode_lock); 386 + spin_lock(&inode->i_lock); 407 387 dirty = inode->i_state & I_DIRTY; 408 388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 409 - spin_unlock(&inode_lock); 389 + spin_unlock(&inode->i_lock); 410 390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 411 391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 412 392 int err = write_inode(inode, wbc); ··· 414 394 ret = err; 415 395 } 416 396 417 - spin_lock(&inode_lock); 397 + spin_lock(&inode_wb_list_lock); 398 + spin_lock(&inode->i_lock); 418 399 inode->i_state &= ~I_SYNC; 419 400 if (!(inode->i_state & I_FREEING)) { 420 401 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ··· 527 506 * kind does not need peridic writeout yet, and for the latter 528 507 * kind writeout is handled by the freer. 529 508 */ 509 + spin_lock(&inode->i_lock); 530 510 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 511 + spin_unlock(&inode->i_lock); 531 512 requeue_io(inode); 532 513 continue; 533 514 } ··· 538 515 * Was this inode dirtied after sync_sb_inodes was called? 539 516 * This keeps sync from extra jobs and livelock. 
540 517 */ 541 - if (inode_dirtied_after(inode, wbc->wb_start)) 518 + if (inode_dirtied_after(inode, wbc->wb_start)) { 519 + spin_unlock(&inode->i_lock); 542 520 return 1; 521 + } 543 522 544 523 __iget(inode); 524 + 545 525 pages_skipped = wbc->pages_skipped; 546 526 writeback_single_inode(inode, wbc); 547 527 if (wbc->pages_skipped != pages_skipped) { ··· 554 528 */ 555 529 redirty_tail(inode); 556 530 } 557 - spin_unlock(&inode_lock); 531 + spin_unlock(&inode->i_lock); 532 + spin_unlock(&inode_wb_list_lock); 558 533 iput(inode); 559 534 cond_resched(); 560 - spin_lock(&inode_lock); 535 + spin_lock(&inode_wb_list_lock); 561 536 if (wbc->nr_to_write <= 0) { 562 537 wbc->more_io = 1; 563 538 return 1; ··· 577 550 578 551 if (!wbc->wb_start) 579 552 wbc->wb_start = jiffies; /* livelock avoidance */ 580 - spin_lock(&inode_lock); 553 + spin_lock(&inode_wb_list_lock); 581 554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 582 555 queue_io(wb, wbc->older_than_this); 583 556 ··· 595 568 if (ret) 596 569 break; 597 570 } 598 - spin_unlock(&inode_lock); 571 + spin_unlock(&inode_wb_list_lock); 599 572 /* Leave any unwritten inodes on b_io */ 600 573 } 601 574 ··· 604 577 { 605 578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 606 579 607 - spin_lock(&inode_lock); 580 + spin_lock(&inode_wb_list_lock); 608 581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 609 582 queue_io(wb, wbc->older_than_this); 610 583 writeback_sb_inodes(sb, wb, wbc, true); 611 - spin_unlock(&inode_lock); 584 + spin_unlock(&inode_wb_list_lock); 612 585 } 613 586 614 587 /* ··· 747 720 * become available for writeback. Otherwise 748 721 * we'll just busyloop. 749 722 */ 750 - spin_lock(&inode_lock); 723 + spin_lock(&inode_wb_list_lock); 751 724 if (!list_empty(&wb->b_more_io)) { 752 725 inode = wb_inode(wb->b_more_io.prev); 753 726 trace_wbc_writeback_wait(&wbc, wb->bdi); 727 + spin_lock(&inode->i_lock); 754 728 inode_wait_for_writeback(inode); 729 + spin_unlock(&inode->i_lock); 755 730 } 756 - spin_unlock(&inode_lock); 731 + spin_unlock(&inode_wb_list_lock); 757 732 } 758 733 759 734 return wrote; ··· 1021 992 { 1022 993 struct super_block *sb = inode->i_sb; 1023 994 struct backing_dev_info *bdi = NULL; 1024 - bool wakeup_bdi = false; 1025 995 1026 996 /* 1027 997 * Don't do this for I_DIRTY_PAGES - that doesn't actually ··· 1044 1016 if (unlikely(block_dump)) 1045 1017 block_dump___mark_inode_dirty(inode); 1046 1018 1047 - spin_lock(&inode_lock); 1019 + spin_lock(&inode->i_lock); 1048 1020 if ((inode->i_state & flags) != flags) { 1049 1021 const int was_dirty = inode->i_state & I_DIRTY; 1050 1022 ··· 1056 1028 * superblock list, based upon its state. 1057 1029 */ 1058 1030 if (inode->i_state & I_SYNC) 1059 - goto out; 1031 + goto out_unlock_inode; 1060 1032 1061 1033 /* 1062 1034 * Only add valid (hashed) inodes to the superblock's ··· 1064 1036 */ 1065 1037 if (!S_ISBLK(inode->i_mode)) { 1066 1038 if (inode_unhashed(inode)) 1067 - goto out; 1039 + goto out_unlock_inode; 1068 1040 } 1069 1041 if (inode->i_state & I_FREEING) 1070 - goto out; 1042 + goto out_unlock_inode; 1071 1043 1072 1044 /* 1073 1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1074 1046 * reposition it (that would break b_dirty time-ordering). 
1075 1047 */ 1076 1048 if (!was_dirty) { 1049 + bool wakeup_bdi = false; 1077 1050 bdi = inode_to_bdi(inode); 1078 1051 1079 1052 if (bdi_cap_writeback_dirty(bdi)) { ··· 1091 1062 wakeup_bdi = true; 1092 1063 } 1093 1064 1065 + spin_unlock(&inode->i_lock); 1066 + spin_lock(&inode_wb_list_lock); 1094 1067 inode->dirtied_when = jiffies; 1095 1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1069 + spin_unlock(&inode_wb_list_lock); 1070 + 1071 + if (wakeup_bdi) 1072 + bdi_wakeup_thread_delayed(bdi); 1073 + return; 1096 1074 } 1097 1075 } 1098 - out: 1099 - spin_unlock(&inode_lock); 1076 + out_unlock_inode: 1077 + spin_unlock(&inode->i_lock); 1100 1078 1101 - if (wakeup_bdi) 1102 - bdi_wakeup_thread_delayed(bdi); 1103 1079 } 1104 1080 EXPORT_SYMBOL(__mark_inode_dirty); 1105 1081 ··· 1135 1101 */ 1136 1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1137 1103 1138 - spin_lock(&inode_lock); 1104 + spin_lock(&inode_sb_list_lock); 1139 1105 1140 1106 /* 1141 1107 * Data integrity sync. Must wait for all pages under writeback, ··· 1145 1111 * we still have to wait for that writeout. 1146 1112 */ 1147 1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1148 - struct address_space *mapping; 1114 + struct address_space *mapping = inode->i_mapping; 1149 1115 1150 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1116 + spin_lock(&inode->i_lock); 1117 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 1118 + (mapping->nrpages == 0)) { 1119 + spin_unlock(&inode->i_lock); 1151 1120 continue; 1152 - mapping = inode->i_mapping; 1153 - if (mapping->nrpages == 0) 1154 - continue; 1121 + } 1155 1122 __iget(inode); 1156 - spin_unlock(&inode_lock); 1123 + spin_unlock(&inode->i_lock); 1124 + spin_unlock(&inode_sb_list_lock); 1125 + 1157 1126 /* 1158 - * We hold a reference to 'inode' so it couldn't have 1159 - * been removed from s_inodes list while we dropped the 1160 - * inode_lock. We cannot iput the inode now as we can 1161 - * be holding the last reference and we cannot iput it 1162 - * under inode_lock. So we keep the reference and iput 1163 - * it later. 1127 + * We hold a reference to 'inode' so it couldn't have been 1128 + * removed from s_inodes list while we dropped the 1129 + * inode_sb_list_lock. We cannot iput the inode now as we can 1130 + * be holding the last reference and we cannot iput it under 1131 + * inode_sb_list_lock. So we keep the reference and iput it 1132 + * later. 1164 1133 */ 1165 1134 iput(old_inode); 1166 1135 old_inode = inode; ··· 1172 1135 1173 1136 cond_resched(); 1174 1137 1175 - spin_lock(&inode_lock); 1138 + spin_lock(&inode_sb_list_lock); 1176 1139 } 1177 - spin_unlock(&inode_lock); 1140 + spin_unlock(&inode_sb_list_lock); 1178 1141 iput(old_inode); 1179 1142 } 1180 1143 ··· 1308 1271 wbc.nr_to_write = 0; 1309 1272 1310 1273 might_sleep(); 1311 - spin_lock(&inode_lock); 1274 + spin_lock(&inode_wb_list_lock); 1275 + spin_lock(&inode->i_lock); 1312 1276 ret = writeback_single_inode(inode, &wbc); 1313 - spin_unlock(&inode_lock); 1277 + spin_unlock(&inode->i_lock); 1278 + spin_unlock(&inode_wb_list_lock); 1314 1279 if (sync) 1315 1280 inode_sync_wait(inode); 1316 1281 return ret; ··· 1334 1295 { 1335 1296 int ret; 1336 1297 1337 - spin_lock(&inode_lock); 1298 + spin_lock(&inode_wb_list_lock); 1299 + spin_lock(&inode->i_lock); 1338 1300 ret = writeback_single_inode(inode, wbc); 1339 - spin_unlock(&inode_lock); 1301 + spin_unlock(&inode->i_lock); 1302 + spin_unlock(&inode_wb_list_lock); 1340 1303 return ret; 1341 1304 } 1342 1305 EXPORT_SYMBOL(sync_inode);
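
After this patch writeback_single_inode() is entered with both
inode_wb_list_lock and inode->i_lock held (it asserts both) and drops them
around the call to do_writepages(). Callers such as write_inode_now() and
sync_inode() therefore wrap it as in this sketch, with wbc set up as in those
functions:

	spin_lock(&inode_wb_list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
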
+319 -351
fs/inode.c
··· 26 26 #include <linux/posix_acl.h> 27 27 #include <linux/ima.h> 28 28 #include <linux/cred.h> 29 + #include "internal.h" 30 + 31 + /* 32 + * inode locking rules. 33 + * 34 + * inode->i_lock protects: 35 + * inode->i_state, inode->i_hash, __iget() 36 + * inode_lru_lock protects: 37 + * inode_lru, inode->i_lru 38 + * inode_sb_list_lock protects: 39 + * sb->s_inodes, inode->i_sb_list 40 + * inode_wb_list_lock protects: 41 + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 42 + * inode_hash_lock protects: 43 + * inode_hashtable, inode->i_hash 44 + * 45 + * Lock ordering: 46 + * 47 + * inode_sb_list_lock 48 + * inode->i_lock 49 + * inode_lru_lock 50 + * 51 + * inode_wb_list_lock 52 + * inode->i_lock 53 + * 54 + * inode_hash_lock 55 + * inode_sb_list_lock 56 + * inode->i_lock 57 + * 58 + * iunique_lock 59 + * inode_hash_lock 60 + */ 29 61 30 62 /* 31 63 * This is needed for the following functions: ··· 92 60 93 61 static unsigned int i_hash_mask __read_mostly; 94 62 static unsigned int i_hash_shift __read_mostly; 63 + static struct hlist_head *inode_hashtable __read_mostly; 64 + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 95 65 96 66 /* 97 67 * Each inode can be on two separate lists. One is ··· 108 74 */ 109 75 110 76 static LIST_HEAD(inode_lru); 111 - static struct hlist_head *inode_hashtable __read_mostly; 77 + static DEFINE_SPINLOCK(inode_lru_lock); 112 78 113 - /* 114 - * A simple spinlock to protect the list manipulations. 115 - * 116 - * NOTE! You also have to own the lock if you change 117 - * the i_state of an inode while it is in use.. 118 - */ 119 - DEFINE_SPINLOCK(inode_lock); 79 + __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 80 + __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); 120 81 121 82 /* 122 83 * iprune_sem provides exclusion between the icache shrinking and the ··· 165 136 return proc_dointvec(table, write, buffer, lenp, ppos); 166 137 } 167 138 #endif 168 - 169 - static void wake_up_inode(struct inode *inode) 170 - { 171 - /* 172 - * Prevent speculative execution through spin_unlock(&inode_lock); 173 - */ 174 - smp_mb(); 175 - wake_up_bit(&inode->i_state, __I_NEW); 176 - } 177 139 178 140 /** 179 141 * inode_init_always - perform inode structure intialisation ··· 356 336 } 357 337 358 338 /* 359 - * inode_lock must be held 339 + * inode->i_lock must be held 360 340 */ 361 341 void __iget(struct inode *inode) 362 342 { ··· 374 354 375 355 static void inode_lru_list_add(struct inode *inode) 376 356 { 357 + spin_lock(&inode_lru_lock); 377 358 if (list_empty(&inode->i_lru)) { 378 359 list_add(&inode->i_lru, &inode_lru); 379 360 inodes_stat.nr_unused++; 380 361 } 362 + spin_unlock(&inode_lru_lock); 381 363 } 382 364 383 365 static void inode_lru_list_del(struct inode *inode) 384 366 { 367 + spin_lock(&inode_lru_lock); 385 368 if (!list_empty(&inode->i_lru)) { 386 369 list_del_init(&inode->i_lru); 387 370 inodes_stat.nr_unused--; 388 371 } 389 - } 390 - 391 - static inline void __inode_sb_list_add(struct inode *inode) 392 - { 393 - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 372 + spin_unlock(&inode_lru_lock); 394 373 } 395 374 396 375 /** ··· 398 379 */ 399 380 void inode_sb_list_add(struct inode *inode) 400 381 { 401 - spin_lock(&inode_lock); 402 - __inode_sb_list_add(inode); 403 - spin_unlock(&inode_lock); 382 + spin_lock(&inode_sb_list_lock); 383 + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 384 + spin_unlock(&inode_sb_list_lock); 404 385 } 405 386 EXPORT_SYMBOL_GPL(inode_sb_list_add); 406 387 407 
- static inline void __inode_sb_list_del(struct inode *inode) 388 + static inline void inode_sb_list_del(struct inode *inode) 408 389 { 390 + spin_lock(&inode_sb_list_lock); 409 391 list_del_init(&inode->i_sb_list); 392 + spin_unlock(&inode_sb_list_lock); 410 393 } 411 394 412 395 static unsigned long hash(struct super_block *sb, unsigned long hashval) ··· 433 412 { 434 413 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 435 414 436 - spin_lock(&inode_lock); 415 + spin_lock(&inode_hash_lock); 416 + spin_lock(&inode->i_lock); 437 417 hlist_add_head(&inode->i_hash, b); 438 - spin_unlock(&inode_lock); 418 + spin_unlock(&inode->i_lock); 419 + spin_unlock(&inode_hash_lock); 439 420 } 440 421 EXPORT_SYMBOL(__insert_inode_hash); 441 - 442 - /** 443 - * __remove_inode_hash - remove an inode from the hash 444 - * @inode: inode to unhash 445 - * 446 - * Remove an inode from the superblock. 447 - */ 448 - static void __remove_inode_hash(struct inode *inode) 449 - { 450 - hlist_del_init(&inode->i_hash); 451 - } 452 422 453 423 /** 454 424 * remove_inode_hash - remove an inode from the hash ··· 449 437 */ 450 438 void remove_inode_hash(struct inode *inode) 451 439 { 452 - spin_lock(&inode_lock); 440 + spin_lock(&inode_hash_lock); 441 + spin_lock(&inode->i_lock); 453 442 hlist_del_init(&inode->i_hash); 454 - spin_unlock(&inode_lock); 443 + spin_unlock(&inode->i_lock); 444 + spin_unlock(&inode_hash_lock); 455 445 } 456 446 EXPORT_SYMBOL(remove_inode_hash); 457 447 ··· 470 456 } 471 457 EXPORT_SYMBOL(end_writeback); 472 458 459 + /* 460 + * Free the inode passed in, removing it from the lists it is still connected 461 + * to. We remove any pages still attached to the inode and wait for any IO that 462 + * is still in progress before finally destroying the inode. 463 + * 464 + * An inode must already be marked I_FREEING so that we avoid the inode being 465 + * moved back onto lists if we race with other code that manipulates the lists 466 + * (e.g. writeback_single_inode). The caller is responsible for setting this. 467 + * 468 + * An inode must already be removed from the LRU list before being evicted from 469 + * the cache. This should occur atomically with setting the I_FREEING state 470 + * flag, so no inodes here should ever be on the LRU when being evicted. 
471 + */ 473 472 static void evict(struct inode *inode) 474 473 { 475 474 const struct super_operations *op = inode->i_sb->s_op; 475 + 476 + BUG_ON(!(inode->i_state & I_FREEING)); 477 + BUG_ON(!list_empty(&inode->i_lru)); 478 + 479 + inode_wb_list_del(inode); 480 + inode_sb_list_del(inode); 476 481 477 482 if (op->evict_inode) { 478 483 op->evict_inode(inode); ··· 504 471 bd_forget(inode); 505 472 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 506 473 cd_forget(inode); 474 + 475 + remove_inode_hash(inode); 476 + 477 + spin_lock(&inode->i_lock); 478 + wake_up_bit(&inode->i_state, __I_NEW); 479 + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 480 + spin_unlock(&inode->i_lock); 481 + 482 + destroy_inode(inode); 507 483 } 508 484 509 485 /* ··· 531 489 list_del_init(&inode->i_lru); 532 490 533 491 evict(inode); 534 - 535 - spin_lock(&inode_lock); 536 - __remove_inode_hash(inode); 537 - __inode_sb_list_del(inode); 538 - spin_unlock(&inode_lock); 539 - 540 - wake_up_inode(inode); 541 - destroy_inode(inode); 542 492 } 543 493 } 544 494 ··· 548 514 struct inode *inode, *next; 549 515 LIST_HEAD(dispose); 550 516 551 - spin_lock(&inode_lock); 517 + spin_lock(&inode_sb_list_lock); 552 518 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 553 519 if (atomic_read(&inode->i_count)) 554 520 continue; 555 - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 521 + 522 + spin_lock(&inode->i_lock); 523 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 524 + spin_unlock(&inode->i_lock); 556 525 continue; 526 + } 557 527 558 528 inode->i_state |= I_FREEING; 559 - 560 - /* 561 - * Move the inode off the IO lists and LRU once I_FREEING is 562 - * set so that it won't get moved back on there if it is dirty. 563 - */ 564 - list_move(&inode->i_lru, &dispose); 565 - list_del_init(&inode->i_wb_list); 566 - if (!(inode->i_state & (I_DIRTY | I_SYNC))) 567 - inodes_stat.nr_unused--; 529 + inode_lru_list_del(inode); 530 + spin_unlock(&inode->i_lock); 531 + list_add(&inode->i_lru, &dispose); 568 532 } 569 - spin_unlock(&inode_lock); 533 + spin_unlock(&inode_sb_list_lock); 570 534 571 535 dispose_list(&dispose); 572 536 ··· 593 561 struct inode *inode, *next; 594 562 LIST_HEAD(dispose); 595 563 596 - spin_lock(&inode_lock); 564 + spin_lock(&inode_sb_list_lock); 597 565 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 598 - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 566 + spin_lock(&inode->i_lock); 567 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 + spin_unlock(&inode->i_lock); 599 569 continue; 570 + } 600 571 if (inode->i_state & I_DIRTY && !kill_dirty) { 572 + spin_unlock(&inode->i_lock); 601 573 busy = 1; 602 574 continue; 603 575 } 604 576 if (atomic_read(&inode->i_count)) { 577 + spin_unlock(&inode->i_lock); 605 578 busy = 1; 606 579 continue; 607 580 } 608 581 609 582 inode->i_state |= I_FREEING; 610 - 611 - /* 612 - * Move the inode off the IO lists and LRU once I_FREEING is 613 - * set so that it won't get moved back on there if it is dirty. 614 - */ 615 - list_move(&inode->i_lru, &dispose); 616 - list_del_init(&inode->i_wb_list); 617 - if (!(inode->i_state & (I_DIRTY | I_SYNC))) 618 - inodes_stat.nr_unused--; 583 + inode_lru_list_del(inode); 584 + spin_unlock(&inode->i_lock); 585 + list_add(&inode->i_lru, &dispose); 619 586 } 620 - spin_unlock(&inode_lock); 587 + spin_unlock(&inode_sb_list_lock); 621 588 622 589 dispose_list(&dispose); 623 590 ··· 638 607 639 608 /* 640 609 * Scan `goal' inodes on the unused list for freeable ones. 
They are moved to a 641 - * temporary list and then are freed outside inode_lock by dispose_list(). 610 + * temporary list and then are freed outside inode_lru_lock by dispose_list(). 642 611 * 643 612 * Any inodes which are pinned purely because of attached pagecache have their 644 613 * pagecache removed. If the inode has metadata buffers attached to ··· 659 628 unsigned long reap = 0; 660 629 661 630 down_read(&iprune_sem); 662 - spin_lock(&inode_lock); 631 + spin_lock(&inode_lru_lock); 663 632 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 664 633 struct inode *inode; 665 634 ··· 669 638 inode = list_entry(inode_lru.prev, struct inode, i_lru); 670 639 671 640 /* 641 + * we are inverting the inode_lru_lock/inode->i_lock here, 642 + * so use a trylock. If we fail to get the lock, just move the 643 + * inode to the back of the list so we don't spin on it. 644 + */ 645 + if (!spin_trylock(&inode->i_lock)) { 646 + list_move(&inode->i_lru, &inode_lru); 647 + continue; 648 + } 649 + 650 + /* 672 651 * Referenced or dirty inodes are still in use. Give them 673 652 * another pass through the LRU as we canot reclaim them now. 674 653 */ 675 654 if (atomic_read(&inode->i_count) || 676 655 (inode->i_state & ~I_REFERENCED)) { 677 656 list_del_init(&inode->i_lru); 657 + spin_unlock(&inode->i_lock); 678 658 inodes_stat.nr_unused--; 679 659 continue; 680 660 } 681 661 682 662 /* recently referenced inodes get one more pass */ 683 663 if (inode->i_state & I_REFERENCED) { 684 - list_move(&inode->i_lru, &inode_lru); 685 664 inode->i_state &= ~I_REFERENCED; 665 + list_move(&inode->i_lru, &inode_lru); 666 + spin_unlock(&inode->i_lock); 686 667 continue; 687 668 } 688 669 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 689 670 __iget(inode); 690 - spin_unlock(&inode_lock); 671 + spin_unlock(&inode->i_lock); 672 + spin_unlock(&inode_lru_lock); 691 673 if (remove_inode_buffers(inode)) 692 674 reap += invalidate_mapping_pages(&inode->i_data, 693 675 0, -1); 694 676 iput(inode); 695 - spin_lock(&inode_lock); 677 + spin_lock(&inode_lru_lock); 696 678 697 679 if (inode != list_entry(inode_lru.next, 698 680 struct inode, i_lru)) 699 681 continue; /* wrong inode or list_empty */ 700 - if (!can_unuse(inode)) 682 + /* avoid lock inversions with trylock */ 683 + if (!spin_trylock(&inode->i_lock)) 701 684 continue; 685 + if (!can_unuse(inode)) { 686 + spin_unlock(&inode->i_lock); 687 + continue; 688 + } 702 689 } 703 690 WARN_ON(inode->i_state & I_NEW); 704 691 inode->i_state |= I_FREEING; 692 + spin_unlock(&inode->i_lock); 705 693 706 - /* 707 - * Move the inode off the IO lists and LRU once I_FREEING is 708 - * set so that it won't get moved back on there if it is dirty. 
709 - */ 710 694 list_move(&inode->i_lru, &freeable); 711 - list_del_init(&inode->i_wb_list); 712 695 inodes_stat.nr_unused--; 713 696 } 714 697 if (current_is_kswapd()) 715 698 __count_vm_events(KSWAPD_INODESTEAL, reap); 716 699 else 717 700 __count_vm_events(PGINODESTEAL, reap); 718 - spin_unlock(&inode_lock); 701 + spin_unlock(&inode_lru_lock); 719 702 720 703 dispose_list(&freeable); 721 704 up_read(&iprune_sem); ··· 778 733 779 734 repeat: 780 735 hlist_for_each_entry(inode, node, head, i_hash) { 781 - if (inode->i_sb != sb) 736 + spin_lock(&inode->i_lock); 737 + if (inode->i_sb != sb) { 738 + spin_unlock(&inode->i_lock); 782 739 continue; 783 - if (!test(inode, data)) 740 + } 741 + if (!test(inode, data)) { 742 + spin_unlock(&inode->i_lock); 784 743 continue; 744 + } 785 745 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 786 746 __wait_on_freeing_inode(inode); 787 747 goto repeat; 788 748 } 789 749 __iget(inode); 750 + spin_unlock(&inode->i_lock); 790 751 return inode; 791 752 } 792 753 return NULL; ··· 810 759 811 760 repeat: 812 761 hlist_for_each_entry(inode, node, head, i_hash) { 813 - if (inode->i_ino != ino) 762 + spin_lock(&inode->i_lock); 763 + if (inode->i_ino != ino) { 764 + spin_unlock(&inode->i_lock); 814 765 continue; 815 - if (inode->i_sb != sb) 766 + } 767 + if (inode->i_sb != sb) { 768 + spin_unlock(&inode->i_lock); 816 769 continue; 770 + } 817 771 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 818 772 __wait_on_freeing_inode(inode); 819 773 goto repeat; 820 774 } 821 775 __iget(inode); 776 + spin_unlock(&inode->i_lock); 822 777 return inode; 823 778 } 824 779 return NULL; ··· 884 827 { 885 828 struct inode *inode; 886 829 887 - spin_lock_prefetch(&inode_lock); 830 + spin_lock_prefetch(&inode_sb_list_lock); 888 831 889 832 inode = alloc_inode(sb); 890 833 if (inode) { 891 - spin_lock(&inode_lock); 892 - __inode_sb_list_add(inode); 834 + spin_lock(&inode->i_lock); 893 835 inode->i_state = 0; 894 - spin_unlock(&inode_lock); 836 + spin_unlock(&inode->i_lock); 837 + inode_sb_list_add(inode); 895 838 } 896 839 return inode; 897 840 } 898 841 EXPORT_SYMBOL(new_inode); 899 842 843 + /** 844 + * unlock_new_inode - clear the I_NEW state and wake up any waiters 845 + * @inode: new inode to unlock 846 + * 847 + * Called when the inode is fully initialised to clear the new state of the 848 + * inode and wake up anyone waiting for the inode to finish initialisation. 849 + */ 900 850 void unlock_new_inode(struct inode *inode) 901 851 { 902 852 #ifdef CONFIG_DEBUG_LOCK_ALLOC ··· 923 859 } 924 860 } 925 861 #endif 926 - /* 927 - * This is special! We do not need the spinlock when clearing I_NEW, 928 - * because we're guaranteed that nobody else tries to do anything about 929 - * the state of the inode when it is locked, as we just created it (so 930 - * there can be no old holders that haven't tested I_NEW). 931 - * However we must emit the memory barrier so that other CPUs reliably 932 - * see the clearing of I_NEW after the other inode initialisation has 933 - * completed. 934 - */ 935 - smp_mb(); 862 + spin_lock(&inode->i_lock); 936 863 WARN_ON(!(inode->i_state & I_NEW)); 937 864 inode->i_state &= ~I_NEW; 938 - wake_up_inode(inode); 865 + wake_up_bit(&inode->i_state, __I_NEW); 866 + spin_unlock(&inode->i_lock); 939 867 } 940 868 EXPORT_SYMBOL(unlock_new_inode); 941 869 942 - /* 943 - * This is called without the inode lock held.. Be careful. 
870 + /** 871 + * iget5_locked - obtain an inode from a mounted file system 872 + * @sb: super block of file system 873 + * @hashval: hash value (usually inode number) to get 874 + * @test: callback used for comparisons between inodes 875 + * @set: callback used to initialize a new struct inode 876 + * @data: opaque data pointer to pass to @test and @set 944 877 * 945 - * We no longer cache the sb_flags in i_flags - see fs.h 946 - * -- rmk@arm.uk.linux.org 878 + * Search for the inode specified by @hashval and @data in the inode cache, 879 + * and if present it is return it with an increased reference count. This is 880 + * a generalized version of iget_locked() for file systems where the inode 881 + * number is not sufficient for unique identification of an inode. 882 + * 883 + * If the inode is not in cache, allocate a new inode and return it locked, 884 + * hashed, and with the I_NEW flag set. The file system gets to fill it in 885 + * before unlocking it via unlock_new_inode(). 886 + * 887 + * Note both @test and @set are called with the inode_hash_lock held, so can't 888 + * sleep. 947 889 */ 948 - static struct inode *get_new_inode(struct super_block *sb, 949 - struct hlist_head *head, 950 - int (*test)(struct inode *, void *), 951 - int (*set)(struct inode *, void *), 952 - void *data) 890 + struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 891 + int (*test)(struct inode *, void *), 892 + int (*set)(struct inode *, void *), void *data) 953 893 { 894 + struct hlist_head *head = inode_hashtable + hash(sb, hashval); 954 895 struct inode *inode; 896 + 897 + spin_lock(&inode_hash_lock); 898 + inode = find_inode(sb, head, test, data); 899 + spin_unlock(&inode_hash_lock); 900 + 901 + if (inode) { 902 + wait_on_inode(inode); 903 + return inode; 904 + } 955 905 956 906 inode = alloc_inode(sb); 957 907 if (inode) { 958 908 struct inode *old; 959 909 960 - spin_lock(&inode_lock); 910 + spin_lock(&inode_hash_lock); 961 911 /* We released the lock, so.. */ 962 912 old = find_inode(sb, head, test, data); 963 913 if (!old) { 964 914 if (set(inode, data)) 965 915 goto set_failed; 966 916 967 - hlist_add_head(&inode->i_hash, head); 968 - __inode_sb_list_add(inode); 917 + spin_lock(&inode->i_lock); 969 918 inode->i_state = I_NEW; 970 - spin_unlock(&inode_lock); 919 + hlist_add_head(&inode->i_hash, head); 920 + spin_unlock(&inode->i_lock); 921 + inode_sb_list_add(inode); 922 + spin_unlock(&inode_hash_lock); 971 923 972 924 /* Return the locked inode with I_NEW set, the 973 925 * caller is responsible for filling in the contents ··· 996 916 * us. Use the old inode instead of the one we just 997 917 * allocated. 998 918 */ 999 - spin_unlock(&inode_lock); 919 + spin_unlock(&inode_hash_lock); 1000 920 destroy_inode(inode); 1001 921 inode = old; 1002 922 wait_on_inode(inode); ··· 1004 924 return inode; 1005 925 1006 926 set_failed: 1007 - spin_unlock(&inode_lock); 927 + spin_unlock(&inode_hash_lock); 1008 928 destroy_inode(inode); 1009 929 return NULL; 1010 930 } 931 + EXPORT_SYMBOL(iget5_locked); 1011 932 1012 - /* 1013 - * get_new_inode_fast is the fast path version of get_new_inode, see the 1014 - * comment at iget_locked for details. 933 + /** 934 + * iget_locked - obtain an inode from a mounted file system 935 + * @sb: super block of file system 936 + * @ino: inode number to get 937 + * 938 + * Search for the inode specified by @ino in the inode cache and if present 939 + * return it with an increased reference count. 
This is for file systems 940 + * where the inode number is sufficient for unique identification of an inode. 941 + * 942 + * If the inode is not in cache, allocate a new inode and return it locked, 943 + * hashed, and with the I_NEW flag set. The file system gets to fill it in 944 + * before unlocking it via unlock_new_inode(). 1015 945 */ 1016 - static struct inode *get_new_inode_fast(struct super_block *sb, 1017 - struct hlist_head *head, unsigned long ino) 946 + struct inode *iget_locked(struct super_block *sb, unsigned long ino) 1018 947 { 948 + struct hlist_head *head = inode_hashtable + hash(sb, ino); 1019 949 struct inode *inode; 950 + 951 + spin_lock(&inode_hash_lock); 952 + inode = find_inode_fast(sb, head, ino); 953 + spin_unlock(&inode_hash_lock); 954 + if (inode) { 955 + wait_on_inode(inode); 956 + return inode; 957 + } 1020 958 1021 959 inode = alloc_inode(sb); 1022 960 if (inode) { 1023 961 struct inode *old; 1024 962 1025 - spin_lock(&inode_lock); 963 + spin_lock(&inode_hash_lock); 1026 964 /* We released the lock, so.. */ 1027 965 old = find_inode_fast(sb, head, ino); 1028 966 if (!old) { 1029 967 inode->i_ino = ino; 1030 - hlist_add_head(&inode->i_hash, head); 1031 - __inode_sb_list_add(inode); 968 + spin_lock(&inode->i_lock); 1032 969 inode->i_state = I_NEW; 1033 - spin_unlock(&inode_lock); 970 + hlist_add_head(&inode->i_hash, head); 971 + spin_unlock(&inode->i_lock); 972 + inode_sb_list_add(inode); 973 + spin_unlock(&inode_hash_lock); 1034 974 1035 975 /* Return the locked inode with I_NEW set, the 1036 976 * caller is responsible for filling in the contents ··· 1063 963 * us. Use the old inode instead of the one we just 1064 964 * allocated. 1065 965 */ 1066 - spin_unlock(&inode_lock); 966 + spin_unlock(&inode_hash_lock); 1067 967 destroy_inode(inode); 1068 968 inode = old; 1069 969 wait_on_inode(inode); 1070 970 } 1071 971 return inode; 1072 972 } 973 + EXPORT_SYMBOL(iget_locked); 1073 974 1074 975 /* 1075 976 * search the inode cache for a matching inode number. ··· 1085 984 struct hlist_node *node; 1086 985 struct inode *inode; 1087 986 987 + spin_lock(&inode_hash_lock); 1088 988 hlist_for_each_entry(inode, node, b, i_hash) { 1089 - if (inode->i_ino == ino && inode->i_sb == sb) 989 + if (inode->i_ino == ino && inode->i_sb == sb) { 990 + spin_unlock(&inode_hash_lock); 1090 991 return 0; 992 + } 1091 993 } 994 + spin_unlock(&inode_hash_lock); 1092 995 1093 996 return 1; 1094 997 } ··· 1122 1017 static unsigned int counter; 1123 1018 ino_t res; 1124 1019 1125 - spin_lock(&inode_lock); 1126 1020 spin_lock(&iunique_lock); 1127 1021 do { 1128 1022 if (counter <= max_reserved) ··· 1129 1025 res = counter++; 1130 1026 } while (!test_inode_iunique(sb, res)); 1131 1027 spin_unlock(&iunique_lock); 1132 - spin_unlock(&inode_lock); 1133 1028 1134 1029 return res; 1135 1030 } ··· 1136 1033 1137 1034 struct inode *igrab(struct inode *inode) 1138 1035 { 1139 - spin_lock(&inode_lock); 1140 - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1036 + spin_lock(&inode->i_lock); 1037 + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { 1141 1038 __iget(inode); 1142 - else 1039 + spin_unlock(&inode->i_lock); 1040 + } else { 1041 + spin_unlock(&inode->i_lock); 1143 1042 /* 1144 1043 * Handle the case where s_op->clear_inode is not been 1145 1044 * called yet, and somebody is calling igrab 1146 1045 * while the inode is getting freed. 
1147 1046 */ 1148 1047 inode = NULL; 1149 - spin_unlock(&inode_lock); 1048 + } 1150 1049 return inode; 1151 1050 } 1152 1051 EXPORT_SYMBOL(igrab); 1153 - 1154 - /** 1155 - * ifind - internal function, you want ilookup5() or iget5(). 1156 - * @sb: super block of file system to search 1157 - * @head: the head of the list to search 1158 - * @test: callback used for comparisons between inodes 1159 - * @data: opaque data pointer to pass to @test 1160 - * @wait: if true wait for the inode to be unlocked, if false do not 1161 - * 1162 - * ifind() searches for the inode specified by @data in the inode 1163 - * cache. This is a generalized version of ifind_fast() for file systems where 1164 - * the inode number is not sufficient for unique identification of an inode. 1165 - * 1166 - * If the inode is in the cache, the inode is returned with an incremented 1167 - * reference count. 1168 - * 1169 - * Otherwise NULL is returned. 1170 - * 1171 - * Note, @test is called with the inode_lock held, so can't sleep. 1172 - */ 1173 - static struct inode *ifind(struct super_block *sb, 1174 - struct hlist_head *head, int (*test)(struct inode *, void *), 1175 - void *data, const int wait) 1176 - { 1177 - struct inode *inode; 1178 - 1179 - spin_lock(&inode_lock); 1180 - inode = find_inode(sb, head, test, data); 1181 - if (inode) { 1182 - spin_unlock(&inode_lock); 1183 - if (likely(wait)) 1184 - wait_on_inode(inode); 1185 - return inode; 1186 - } 1187 - spin_unlock(&inode_lock); 1188 - return NULL; 1189 - } 1190 - 1191 - /** 1192 - * ifind_fast - internal function, you want ilookup() or iget(). 1193 - * @sb: super block of file system to search 1194 - * @head: head of the list to search 1195 - * @ino: inode number to search for 1196 - * 1197 - * ifind_fast() searches for the inode @ino in the inode cache. This is for 1198 - * file systems where the inode number is sufficient for unique identification 1199 - * of an inode. 1200 - * 1201 - * If the inode is in the cache, the inode is returned with an incremented 1202 - * reference count. 1203 - * 1204 - * Otherwise NULL is returned. 1205 - */ 1206 - static struct inode *ifind_fast(struct super_block *sb, 1207 - struct hlist_head *head, unsigned long ino) 1208 - { 1209 - struct inode *inode; 1210 - 1211 - spin_lock(&inode_lock); 1212 - inode = find_inode_fast(sb, head, ino); 1213 - if (inode) { 1214 - spin_unlock(&inode_lock); 1215 - wait_on_inode(inode); 1216 - return inode; 1217 - } 1218 - spin_unlock(&inode_lock); 1219 - return NULL; 1220 - } 1221 1052 1222 1053 /** 1223 1054 * ilookup5_nowait - search for an inode in the inode cache ··· 1160 1123 * @test: callback used for comparisons between inodes 1161 1124 * @data: opaque data pointer to pass to @test 1162 1125 * 1163 - * ilookup5() uses ifind() to search for the inode specified by @hashval and 1164 - * @data in the inode cache. This is a generalized version of ilookup() for 1165 - * file systems where the inode number is not sufficient for unique 1166 - * identification of an inode. 1167 - * 1126 + * Search for the inode specified by @hashval and @data in the inode cache. 1168 1127 * If the inode is in the cache, the inode is returned with an incremented 1169 - * reference count. Note, the inode lock is not waited upon so you have to be 1170 - * very careful what you do with the returned inode. You probably should be 1171 - * using ilookup5() instead. 1128 + * reference count. 1172 1129 * 1173 - * Otherwise NULL is returned. 
1130 + * Note: I_NEW is not waited upon so you have to be very careful what you do 1131 + * with the returned inode. You probably should be using ilookup5() instead. 1174 1132 * 1175 - * Note, @test is called with the inode_lock held, so can't sleep. 1133 + * Note: @test is called with the inode_hash_lock held, so can't sleep. 1176 1134 */ 1177 1135 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1178 1136 int (*test)(struct inode *, void *), void *data) 1179 1137 { 1180 1138 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1139 + struct inode *inode; 1181 1140 1182 - return ifind(sb, head, test, data, 0); 1141 + spin_lock(&inode_hash_lock); 1142 + inode = find_inode(sb, head, test, data); 1143 + spin_unlock(&inode_hash_lock); 1144 + 1145 + return inode; 1183 1146 } 1184 1147 EXPORT_SYMBOL(ilookup5_nowait); 1185 1148 ··· 1190 1153 * @test: callback used for comparisons between inodes 1191 1154 * @data: opaque data pointer to pass to @test 1192 1155 * 1193 - * ilookup5() uses ifind() to search for the inode specified by @hashval and 1194 - * @data in the inode cache. This is a generalized version of ilookup() for 1195 - * file systems where the inode number is not sufficient for unique 1196 - * identification of an inode. 1197 - * 1198 - * If the inode is in the cache, the inode lock is waited upon and the inode is 1156 + * Search for the inode specified by @hashval and @data in the inode cache, 1157 + * and if the inode is in the cache, return the inode with an incremented 1158 + * reference count. Waits on I_NEW before returning the inode. 1199 1159 * returned with an incremented reference count. 1200 1160 * 1201 - * Otherwise NULL is returned. 1161 + * This is a generalized version of ilookup() for file systems where the 1162 + * inode number is not sufficient for unique identification of an inode. 1202 1163 * 1203 - * Note, @test is called with the inode_lock held, so can't sleep. 1164 + * Note: @test is called with the inode_hash_lock held, so can't sleep. 1204 1165 */ 1205 1166 struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1206 1167 int (*test)(struct inode *, void *), void *data) 1207 1168 { 1208 - struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1169 + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); 1209 1170 1210 - return ifind(sb, head, test, data, 1); 1171 + if (inode) 1172 + wait_on_inode(inode); 1173 + return inode; 1211 1174 } 1212 1175 EXPORT_SYMBOL(ilookup5); 1213 1176 ··· 1216 1179 * @sb: super block of file system to search 1217 1180 * @ino: inode number to search for 1218 1181 * 1219 - * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1220 - * This is for file systems where the inode number is sufficient for unique 1221 - * identification of an inode. 1222 - * 1223 - * If the inode is in the cache, the inode is returned with an incremented 1224 - * reference count. 1225 - * 1226 - * Otherwise NULL is returned. 1182 + * Search for the inode @ino in the inode cache, and if the inode is in the 1183 + * cache, the inode is returned with an incremented reference count. 
1227 1184 */ 1228 1185 struct inode *ilookup(struct super_block *sb, unsigned long ino) 1229 1186 { 1230 1187 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1188 + struct inode *inode; 1231 1189 1232 - return ifind_fast(sb, head, ino); 1190 + spin_lock(&inode_hash_lock); 1191 + inode = find_inode_fast(sb, head, ino); 1192 + spin_unlock(&inode_hash_lock); 1193 + 1194 + if (inode) 1195 + wait_on_inode(inode); 1196 + return inode; 1233 1197 } 1234 1198 EXPORT_SYMBOL(ilookup); 1235 - 1236 - /** 1237 - * iget5_locked - obtain an inode from a mounted file system 1238 - * @sb: super block of file system 1239 - * @hashval: hash value (usually inode number) to get 1240 - * @test: callback used for comparisons between inodes 1241 - * @set: callback used to initialize a new struct inode 1242 - * @data: opaque data pointer to pass to @test and @set 1243 - * 1244 - * iget5_locked() uses ifind() to search for the inode specified by @hashval 1245 - * and @data in the inode cache and if present it is returned with an increased 1246 - * reference count. This is a generalized version of iget_locked() for file 1247 - * systems where the inode number is not sufficient for unique identification 1248 - * of an inode. 1249 - * 1250 - * If the inode is not in cache, get_new_inode() is called to allocate a new 1251 - * inode and this is returned locked, hashed, and with the I_NEW flag set. The 1252 - * file system gets to fill it in before unlocking it via unlock_new_inode(). 1253 - * 1254 - * Note both @test and @set are called with the inode_lock held, so can't sleep. 1255 - */ 1256 - struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, 1257 - int (*test)(struct inode *, void *), 1258 - int (*set)(struct inode *, void *), void *data) 1259 - { 1260 - struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1261 - struct inode *inode; 1262 - 1263 - inode = ifind(sb, head, test, data, 1); 1264 - if (inode) 1265 - return inode; 1266 - /* 1267 - * get_new_inode() will do the right thing, re-trying the search 1268 - * in case it had to block at any point. 1269 - */ 1270 - return get_new_inode(sb, head, test, set, data); 1271 - } 1272 - EXPORT_SYMBOL(iget5_locked); 1273 - 1274 - /** 1275 - * iget_locked - obtain an inode from a mounted file system 1276 - * @sb: super block of file system 1277 - * @ino: inode number to get 1278 - * 1279 - * iget_locked() uses ifind_fast() to search for the inode specified by @ino in 1280 - * the inode cache and if present it is returned with an increased reference 1281 - * count. This is for file systems where the inode number is sufficient for 1282 - * unique identification of an inode. 1283 - * 1284 - * If the inode is not in cache, get_new_inode_fast() is called to allocate a 1285 - * new inode and this is returned locked, hashed, and with the I_NEW flag set. 1286 - * The file system gets to fill it in before unlocking it via 1287 - * unlock_new_inode(). 1288 - */ 1289 - struct inode *iget_locked(struct super_block *sb, unsigned long ino) 1290 - { 1291 - struct hlist_head *head = inode_hashtable + hash(sb, ino); 1292 - struct inode *inode; 1293 - 1294 - inode = ifind_fast(sb, head, ino); 1295 - if (inode) 1296 - return inode; 1297 - /* 1298 - * get_new_inode_fast() will do the right thing, re-trying the search 1299 - * in case it had to block at any point. 
1300 - */ 1301 - return get_new_inode_fast(sb, head, ino); 1302 - } 1303 - EXPORT_SYMBOL(iget_locked); 1304 1199 1305 1200 int insert_inode_locked(struct inode *inode) 1306 1201 { ··· 1240 1271 ino_t ino = inode->i_ino; 1241 1272 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1242 1273 1243 - inode->i_state |= I_NEW; 1244 1274 while (1) { 1245 1275 struct hlist_node *node; 1246 1276 struct inode *old = NULL; 1247 - spin_lock(&inode_lock); 1277 + spin_lock(&inode_hash_lock); 1248 1278 hlist_for_each_entry(old, node, head, i_hash) { 1249 1279 if (old->i_ino != ino) 1250 1280 continue; 1251 1281 if (old->i_sb != sb) 1252 1282 continue; 1253 - if (old->i_state & (I_FREEING|I_WILL_FREE)) 1283 + spin_lock(&old->i_lock); 1284 + if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1285 + spin_unlock(&old->i_lock); 1254 1286 continue; 1287 + } 1255 1288 break; 1256 1289 } 1257 1290 if (likely(!node)) { 1291 + spin_lock(&inode->i_lock); 1292 + inode->i_state |= I_NEW; 1258 1293 hlist_add_head(&inode->i_hash, head); 1259 - spin_unlock(&inode_lock); 1294 + spin_unlock(&inode->i_lock); 1295 + spin_unlock(&inode_hash_lock); 1260 1296 return 0; 1261 1297 } 1262 1298 __iget(old); 1263 - spin_unlock(&inode_lock); 1299 + spin_unlock(&old->i_lock); 1300 + spin_unlock(&inode_hash_lock); 1264 1301 wait_on_inode(old); 1265 1302 if (unlikely(!inode_unhashed(old))) { 1266 1303 iput(old); ··· 1283 1308 struct super_block *sb = inode->i_sb; 1284 1309 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1285 1310 1286 - inode->i_state |= I_NEW; 1287 - 1288 1311 while (1) { 1289 1312 struct hlist_node *node; 1290 1313 struct inode *old = NULL; 1291 1314 1292 - spin_lock(&inode_lock); 1315 + spin_lock(&inode_hash_lock); 1293 1316 hlist_for_each_entry(old, node, head, i_hash) { 1294 1317 if (old->i_sb != sb) 1295 1318 continue; 1296 1319 if (!test(old, data)) 1297 1320 continue; 1298 - if (old->i_state & (I_FREEING|I_WILL_FREE)) 1321 + spin_lock(&old->i_lock); 1322 + if (old->i_state & (I_FREEING|I_WILL_FREE)) { 1323 + spin_unlock(&old->i_lock); 1299 1324 continue; 1325 + } 1300 1326 break; 1301 1327 } 1302 1328 if (likely(!node)) { 1329 + spin_lock(&inode->i_lock); 1330 + inode->i_state |= I_NEW; 1303 1331 hlist_add_head(&inode->i_hash, head); 1304 - spin_unlock(&inode_lock); 1332 + spin_unlock(&inode->i_lock); 1333 + spin_unlock(&inode_hash_lock); 1305 1334 return 0; 1306 1335 } 1307 1336 __iget(old); 1308 - spin_unlock(&inode_lock); 1337 + spin_unlock(&old->i_lock); 1338 + spin_unlock(&inode_hash_lock); 1309 1339 wait_on_inode(old); 1310 1340 if (unlikely(!inode_unhashed(old))) { 1311 1341 iput(old); ··· 1355 1375 const struct super_operations *op = inode->i_sb->s_op; 1356 1376 int drop; 1357 1377 1378 + WARN_ON(inode->i_state & I_NEW); 1379 + 1358 1380 if (op && op->drop_inode) 1359 1381 drop = op->drop_inode(inode); 1360 1382 else 1361 1383 drop = generic_drop_inode(inode); 1362 1384 1363 - if (!drop) { 1364 - if (sb->s_flags & MS_ACTIVE) { 1365 - inode->i_state |= I_REFERENCED; 1366 - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { 1367 - inode_lru_list_add(inode); 1368 - } 1369 - spin_unlock(&inode_lock); 1370 - return; 1371 - } 1372 - WARN_ON(inode->i_state & I_NEW); 1373 - inode->i_state |= I_WILL_FREE; 1374 - spin_unlock(&inode_lock); 1375 - write_inode_now(inode, 1); 1376 - spin_lock(&inode_lock); 1377 - WARN_ON(inode->i_state & I_NEW); 1378 - inode->i_state &= ~I_WILL_FREE; 1379 - __remove_inode_hash(inode); 1385 + if (!drop && (sb->s_flags & MS_ACTIVE)) { 1386 + inode->i_state |= I_REFERENCED; 
1387 + if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1388 + inode_lru_list_add(inode); 1389 + spin_unlock(&inode->i_lock); 1390 + return; 1380 1391 } 1381 1392 1382 - WARN_ON(inode->i_state & I_NEW); 1393 + if (!drop) { 1394 + inode->i_state |= I_WILL_FREE; 1395 + spin_unlock(&inode->i_lock); 1396 + write_inode_now(inode, 1); 1397 + spin_lock(&inode->i_lock); 1398 + WARN_ON(inode->i_state & I_NEW); 1399 + inode->i_state &= ~I_WILL_FREE; 1400 + } 1401 + 1383 1402 inode->i_state |= I_FREEING; 1384 - 1385 - /* 1386 - * Move the inode off the IO lists and LRU once I_FREEING is 1387 - * set so that it won't get moved back on there if it is dirty. 1388 - */ 1389 1403 inode_lru_list_del(inode); 1390 - list_del_init(&inode->i_wb_list); 1404 + spin_unlock(&inode->i_lock); 1391 1405 1392 - __inode_sb_list_del(inode); 1393 - spin_unlock(&inode_lock); 1394 1406 evict(inode); 1395 - remove_inode_hash(inode); 1396 - wake_up_inode(inode); 1397 - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1398 - destroy_inode(inode); 1399 1407 } 1400 1408 1401 1409 /** ··· 1400 1432 if (inode) { 1401 1433 BUG_ON(inode->i_state & I_CLEAR); 1402 1434 1403 - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1435 + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1404 1436 iput_final(inode); 1405 1437 } 1406 1438 } ··· 1579 1611 * to recheck inode state. 1580 1612 * 1581 1613 * It doesn't matter if I_NEW is not set initially, a call to 1582 - * wake_up_inode() after removing from the hash list will DTRT. 1583 - * 1584 - * This is called with inode_lock held. 1614 + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 1615 + * will DTRT. 1585 1616 */ 1586 1617 static void __wait_on_freeing_inode(struct inode *inode) 1587 1618 { ··· 1588 1621 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1589 1622 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1590 1623 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1591 - spin_unlock(&inode_lock); 1624 + spin_unlock(&inode->i_lock); 1625 + spin_unlock(&inode_hash_lock); 1592 1626 schedule(); 1593 1627 finish_wait(wq, &wait.wait); 1594 - spin_lock(&inode_lock); 1628 + spin_lock(&inode_hash_lock); 1595 1629 } 1596 1630 1597 1631 static __initdata unsigned long ihash_entries;
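
Taken together, the fs/inode.c hunks above replace the single global inode_lock with inode_hash_lock for the hash chains plus the per-inode inode->i_lock for i_state and i_count, and fold the old ifind()/ifind_fast() helpers into their callers. A minimal sketch of the lookup pattern the new code converges on (it mirrors the new ilookup() above; hash(), inode_hashtable and find_inode_fast() are fs/inode.c internals, and the I_FREEING wait inside the helper is elided):

    /*
     * Sketch only: walk the hash chain under inode_hash_lock; the helper
     * takes inode->i_lock to test i_state and grab a reference, so the
     * caller never sees a half-torn-down inode.
     */
    static struct inode *lookup_sketch(struct super_block *sb, unsigned long ino)
    {
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);

        if (inode)
            wait_on_inode(inode);   /* wait for I_NEW to clear, may sleep */
        return inode;
    }

Note that waiting for I_NEW happens only after both locks are dropped, which is why the @test callbacks documented above still must not sleep.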
+7
fs/internal.h
··· 125 125 /* 126 126 * inode.c 127 127 */ 128 + extern spinlock_t inode_sb_list_lock; 129 + 130 + /* 131 + * fs-writeback.c 132 + */ 133 + extern void inode_wb_list_del(struct inode *inode); 134 + 128 135 extern int get_nr_dirty_inodes(void); 129 136 extern void evict_inodes(struct super_block *); 130 137 extern int invalidate_inodes(struct super_block *, bool);
+1 -1
fs/logfs/inode.c
··· 293 293 return ret; 294 294 } 295 295 296 - /* called with inode_lock held */ 296 + /* called with inode->i_lock held */ 297 297 static int logfs_drop_inode(struct inode *inode) 298 298 { 299 299 struct logfs_super *super = logfs_super(inode->i_sb);
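
The comment fix above matters beyond logfs: with inode_lock gone, ->drop_inode() is now invoked with the per-inode spinlock inode->i_lock held, so implementations may only look at in-memory state and must not sleep. A hypothetical conforming method (EXAMPLE_SB() and keep_in_cache are made-up names for a per-superblock flag):

    /*
     * Hypothetical ->drop_inode(): called on final iput() with
     * inode->i_lock held -- cheap, non-blocking checks only.
     */
    static int example_drop_inode(struct inode *inode)
    {
        if (EXAMPLE_SB(inode->i_sb)->keep_in_cache)     /* made-up flag */
            return 0;                       /* keep the unused inode cached */
        return generic_drop_inode(inode);   /* default VFS policy */
    }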
+18 -5
fs/namei.c
··· 992 992 return 0; 993 993 } 994 994 995 + static inline bool managed_dentry_might_block(struct dentry *dentry) 996 + { 997 + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 998 + dentry->d_op->d_manage(dentry, true) < 0); 999 + } 1000 + 995 1001 /* 996 1002 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 997 1003 * meet a managed dentry and we're not walking to "..". True is returned to ··· 1006 1000 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1007 1001 struct inode **inode, bool reverse_transit) 1008 1002 { 1009 - while (d_mountpoint(path->dentry)) { 1003 + for (;;) { 1010 1004 struct vfsmount *mounted; 1011 - if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1012 - !reverse_transit && 1013 - path->dentry->d_op->d_manage(path->dentry, true) < 0) 1005 + /* 1006 + * Don't forget we might have a non-mountpoint managed dentry 1007 + * that wants to block transit. 1008 + */ 1009 + *inode = path->dentry->d_inode; 1010 + if (!reverse_transit && 1011 + unlikely(managed_dentry_might_block(path->dentry))) 1014 1012 return false; 1013 + 1014 + if (!d_mountpoint(path->dentry)) 1015 + break; 1016 + 1015 1017 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1016 1018 if (!mounted) 1017 1019 break; 1018 1020 path->mnt = mounted; 1019 1021 path->dentry = mounted->mnt_root; 1020 1022 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1021 - *inode = path->dentry->d_inode; 1022 1023 } 1023 1024 1024 1025 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
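
The restructured __follow_mount_rcu() above now consults ->d_manage() even for managed dentries that are not mountpoints, and it does so in rcu-walk mode where sleeping is forbidden. A filesystem whose d_manage() might need to wait signals that by returning a negative value when called with rcu_walk true, which makes the loop above abort the rcu-walk so the VFS can drop back to ref-walk. A hedged, autofs4-style sketch (wait_for_expire() is a made-up placeholder for the blocking part):

    /*
     * Sketch of a ->d_manage() instance.  In rcu-walk mode it must not
     * block, so it bails out with -ECHILD; in ref-walk mode it may sleep.
     */
    static int example_d_manage(struct dentry *dentry, bool rcu_walk)
    {
        if (rcu_walk)
            return -ECHILD;         /* would sleep below: force ref-walk */

        wait_for_expire(dentry);    /* hypothetical blocking wait */
        return 0;                   /* transit may proceed */
    }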
+25 -17
fs/notify/inode_mark.c
··· 22 22 #include <linux/module.h> 23 23 #include <linux/mutex.h> 24 24 #include <linux/spinlock.h> 25 - #include <linux/writeback.h> /* for inode_lock */ 26 25 27 26 #include <asm/atomic.h> 28 27 29 28 #include <linux/fsnotify_backend.h> 30 29 #include "fsnotify.h" 30 + 31 + #include "../internal.h" 31 32 32 33 /* 33 34 * Recalculate the mask of events relevant to a given inode locked. ··· 238 237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 239 238 * @list: list of inodes being unmounted (sb->s_inodes) 240 239 * 241 - * Called with inode_lock held, protecting the unmounting super block's list 242 - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 243 - * We temporarily drop inode_lock, however, and CAN block. 240 + * Called during unmount with no locks held, so needs to be safe against 241 + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. 244 242 */ 245 243 void fsnotify_unmount_inodes(struct list_head *list) 246 244 { 247 245 struct inode *inode, *next_i, *need_iput = NULL; 248 246 249 - spin_lock(&inode_lock); 247 + spin_lock(&inode_sb_list_lock); 250 248 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 251 249 struct inode *need_iput_tmp; 252 250 ··· 254 254 * I_WILL_FREE, or I_NEW which is fine because by that point 255 255 * the inode cannot have any associated watches. 256 256 */ 257 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 + spin_lock(&inode->i_lock); 258 + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { 259 + spin_unlock(&inode->i_lock); 258 260 continue; 261 + } 259 262 260 263 /* 261 264 * If i_count is zero, the inode cannot have any watches and ··· 266 263 * evict all inodes with zero i_count from icache which is 267 264 * unnecessarily violent and may in fact be illegal to do. 268 265 */ 269 - if (!atomic_read(&inode->i_count)) 266 + if (!atomic_read(&inode->i_count)) { 267 + spin_unlock(&inode->i_lock); 270 268 continue; 269 + } 271 270 272 271 need_iput_tmp = need_iput; 273 272 need_iput = NULL; ··· 279 274 __iget(inode); 280 275 else 281 276 need_iput_tmp = NULL; 277 + spin_unlock(&inode->i_lock); 282 278 283 279 /* In case the dropping of a reference would nuke next_i. */ 284 280 if ((&next_i->i_sb_list != list) && 285 - atomic_read(&next_i->i_count) && 286 - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 - __iget(next_i); 288 - need_iput = next_i; 281 + atomic_read(&next_i->i_count)) { 282 + spin_lock(&next_i->i_lock); 283 + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 284 + __iget(next_i); 285 + need_iput = next_i; 286 + } 287 + spin_unlock(&next_i->i_lock); 289 288 } 290 289 291 290 /* 292 - * We can safely drop inode_lock here because we hold 291 + * We can safely drop inode_sb_list_lock here because we hold 293 292 * references on both inode and next_i. Also no new inodes 294 - * will be added since the umount has begun. Finally, 295 - * iprune_mutex keeps shrink_icache_memory() away. 293 + * will be added since the umount has begun. 296 294 */ 297 - spin_unlock(&inode_lock); 295 + spin_unlock(&inode_sb_list_lock); 298 296 299 297 if (need_iput_tmp) 300 298 iput(need_iput_tmp); ··· 309 301 310 302 iput(inode); 311 303 312 - spin_lock(&inode_lock); 304 + spin_lock(&inode_sb_list_lock); 313 305 } 314 - spin_unlock(&inode_lock); 306 + spin_unlock(&inode_sb_list_lock); 315 307 }
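
The fsnotify_unmount_inodes() conversion above is the canonical form of a pattern repeated across this series: with inode_lock gone, i_state may only be examined under inode->i_lock, inodes in I_FREEING, I_WILL_FREE or I_NEW must be skipped, and an inode has to be pinned with __iget() before inode_sb_list_lock can be dropped for anything that blocks. The per-inode step, reduced to a helper (sketch only; the need_iput juggling for next_i is left out):

    /*
     * Sketch: check-and-pin one inode while walking sb->s_inodes under
     * inode_sb_list_lock.  Returns true if @inode was pinned and it is
     * now safe to drop the list lock and block.
     */
    static bool pin_inode_sketch(struct inode *inode)
    {
        spin_lock(&inode->i_lock);
        if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
            spin_unlock(&inode->i_lock);
            return false;           /* being set up or torn down: skip */
        }
        __iget(inode);              /* bump i_count under i_lock */
        spin_unlock(&inode->i_lock);
        return true;
    }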
-1
fs/notify/mark.c
··· 91 91 #include <linux/slab.h> 92 92 #include <linux/spinlock.h> 93 93 #include <linux/srcu.h> 94 - #include <linux/writeback.h> /* for inode_lock */ 95 94 96 95 #include <asm/atomic.h> 97 96
-1
fs/notify/vfsmount_mark.c
··· 23 23 #include <linux/mount.h> 24 24 #include <linux/mutex.h> 25 25 #include <linux/spinlock.h> 26 - #include <linux/writeback.h> /* for inode_lock */ 27 26 28 27 #include <asm/atomic.h> 29 28
+2 -2
fs/ntfs/inode.c
··· 54 54 * 55 55 * Return 1 if the attributes match and 0 if not. 56 56 * 57 - * NOTE: This function runs with the inode_lock spin lock held so it is not 57 + * NOTE: This function runs with the inode->i_lock spin lock held so it is not 58 58 * allowed to sleep. 59 59 */ 60 60 int ntfs_test_inode(struct inode *vi, ntfs_attr *na) ··· 98 98 * 99 99 * Return 0 on success and -errno on error. 100 100 * 101 - * NOTE: This function runs with the inode_lock spin lock held so it is not 101 + * NOTE: This function runs with the inode->i_lock spin lock held so it is not 102 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 103 103 */ 104 104 static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
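
The two ntfs comment updates above carry a practical constraint: the @test/@set callbacks passed to iget5_locked() still run under a spinlock rather than the old inode_lock, so they cannot sleep and any allocation they perform has to be atomic (hence the GFP_ATOMIC note). A hypothetical @set callback in that spirit (struct example_key and the i_private usage are illustrative only):

    /*
     * Hypothetical iget5_locked() @set callback: runs under a spinlock,
     * so no sleeping allocations -- use GFP_ATOMIC and keep it small.
     */
    static int example_set_inode(struct inode *vi, void *data)
    {
        struct example_key *key = data;     /* made-up lookup key */

        vi->i_ino = key->ino;
        vi->i_private = kmemdup(key, sizeof(*key), GFP_ATOMIC);
        return vi->i_private ? 0 : -ENOMEM;
    }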
+23 -18
fs/quota/dquot.c
··· 76 76 #include <linux/buffer_head.h> 77 77 #include <linux/capability.h> 78 78 #include <linux/quotaops.h> 79 - #include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79 + #include "../internal.h" /* ugh */ 80 80 81 81 #include <asm/uaccess.h> 82 82 ··· 900 900 int reserved = 0; 901 901 #endif 902 902 903 - spin_lock(&inode_lock); 903 + spin_lock(&inode_sb_list_lock); 904 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 905 - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 905 + spin_lock(&inode->i_lock); 906 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 907 + !atomic_read(&inode->i_writecount) || 908 + !dqinit_needed(inode, type)) { 909 + spin_unlock(&inode->i_lock); 906 910 continue; 911 + } 907 912 #ifdef CONFIG_QUOTA_DEBUG 908 913 if (unlikely(inode_get_rsv_space(inode) > 0)) 909 914 reserved = 1; 910 915 #endif 911 - if (!atomic_read(&inode->i_writecount)) 912 - continue; 913 - if (!dqinit_needed(inode, type)) 914 - continue; 915 - 916 916 __iget(inode); 917 - spin_unlock(&inode_lock); 917 + spin_unlock(&inode->i_lock); 918 + spin_unlock(&inode_sb_list_lock); 918 919 919 920 iput(old_inode); 920 921 __dquot_initialize(inode, type); 921 - /* We hold a reference to 'inode' so it couldn't have been 922 - * removed from s_inodes list while we dropped the inode_lock. 923 - * We cannot iput the inode now as we can be holding the last 924 - * reference and we cannot iput it under inode_lock. So we 925 - * keep the reference and iput it later. */ 922 + 923 + /* 924 + * We hold a reference to 'inode' so it couldn't have been 925 + * removed from s_inodes list while we dropped the 926 + * inode_sb_list_lock We cannot iput the inode now as we can be 927 + * holding the last reference and we cannot iput it under 928 + * inode_sb_list_lock. So we keep the reference and iput it 929 + * later. 930 + */ 926 931 old_inode = inode; 927 - spin_lock(&inode_lock); 932 + spin_lock(&inode_sb_list_lock); 928 933 } 929 - spin_unlock(&inode_lock); 934 + spin_unlock(&inode_sb_list_lock); 930 935 iput(old_inode); 931 936 932 937 #ifdef CONFIG_QUOTA_DEBUG ··· 1012 1007 struct inode *inode; 1013 1008 int reserved = 0; 1014 1009 1015 - spin_lock(&inode_lock); 1010 + spin_lock(&inode_sb_list_lock); 1016 1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1017 1012 /* 1018 1013 * We have to scan also I_NEW inodes because they can already ··· 1026 1021 remove_inode_dquot_ref(inode, type, tofree_head); 1027 1022 } 1028 1023 } 1029 - spin_unlock(&inode_lock); 1024 + spin_unlock(&inode_sb_list_lock); 1030 1025 #ifdef CONFIG_QUOTA_DEBUG 1031 1026 if (reserved) { 1032 1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
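
The add_dquot_ref() rework above also shows the second half of the pattern: the reference taken on the current inode is what keeps the list cursor valid after inode_sb_list_lock is retaken, so the iput() of the previously visited inode is deferred until the walk has safely moved past it, and the final reference is dropped after the loop. The skeleton, with the quota-specific work replaced by a do_work() placeholder:

        struct inode *inode, *old_inode = NULL;

        spin_lock(&inode_sb_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
            spin_lock(&inode->i_lock);
            if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                spin_unlock(&inode->i_lock);
                continue;
            }
            __iget(inode);
            spin_unlock(&inode->i_lock);
            spin_unlock(&inode_sb_list_lock);

            iput(old_inode);        /* previous inode: we are already past it */
            do_work(inode);         /* may block */
            old_inode = inode;      /* hold our ref until the next iteration */

            spin_lock(&inode_sb_list_lock);
        }
        spin_unlock(&inode_sb_list_lock);
        iput(old_inode);            /* drop the last reference taken */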
+1 -1
include/linux/fs.h
··· 1636 1636 }; 1637 1637 1638 1638 /* 1639 - * Inode state bits. Protected by inode_lock. 1639 + * Inode state bits. Protected by inode->i_lock 1640 1640 * 1641 1641 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1642 1642 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
+1 -1
include/linux/quotaops.h
··· 277 277 /* 278 278 * Mark inode fully dirty. Since we are allocating blocks, inode 279 279 * would become fully dirty soon anyway and it reportedly 280 - * reduces inode_lock contention. 280 + * reduces lock contention. 281 281 */ 282 282 mark_inode_dirty(inode); 283 283 }
+1 -1
include/linux/writeback.h
··· 9 9 10 10 struct backing_dev_info; 11 11 12 - extern spinlock_t inode_lock; 12 + extern spinlock_t inode_wb_list_lock; 13 13 14 14 /* 15 15 * fs/fs-writeback.c
+4 -4
mm/backing-dev.c
··· 67 67 struct inode *inode; 68 68 69 69 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 70 - spin_lock(&inode_lock); 70 + spin_lock(&inode_wb_list_lock); 71 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 72 72 nr_dirty++; 73 73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 74 74 nr_io++; 75 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 76 76 nr_more_io++; 77 - spin_unlock(&inode_lock); 77 + spin_unlock(&inode_wb_list_lock); 78 78 79 79 global_dirty_limits(&background_thresh, &dirty_thresh); 80 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); ··· 676 676 if (bdi_has_dirty_io(bdi)) { 677 677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 678 678 679 - spin_lock(&inode_lock); 679 + spin_lock(&inode_wb_list_lock); 680 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 681 681 list_splice(&bdi->wb.b_io, &dst->b_io); 682 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 683 - spin_unlock(&inode_lock); 683 + spin_unlock(&inode_wb_list_lock); 684 684 } 685 685 686 686 bdi_unregister(bdi);
+6 -4
mm/filemap.c
··· 80 80 * ->i_mutex 81 81 * ->i_alloc_sem (various) 82 82 * 83 - * ->inode_lock 84 - * ->sb_lock (fs/fs-writeback.c) 83 + * inode_wb_list_lock 84 + * sb_lock (fs/fs-writeback.c) 85 85 * ->mapping->tree_lock (__sync_single_inode) 86 86 * 87 87 * ->i_mmap_lock ··· 98 98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 99 99 * ->private_lock (page_remove_rmap->set_page_dirty) 100 100 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 - * ->inode_lock (page_remove_rmap->set_page_dirty) 102 - * ->inode_lock (zap_pte_range->set_page_dirty) 101 + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 102 + * ->inode->i_lock (page_remove_rmap->set_page_dirty) 103 + * inode_wb_list_lock (zap_pte_range->set_page_dirty) 104 + * ->inode->i_lock (zap_pte_range->set_page_dirty) 103 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 104 106 * 105 107 * (code doesn't rely on that order, so you could switch it around)
+3 -2
mm/rmap.c
··· 31 31 * swap_lock (in swap_duplicate, swap_info_get) 32 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 33 * mapping->private_lock (in __set_page_dirty_buffers) 34 - * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 35 + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 35 36 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 37 * mapping->tree_lock (widely used, in set_page_dirty, 37 38 * in arch-dependent flush_dcache_mmap_lock, 38 - * within inode_lock in __sync_single_inode) 39 + * within inode_wb_list_lock in __sync_single_inode) 39 40 * 40 41 * (code doesn't rely on that order so it could be switched around) 41 42 * ->tasklist_lock