Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: kill i_alloc_sem

i_alloc_sem is a rather special rw_semaphore. It's the last one that may
be released by a non-owner, and its write side is always mirrored by
real exclusion. Its intended use is to wait for all pending direct I/O
requests to finish before starting a truncate.

Replace it with a hand-grown construct:

- exclusion for truncates is already guaranteed by i_mutex, so it can
simply fall away
- the reader side is replaced by an i_dio_count member in struct inode
that counts the number of pending direct I/O requests. Truncate can't
proceed as long as it's non-zero
- when i_dio_count reaches zero we wake up a pending truncate using
wake_up_bit on a new bit in i_state
- new references to i_dio_count can't appear while we are waiting for
it to reach zero because the direct I/O count always needs i_mutex
(or an equivalent like XFS's i_iolock) for starting a new operation.

This scheme is much simpler, and saves the space of a spinlock_t and a
struct list_head in struct inode (typically 160 bits on a non-debug 64-bit
system).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Christoph Hellwig and committed by
Al Viro
bd5fe6c5 f9b5570d

+78 -53
+1 -4
fs/attr.c
··· 233 233 return error; 234 234 235 235 if (ia_valid & ATTR_SIZE) 236 - down_write(&dentry->d_inode->i_alloc_sem); 236 + inode_dio_wait(inode); 237 237 238 238 if (inode->i_op->setattr) 239 239 error = inode->i_op->setattr(dentry, attr); 240 240 else 241 241 error = simple_setattr(dentry, attr); 242 - 243 - if (ia_valid & ATTR_SIZE) 244 - up_write(&dentry->d_inode->i_alloc_sem); 245 242 246 243 if (!error) 247 244 fsnotify_change(dentry, ia_valid);
+51 -14
fs/direct-io.c
··· 135 135 struct page *pages[DIO_PAGES]; /* page buffer */ 136 136 }; 137 137 138 + static void __inode_dio_wait(struct inode *inode) 139 + { 140 + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); 141 + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); 142 + 143 + do { 144 + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); 145 + if (atomic_read(&inode->i_dio_count)) 146 + schedule(); 147 + } while (atomic_read(&inode->i_dio_count)); 148 + finish_wait(wq, &q.wait); 149 + } 150 + 151 + /** 152 + * inode_dio_wait - wait for outstanding DIO requests to finish 153 + * @inode: inode to wait for 154 + * 155 + * Waits for all pending direct I/O requests to finish so that we can 156 + * proceed with a truncate or equivalent operation. 157 + * 158 + * Must be called under a lock that serializes taking new references 159 + * to i_dio_count, usually by inode->i_mutex. 160 + */ 161 + void inode_dio_wait(struct inode *inode) 162 + { 163 + if (atomic_read(&inode->i_dio_count)) 164 + __inode_dio_wait(inode); 165 + } 166 + EXPORT_SYMBOL_GPL(inode_dio_wait); 167 + 168 + /* 169 + * inode_dio_done - signal finish of a direct I/O requests 170 + * @inode: inode the direct I/O happens on 171 + * 172 + * This is called once we've finished processing a direct I/O request, 173 + * and is used to wake up callers waiting for direct I/O to be quiesced. 174 + */ 175 + void inode_dio_done(struct inode *inode) 176 + { 177 + if (atomic_dec_and_test(&inode->i_dio_count)) 178 + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 179 + } 180 + EXPORT_SYMBOL_GPL(inode_dio_done); 181 + 138 182 /* 139 183 * How many pages are in the queue? 
140 184 */ ··· 298 254 } 299 255 300 256 if (dio->flags & DIO_LOCKING) 301 - /* lockdep: non-owner release */ 302 - up_read_non_owner(&dio->inode->i_alloc_sem); 303 - 257 + inode_dio_done(dio->inode); 304 258 return ret; 305 259 } 306 260 ··· 1022 980 return ret; 1023 981 } 1024 982 1025 - /* 1026 - * Releases both i_mutex and i_alloc_sem 1027 - */ 1028 983 static ssize_t 1029 984 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 1030 985 const struct iovec *iov, loff_t offset, unsigned long nr_segs, ··· 1185 1146 * For writes this function is called under i_mutex and returns with 1186 1147 * i_mutex held, for reads, i_mutex is not held on entry, but it is 1187 1148 * taken and dropped again before returning. 1188 - * For reads and writes i_alloc_sem is taken in shared mode and released 1189 - * on I/O completion (which may happen asynchronously after returning to 1190 - * the caller). 1149 + * The i_dio_count counter keeps track of the number of outstanding 1150 + * direct I/O requests, and truncate waits for it to reach zero. 1151 + * New references to i_dio_count must only be grabbed with i_mutex 1152 + * held. 1191 1153 * 1192 1154 * - if the flags value does NOT contain DIO_LOCKING we don't use any 1193 1155 * internal locking but rather rely on the filesystem to synchronize 1194 1156 * direct I/O reads/writes versus each other and truncate. 1195 - * For reads and writes both i_mutex and i_alloc_sem are not held on 1196 - * entry and are never taken. 1197 1157 */ 1198 1158 ssize_t 1199 1159 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ··· 1272 1234 } 1273 1235 1274 1236 /* 1275 - * Will be released at I/O completion, possibly in a 1276 - * different thread. 1237 + * Will be decremented at I/O completion time. 1277 1238 */ 1278 - down_read_non_owner(&inode->i_alloc_sem); 1239 + atomic_inc(&inode->i_dio_count); 1279 1240 } 1280 1241 1281 1242 /*
+1 -2
fs/inode.c
··· 168 168 mutex_init(&inode->i_mutex); 169 169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 170 170 171 - init_rwsem(&inode->i_alloc_sem); 172 - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 171 + atomic_set(&inode->i_dio_count, 0); 173 172 174 173 mapping->a_ops = &empty_aops; 175 174 mapping->host = inode;
+1 -2
fs/ntfs/file.c
··· 1832 1832 * fails again. 1833 1833 */ 1834 1834 if (unlikely(NInoTruncateFailed(ni))) { 1835 - down_write(&vi->i_alloc_sem); 1835 + inode_dio_wait(vi); 1836 1836 err = ntfs_truncate(vi); 1837 - up_write(&vi->i_alloc_sem); 1838 1837 if (err || NInoTruncateFailed(ni)) { 1839 1838 if (!err) 1840 1839 err = -EIO;
+2 -8
fs/ntfs/inode.c
··· 2357 2357 * 2358 2358 * Returns 0 on success or -errno on error. 2359 2359 * 2360 - * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for 2361 - * writing. The only case in the kernel where ->i_alloc_sem is not held is 2362 - * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called 2363 - * with the current i_size as the offset. The analogous place in NTFS is in 2364 - * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again 2365 - * without holding ->i_alloc_sem. 2360 + * Called with ->i_mutex held. 2366 2361 */ 2367 2362 int ntfs_truncate(struct inode *vi) 2368 2363 { ··· 2882 2887 * We also abort all changes of user, group, and mode as we do not implement 2883 2888 * the NTFS ACLs yet. 2884 2889 * 2885 - * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also 2886 - * called with ->i_alloc_sem held for writing. 2890 + * Called with ->i_mutex held. 2887 2891 */ 2888 2892 int ntfs_setattr(struct dentry *dentry, struct iattr *attr) 2889 2893 {
+3 -4
fs/ocfs2/aops.c
··· 551 551 552 552 /* 553 553 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 554 - * particularly interested in the aio/dio case. Like the core uses 555 - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 556 - * truncation on another. 554 + * particularly interested in the aio/dio case. We use the rw_lock DLM lock 555 + * to protect io on one node from truncation on another. 557 556 */ 558 557 static void ocfs2_dio_end_io(struct kiocb *iocb, 559 558 loff_t offset, ··· 568 569 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 569 570 570 571 if (ocfs2_iocb_is_sem_locked(iocb)) { 571 - up_read(&inode->i_alloc_sem); 572 + inode_dio_done(inode); 572 573 ocfs2_iocb_clear_sem_locked(iocb); 573 574 } 574 575
+7 -8
fs/ocfs2/file.c
··· 2236 2236 ocfs2_iocb_clear_sem_locked(iocb); 2237 2237 2238 2238 relock: 2239 - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2239 + /* to match setattr's i_mutex -> rw_lock ordering */ 2240 2240 if (direct_io) { 2241 - down_read(&inode->i_alloc_sem); 2241 + atomic_inc(&inode->i_dio_count); 2242 2242 have_alloc_sem = 1; 2243 2243 /* communicate with ocfs2_dio_end_io */ 2244 2244 ocfs2_iocb_set_sem_locked(iocb); ··· 2290 2290 */ 2291 2291 if (direct_io && !can_do_direct) { 2292 2292 ocfs2_rw_unlock(inode, rw_level); 2293 - up_read(&inode->i_alloc_sem); 2293 + inode_dio_done(inode); 2294 2294 2295 2295 have_alloc_sem = 0; 2296 2296 rw_level = -1; ··· 2361 2361 /* 2362 2362 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2363 2363 * function pointer which is called when o_direct io completes so that 2364 - * it can unlock our rw lock. (it's the clustered equivalent of 2365 - * i_alloc_sem; protects truncate from racing with pending ios). 2364 + * it can unlock our rw lock. 2366 2365 * Unfortunately there are error cases which call end_io and others 2367 2366 * that don't. so we don't have to unlock the rw_lock if either an 2368 2367 * async dio is going to do it in the future or an end_io after an ··· 2378 2379 2379 2380 out_sems: 2380 2381 if (have_alloc_sem) { 2381 - up_read(&inode->i_alloc_sem); 2382 + inode_dio_done(inode); 2382 2383 ocfs2_iocb_clear_sem_locked(iocb); 2383 2384 } 2384 2385 ··· 2530 2531 * need locks to protect pending reads from racing with truncate. 
2531 2532 */ 2532 2533 if (filp->f_flags & O_DIRECT) { 2533 - down_read(&inode->i_alloc_sem); 2534 2534 have_alloc_sem = 1; 2535 + atomic_inc(&inode->i_dio_count); 2535 2536 ocfs2_iocb_set_sem_locked(iocb); 2536 2537 2537 2538 ret = ocfs2_rw_lock(inode, 0); ··· 2574 2575 2575 2576 bail: 2576 2577 if (have_alloc_sem) { 2577 - up_read(&inode->i_alloc_sem); 2578 + inode_dio_done(inode); 2578 2579 ocfs2_iocb_clear_sem_locked(iocb); 2579 2580 } 2580 2581 if (rw_level != -1)
+1 -2
fs/reiserfs/xattr.c
··· 555 555 556 556 reiserfs_write_unlock(inode->i_sb); 557 557 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); 558 - down_write(&dentry->d_inode->i_alloc_sem); 558 + inode_dio_wait(dentry->d_inode); 559 559 reiserfs_write_lock(inode->i_sb); 560 560 561 561 err = reiserfs_setattr(dentry, &newattrs); 562 - up_write(&dentry->d_inode->i_alloc_sem); 563 562 mutex_unlock(&dentry->d_inode->i_mutex); 564 563 } else 565 564 update_ctime(inode);
+9 -2
include/linux/fs.h
··· 779 779 struct timespec i_ctime; 780 780 blkcnt_t i_blocks; 781 781 unsigned short i_bytes; 782 - struct rw_semaphore i_alloc_sem; 782 + atomic_t i_dio_count; 783 783 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 784 784 struct file_lock *i_flock; 785 785 struct address_space *i_mapping; ··· 1705 1705 * set during data writeback, and cleared with a wakeup 1706 1706 * on the bit address once it is done. 1707 1707 * 1708 + * I_REFERENCED Marks the inode as recently references on the LRU list. 1709 + * 1710 + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). 1711 + * 1708 1712 * Q: What is the difference between I_WILL_FREE and I_FREEING? 1709 1713 */ 1710 1714 #define I_DIRTY_SYNC (1 << 0) ··· 1722 1718 #define __I_SYNC 7 1723 1719 #define I_SYNC (1 << __I_SYNC) 1724 1720 #define I_REFERENCED (1 << 8) 1721 + #define __I_DIO_WAKEUP 9 1722 + #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) 1725 1723 1726 1724 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1727 1725 ··· 1834 1828 struct lock_class_key i_lock_key; 1835 1829 struct lock_class_key i_mutex_key; 1836 1830 struct lock_class_key i_mutex_dir_key; 1837 - struct lock_class_key i_alloc_sem_key; 1838 1831 }; 1839 1832 1840 1833 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, ··· 2409 2404 }; 2410 2405 2411 2406 void dio_end_io(struct bio *bio, int error); 2407 + void inode_dio_wait(struct inode *inode); 2408 + void inode_dio_done(struct inode *inode); 2412 2409 2413 2410 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 2414 2411 struct block_device *bdev, const struct iovec *iov, loff_t offset,
-3
mm/filemap.c
··· 78 78 * ->i_mutex (generic_file_buffered_write) 79 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 80 80 * 81 - * ->i_mutex 82 - * ->i_alloc_sem (various) 83 - * 84 81 * inode_wb_list_lock 85 82 * sb_lock (fs/fs-writeback.c) 86 83 * ->mapping->tree_lock (__sync_single_inode)
+1 -1
mm/madvise.c
··· 218 218 endoff = (loff_t)(end - vma->vm_start - 1) 219 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 220 220 221 - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 + /* vmtruncate_range needs to take i_mutex */ 222 222 up_read(&current->mm->mmap_sem); 223 223 error = vmtruncate_range(mapping->host, offset, endoff); 224 224 down_read(&current->mm->mmap_sem);
-1
mm/rmap.c
··· 21 21 * Lock ordering in mm: 22 22 * 23 23 * inode->i_mutex (while writing or truncating, not reading or faulting) 24 - * inode->i_alloc_sem (vmtruncate_range) 25 24 * mm->mmap_sem 26 25 * page->flags PG_locked (lock_page) 27 26 * mapping->i_mmap_mutex
+1 -2
mm/truncate.c
··· 622 622 return -ENOSYS; 623 623 624 624 mutex_lock(&inode->i_mutex); 625 - down_write(&inode->i_alloc_sem); 625 + inode_dio_wait(inode); 626 626 unmap_mapping_range(mapping, offset, (end - offset), 1); 627 627 inode->i_op->truncate_range(inode, offset, end); 628 628 /* unmap again to remove racily COWed private pages */ 629 629 unmap_mapping_range(mapping, offset, (end - offset), 1); 630 - up_write(&inode->i_alloc_sem); 631 630 mutex_unlock(&inode->i_mutex); 632 631 633 632 return 0;