[PATCH] NFS: Fix race in nfs_release_page()

NFS: Fix race in nfs_release_page()

invalidate_inode_pages2() may find the dirty bit has been set on a page
owing to the fact that the page may still be mapped after it was locked.
Only after the call to unmap_mapping_range() are we sure that the page
can no longer be dirtied.
In order to fix this, NFS has hooked the releasepage() method and tries
to write the page out between the call to unmap_mapping_range() and the
call to remove_mapping(). This, however, leads to deadlocks in the page
reclaim code, where the page may be locked without holding a reference
to the inode or dentry.

The fix is to add a new address_space_operation, launder_page(), which will
attempt to write out a dirty page without releasing the page lock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

Also, the bare SetPageDirty() can skew all sorts of accounting, leading to
other nasties.

[akpm@osdl.org: cleanup]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Trond Myklebust and committed by Linus Torvalds e3db7691 07031e14

+28 -9
+8
Documentation/filesystems/Locking
··· 171 int (*releasepage) (struct page *, int); 172 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 173 loff_t offset, unsigned long nr_segs); 174 175 locking rules: 176 All except set_page_dirty may block ··· 189 invalidatepage: no yes 190 releasepage: no yes 191 direct_IO: no 192 193 ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() 194 may be called from the request handler (/dev/loop). ··· 282 buffers from the page in preparation for freeing it. It returns zero to 283 indicate that the buffers are (or may be) freeable. If ->releasepage is zero, 284 the kernel assumes that the fs has no private interest in the buffers. 285 286 Note: currently almost all instances of address_space methods are 287 using BKL for internal serialization and that's one of the worst sources
··· 171 int (*releasepage) (struct page *, int); 172 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 173 loff_t offset, unsigned long nr_segs); 174 + int (*launder_page) (struct page *); 175 176 locking rules: 177 All except set_page_dirty may block ··· 188 invalidatepage: no yes 189 releasepage: no yes 190 direct_IO: no 191 + launder_page: no yes 192 193 ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() 194 may be called from the request handler (/dev/loop). ··· 280 buffers from the page in preparation for freeing it. It returns zero to 281 indicate that the buffers are (or may be) freeable. If ->releasepage is zero, 282 the kernel assumes that the fs has no private interest in the buffers. 283 + 284 + ->launder_page() may be called prior to releasing a page if 285 + it is still found to be dirty. It returns zero if the page was successfully 286 + cleaned, or an error value if not. Note that in order to prevent the page 287 + getting mapped back in and redirtied, it needs to be kept locked 288 + across the entire operation. 289 290 Note: currently almost all instances of address_space methods are 291 using BKL for internal serialization and that's one of the worst sources
+8 -8
fs/nfs/file.c
··· 315 316 static int nfs_release_page(struct page *page, gfp_t gfp) 317 { 318 - /* 319 - * Avoid deadlock on nfs_wait_on_request(). 320 - */ 321 - if (!(gfp & __GFP_FS)) 322 - return 0; 323 - /* Hack... Force nfs_wb_page() to write out the page */ 324 - SetPageDirty(page); 325 - return !nfs_wb_page(page->mapping->host, page); 326 } 327 328 const struct address_space_operations nfs_file_aops = { ··· 337 #ifdef CONFIG_NFS_DIRECTIO 338 .direct_IO = nfs_direct_IO, 339 #endif 340 }; 341 342 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
··· 315 316 static int nfs_release_page(struct page *page, gfp_t gfp) 317 { 318 + /* If PagePrivate() is set, then the page is not freeable */ 319 + return 0; 320 + } 321 + 322 + static int nfs_launder_page(struct page *page) 323 + { 324 + return nfs_wb_page(page->mapping->host, page); 325 } 326 327 const struct address_space_operations nfs_file_aops = { ··· 338 #ifdef CONFIG_NFS_DIRECTIO 339 .direct_IO = nfs_direct_IO, 340 #endif 341 + .launder_page = nfs_launder_page, 342 }; 343 344 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
+1
include/linux/fs.h
··· 426 /* migrate the contents of a page to the specified target */ 427 int (*migratepage) (struct address_space *, 428 struct page *, struct page *); 429 }; 430 431 struct backing_dev_info;
··· 426 /* migrate the contents of a page to the specified target */ 427 int (*migratepage) (struct address_space *, 428 struct page *, struct page *); 429 + int (*launder_page) (struct page *); 430 }; 431 432 struct backing_dev_info;
+11 -1
mm/truncate.c
··· 341 return 0; 342 } 343 344 /** 345 * invalidate_inode_pages2_range - remove range of pages from an address_space 346 * @mapping: the address_space ··· 414 PAGE_CACHE_SIZE, 0); 415 } 416 } 417 - if (!invalidate_complete_page2(mapping, page)) 418 ret = -EIO; 419 unlock_page(page); 420 }
··· 341 return 0; 342 } 343 344 + static int do_launder_page(struct address_space *mapping, struct page *page) 345 + { 346 + if (!PageDirty(page)) 347 + return 0; 348 + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) 349 + return 0; 350 + return mapping->a_ops->launder_page(page); 351 + } 352 + 353 /** 354 * invalidate_inode_pages2_range - remove range of pages from an address_space 355 * @mapping: the address_space ··· 405 PAGE_CACHE_SIZE, 0); 406 } 407 } 408 + ret = do_launder_page(mapping, page); 409 + if (ret == 0 && !invalidate_complete_page2(mapping, page)) 410 ret = -EIO; 411 unlock_page(page); 412 }