Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 update from Ted Ts'o:
"Lots of bug fixes, cleanups and optimizations. In the bug fixes
category, of note is a fix for on-line resizing file systems where the
block size is smaller than the page size (i.e., file systems with 1k blocks
on x86, or more interestingly file systems with 4k blocks on Power or
ia64 systems.)

In the cleanup category, ext4's punch hole implementation was
significantly improved by Lukas Czerner, and now supports bigalloc
file systems. In addition, Jan Kara significantly cleaned up the
write submission code path. We also improved error checking and added
a few sanity checks.

In the optimizations category, two major optimizations deserve
mention. The first is that ext4_writepages() is now used for
nodelalloc and ext3 compatibility mode. This allows writes to be
submitted much more efficiently as a single bio request, instead of
being sent as individual 4k writes into the block layer (which then
relied on the elevator code to coalesce the requests in the block
queue). Secondly, the extent cache shrink mechanism, which was
introduced in 3.9, no longer has a scalability bottleneck caused by the
i_es_lru spinlock. Other optimizations include some changes to reduce
CPU usage and to avoid issuing empty commits unnecessarily."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (86 commits)
ext4: optimize starting extent in ext4_ext_rm_leaf()
jbd2: invalidate handle if jbd2_journal_restart() fails
ext4: translate flag bits to strings in tracepoints
ext4: fix up error handling for mpage_map_and_submit_extent()
jbd2: fix theoretical race in jbd2__journal_restart
ext4: only zero partial blocks in ext4_zero_partial_blocks()
ext4: check error return from ext4_write_inline_data_end()
ext4: delete unnecessary C statements
ext3,ext4: don't mess with dir_file->f_pos in htree_dirblock_to_tree()
jbd2: move superblock checksum calculation to jbd2_write_superblock()
ext4: pass inode pointer instead of file pointer to punch hole
ext4: improve free space calculation for inline_data
ext4: reduce object size when !CONFIG_PRINTK
ext4: improve extent cache shrink mechanism to avoid to burn CPU time
ext4: implement error handling of ext4_mb_new_preallocation()
ext4: fix corruption when online resizing a fs with 1K block size
ext4: delete unused variables
ext4: return FIEMAP_EXTENT_UNKNOWN for delalloc extents
jbd2: remove debug dependency on debug_fs and update Kconfig help text
jbd2: use a single printk for jbd_debug()
...

+2724 -2242
+3 -3
Documentation/filesystems/Locking
··· 189 189 loff_t pos, unsigned len, unsigned copied, 190 190 struct page *page, void *fsdata); 191 191 sector_t (*bmap)(struct address_space *, sector_t); 192 - int (*invalidatepage) (struct page *, unsigned long); 192 + void (*invalidatepage) (struct page *, unsigned int, unsigned int); 193 193 int (*releasepage) (struct page *, int); 194 194 void (*freepage)(struct page *); 195 195 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, ··· 310 310 keep it that way and don't breed new callers. 311 311 312 312 ->invalidatepage() is called when the filesystem must attempt to drop 313 - some or all of the buffers from the page when it is being truncated. It 314 - returns zero on success. If ->invalidatepage is zero, the kernel uses 313 + some or all of the buffers from the page when it is being truncated. It 314 + returns zero on success. If ->invalidatepage is zero, the kernel uses 315 315 block_invalidatepage() instead. 316 316 317 317 ->releasepage() is called when the kernel is about to try to drop the
+10 -10
Documentation/filesystems/vfs.txt
··· 549 549 ------------------------------- 550 550 551 551 This describes how the VFS can manipulate mapping of a file to page cache in 552 - your filesystem. As of kernel 2.6.22, the following members are defined: 552 + your filesystem. The following members are defined: 553 553 554 554 struct address_space_operations { 555 555 int (*writepage)(struct page *page, struct writeback_control *wbc); ··· 566 566 loff_t pos, unsigned len, unsigned copied, 567 567 struct page *page, void *fsdata); 568 568 sector_t (*bmap)(struct address_space *, sector_t); 569 - int (*invalidatepage) (struct page *, unsigned long); 569 + void (*invalidatepage) (struct page *, unsigned int, unsigned int); 570 570 int (*releasepage) (struct page *, int); 571 571 void (*freepage)(struct page *); 572 572 ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, ··· 685 685 invalidatepage: If a page has PagePrivate set, then invalidatepage 686 686 will be called when part or all of the page is to be removed 687 687 from the address space. This generally corresponds to either a 688 - truncation or a complete invalidation of the address space 689 - (in the latter case 'offset' will always be 0). 690 - Any private data associated with the page should be updated 691 - to reflect this truncation. If offset is 0, then 692 - the private data should be released, because the page 693 - must be able to be completely discarded. This may be done by 694 - calling the ->releasepage function, but in this case the 695 - release MUST succeed. 688 + truncation, punch hole or a complete invalidation of the address 689 + space (in the latter case 'offset' will always be 0 and 'length' 690 + will be PAGE_CACHE_SIZE). Any private data associated with the page 691 + should be updated to reflect this truncation. If offset is 0 and 692 + length is PAGE_CACHE_SIZE, then the private data should be released, 693 + because the page must be able to be completely discarded. 
This may 694 + be done by calling the ->releasepage function, but in this case the 695 + release MUST succeed. 696 696 697 697 releasepage: releasepage is called on PagePrivate pages to indicate 698 698 that the page should be freed if possible. ->releasepage
+3 -2
fs/9p/vfs_addr.c
··· 148 148 * @offset: offset in the page 149 149 */ 150 150 151 - static void v9fs_invalidate_page(struct page *page, unsigned long offset) 151 + static void v9fs_invalidate_page(struct page *page, unsigned int offset, 152 + unsigned int length) 152 153 { 153 154 /* 154 155 * If called with zero offset, we should release 155 156 * the private state assocated with the page 156 157 */ 157 - if (offset == 0) 158 + if (offset == 0 && length == PAGE_CACHE_SIZE) 158 159 v9fs_fscache_invalidate_page(page); 159 160 } 160 161
+6 -4
fs/afs/file.c
··· 19 19 #include "internal.h" 20 20 21 21 static int afs_readpage(struct file *file, struct page *page); 22 - static void afs_invalidatepage(struct page *page, unsigned long offset); 22 + static void afs_invalidatepage(struct page *page, unsigned int offset, 23 + unsigned int length); 23 24 static int afs_releasepage(struct page *page, gfp_t gfp_flags); 24 25 static int afs_launder_page(struct page *page); 25 26 ··· 311 310 * - release a page and clean up its private data if offset is 0 (indicating 312 311 * the entire page) 313 312 */ 314 - static void afs_invalidatepage(struct page *page, unsigned long offset) 313 + static void afs_invalidatepage(struct page *page, unsigned int offset, 314 + unsigned int length) 315 315 { 316 316 struct afs_writeback *wb = (struct afs_writeback *) page_private(page); 317 317 318 - _enter("{%lu},%lu", page->index, offset); 318 + _enter("{%lu},%u,%u", page->index, offset, length); 319 319 320 320 BUG_ON(!PageLocked(page)); 321 321 322 322 /* we clean up only if the entire page is being invalidated */ 323 - if (offset == 0) { 323 + if (offset == 0 && length == PAGE_CACHE_SIZE) { 324 324 #ifdef CONFIG_AFS_FSCACHE 325 325 if (PageFsCache(page)) { 326 326 struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+2 -1
fs/btrfs/disk-io.c
··· 1013 1013 return try_release_extent_buffer(page); 1014 1014 } 1015 1015 1016 - static void btree_invalidatepage(struct page *page, unsigned long offset) 1016 + static void btree_invalidatepage(struct page *page, unsigned int offset, 1017 + unsigned int length) 1017 1018 { 1018 1019 struct extent_io_tree *tree; 1019 1020 tree = &BTRFS_I(page->mapping->host)->io_tree;
+1 -1
fs/btrfs/extent_io.c
··· 2957 2957 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2958 2958 if (page->index > end_index || 2959 2959 (page->index == end_index && !pg_offset)) { 2960 - page->mapping->a_ops->invalidatepage(page, 0); 2960 + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); 2961 2961 unlock_page(page); 2962 2962 return 0; 2963 2963 }
+2 -1
fs/btrfs/inode.c
··· 7493 7493 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 7494 7494 } 7495 7495 7496 - static void btrfs_invalidatepage(struct page *page, unsigned long offset) 7496 + static void btrfs_invalidatepage(struct page *page, unsigned int offset, 7497 + unsigned int length) 7497 7498 { 7498 7499 struct inode *inode = page->mapping->host; 7499 7500 struct extent_io_tree *tree;
+18 -3
fs/buffer.c
··· 1454 1454 * block_invalidatepage - invalidate part or all of a buffer-backed page 1455 1455 * 1456 1456 * @page: the page which is affected 1457 - * @offset: the index of the truncation point 1457 + * @offset: start of the range to invalidate 1458 + * @length: length of the range to invalidate 1458 1459 * 1459 1460 * block_invalidatepage() is called when all or part of the page has become 1460 1461 * invalidated by a truncate operation. ··· 1466 1465 * point. Because the caller is about to free (and possibly reuse) those 1467 1466 * blocks on-disk. 1468 1467 */ 1469 - void block_invalidatepage(struct page *page, unsigned long offset) 1468 + void block_invalidatepage(struct page *page, unsigned int offset, 1469 + unsigned int length) 1470 1470 { 1471 1471 struct buffer_head *head, *bh, *next; 1472 1472 unsigned int curr_off = 0; 1473 + unsigned int stop = length + offset; 1473 1474 1474 1475 BUG_ON(!PageLocked(page)); 1475 1476 if (!page_has_buffers(page)) 1476 1477 goto out; 1478 + 1479 + /* 1480 + * Check for overflow 1481 + */ 1482 + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); 1477 1483 1478 1484 head = page_buffers(page); 1479 1485 bh = head; 1480 1486 do { 1481 1487 unsigned int next_off = curr_off + bh->b_size; 1482 1488 next = bh->b_this_page; 1489 + 1490 + /* 1491 + * Are we still fully in range ? 1492 + */ 1493 + if (next_off > stop) 1494 + goto out; 1483 1495 1484 1496 /* 1485 1497 * is this block fully invalidated? ··· 1514 1500 return; 1515 1501 } 1516 1502 EXPORT_SYMBOL(block_invalidatepage); 1503 + 1517 1504 1518 1505 /* 1519 1506 * We attach and possibly dirty the buffers atomically wrt ··· 2856 2841 * they may have been added in ext3_writepage(). Make them 2857 2842 * freeable here, so the page does not leak. 2858 2843 */ 2859 - do_invalidatepage(page, 0); 2844 + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); 2860 2845 unlock_page(page); 2861 2846 return 0; /* don't care */ 2862 2847 }
+8 -7
fs/ceph/addr.c
··· 143 143 * dirty page counters appropriately. Only called if there is private 144 144 * data on the page. 145 145 */ 146 - static void ceph_invalidatepage(struct page *page, unsigned long offset) 146 + static void ceph_invalidatepage(struct page *page, unsigned int offset, 147 + unsigned int length) 147 148 { 148 149 struct inode *inode; 149 150 struct ceph_inode_info *ci; ··· 164 163 if (!PageDirty(page)) 165 164 pr_err("%p invalidatepage %p page not dirty\n", inode, page); 166 165 167 - if (offset == 0) 166 + if (offset == 0 && length == PAGE_CACHE_SIZE) 168 167 ClearPageChecked(page); 169 168 170 169 ci = ceph_inode(inode); 171 - if (offset == 0) { 172 - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", 173 - inode, page, page->index, offset); 170 + if (offset == 0 && length == PAGE_CACHE_SIZE) { 171 + dout("%p invalidatepage %p idx %lu full dirty page\n", 172 + inode, page, page->index); 174 173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 175 174 ceph_put_snap_context(snapc); 176 175 page->private = 0; 177 176 ClearPagePrivate(page); 178 177 } else { 179 - dout("%p invalidatepage %p idx %lu partial dirty page\n", 180 - inode, page, page->index); 178 + dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", 179 + inode, page, page->index, offset, length); 181 180 } 182 181 } 183 182
+3 -2
fs/cifs/file.c
··· 3546 3546 return cifs_fscache_release_page(page, gfp); 3547 3547 } 3548 3548 3549 - static void cifs_invalidate_page(struct page *page, unsigned long offset) 3549 + static void cifs_invalidate_page(struct page *page, unsigned int offset, 3550 + unsigned int length) 3550 3551 { 3551 3552 struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); 3552 3553 3553 - if (offset == 0) 3554 + if (offset == 0 && length == PAGE_CACHE_SIZE) 3554 3555 cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); 3555 3556 } 3556 3557
+4 -2
fs/exofs/inode.c
··· 953 953 return 0; 954 954 } 955 955 956 - static void exofs_invalidatepage(struct page *page, unsigned long offset) 956 + static void exofs_invalidatepage(struct page *page, unsigned int offset, 957 + unsigned int length) 957 958 { 958 - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); 959 + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", 960 + page->index, offset, length); 959 961 WARN_ON(1); 960 962 } 961 963
+5 -4
fs/ext3/inode.c
··· 1825 1825 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); 1826 1826 } 1827 1827 1828 - static void ext3_invalidatepage(struct page *page, unsigned long offset) 1828 + static void ext3_invalidatepage(struct page *page, unsigned int offset, 1829 + unsigned int length) 1829 1830 { 1830 1831 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1831 1832 1832 - trace_ext3_invalidatepage(page, offset); 1833 + trace_ext3_invalidatepage(page, offset, length); 1833 1834 1834 1835 /* 1835 1836 * If it's a full truncate we just forget about the pending dirtying 1836 1837 */ 1837 - if (offset == 0) 1838 + if (offset == 0 && length == PAGE_CACHE_SIZE) 1838 1839 ClearPageChecked(page); 1839 1840 1840 - journal_invalidatepage(journal, page, offset); 1841 + journal_invalidatepage(journal, page, offset, length); 1841 1842 } 1842 1843 1843 1844 static int ext3_releasepage(struct page *page, gfp_t wait)
+2 -5
fs/ext3/namei.c
··· 576 576 if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 577 577 (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) 578 578 +((char *)de - bh->b_data))) { 579 - /* On error, skip the f_pos to the next block. */ 580 - dir_file->f_pos = (dir_file->f_pos | 581 - (dir->i_sb->s_blocksize - 1)) + 1; 582 - brelse (bh); 583 - return count; 579 + /* silently ignore the rest of the block */ 580 + break; 584 581 } 585 582 ext3fs_dirhash(de->name, de->name_len, hinfo); 586 583 if ((hinfo->hash < start_hash) ||
+9 -5
fs/ext4/balloc.c
··· 682 682 683 683 static inline int test_root(ext4_group_t a, int b) 684 684 { 685 - int num = b; 686 - 687 - while (a > num) 688 - num *= b; 689 - return num == a; 685 + while (1) { 686 + if (a < b) 687 + return 0; 688 + if (a == b) 689 + return 1; 690 + if ((a % b) != 0) 691 + return 0; 692 + a = a / b; 693 + } 690 694 } 691 695 692 696 static int ext4_group_sparse(ext4_group_t group)
+122 -65
fs/ext4/ext4.h
··· 177 177 }; 178 178 179 179 /* 180 - * For delayed allocation tracking 181 - */ 182 - struct mpage_da_data { 183 - struct inode *inode; 184 - sector_t b_blocknr; /* start block number of extent */ 185 - size_t b_size; /* size of extent */ 186 - unsigned long b_state; /* state of the extent */ 187 - unsigned long first_page, next_page; /* extent of pages */ 188 - struct writeback_control *wbc; 189 - int io_done; 190 - int pages_written; 191 - int retval; 192 - }; 193 - 194 - /* 195 180 * Flags for ext4_io_end->flags 196 181 */ 197 182 #define EXT4_IO_END_UNWRITTEN 0x0001 198 - #define EXT4_IO_END_ERROR 0x0002 199 - #define EXT4_IO_END_DIRECT 0x0004 183 + #define EXT4_IO_END_DIRECT 0x0002 200 184 201 185 /* 202 - * For converting uninitialized extents on a work queue. 186 + * For converting uninitialized extents on a work queue. 'handle' is used for 187 + * buffered writeback. 203 188 */ 204 189 typedef struct ext4_io_end { 205 190 struct list_head list; /* per-file finished IO list */ 191 + handle_t *handle; /* handle reserved for extent 192 + * conversion */ 206 193 struct inode *inode; /* file being written to */ 194 + struct bio *bio; /* Linked list of completed 195 + * bios covering the extent */ 207 196 unsigned int flag; /* unwritten or not */ 208 197 loff_t offset; /* offset in the file */ 209 198 ssize_t size; /* size of the extent */ 210 199 struct kiocb *iocb; /* iocb struct for AIO */ 211 200 int result; /* error value for AIO */ 201 + atomic_t count; /* reference counter */ 212 202 } ext4_io_end_t; 213 203 214 204 struct ext4_io_submit { ··· 571 581 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 572 582 573 583 /* 574 - * Flags used by ext4_discard_partial_page_buffers 575 - */ 576 - #define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 577 - 578 - /* 579 584 * ioctl commands 580 585 */ 581 586 #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS ··· 864 879 rwlock_t i_es_lock; 865 880 struct list_head i_es_lru; 866 881 unsigned int i_es_lru_nr; /* protected 
by i_es_lock */ 882 + unsigned long i_touch_when; /* jiffies of last accessing */ 867 883 868 884 /* ialloc */ 869 885 ext4_group_t i_last_alloc_group; ··· 889 903 qsize_t i_reserved_quota; 890 904 #endif 891 905 892 - /* completed IOs that might need unwritten extents handling */ 893 - struct list_head i_completed_io_list; 906 + /* Lock protecting lists below */ 894 907 spinlock_t i_completed_io_lock; 908 + /* 909 + * Completed IOs that need unwritten extents handling and have 910 + * transaction reserved 911 + */ 912 + struct list_head i_rsv_conversion_list; 913 + /* 914 + * Completed IOs that need unwritten extents handling and don't have 915 + * transaction reserved 916 + */ 917 + struct list_head i_unrsv_conversion_list; 895 918 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 896 919 atomic_t i_unwritten; /* Nr. of inflight conversions pending */ 897 - struct work_struct i_unwritten_work; /* deferred extent conversion */ 920 + struct work_struct i_rsv_conversion_work; 921 + struct work_struct i_unrsv_conversion_work; 898 922 899 923 spinlock_t i_block_reservation_lock; 900 924 ··· 1241 1245 unsigned int s_mb_stats; 1242 1246 unsigned int s_mb_order2_reqs; 1243 1247 unsigned int s_mb_group_prealloc; 1244 - unsigned int s_max_writeback_mb_bump; 1245 1248 unsigned int s_max_dir_size_kb; 1246 1249 /* where last allocation was done - for stream allocation */ 1247 1250 unsigned long s_mb_last_group; ··· 1276 1281 struct flex_groups *s_flex_groups; 1277 1282 ext4_group_t s_flex_groups_allocated; 1278 1283 1279 - /* workqueue for dio unwritten */ 1280 - struct workqueue_struct *dio_unwritten_wq; 1284 + /* workqueue for unreserved extent convertions (dio) */ 1285 + struct workqueue_struct *unrsv_conversion_wq; 1286 + /* workqueue for reserved extent conversions (buffered io) */ 1287 + struct workqueue_struct *rsv_conversion_wq; 1281 1288 1282 1289 /* timer for periodic error stats printing */ 1283 1290 struct timer_list s_err_report; ··· 1304 1307 /* 
Reclaim extents from extent status tree */ 1305 1308 struct shrinker s_es_shrinker; 1306 1309 struct list_head s_es_lru; 1310 + unsigned long s_es_last_sorted; 1307 1311 struct percpu_counter s_extent_cache_cnt; 1308 1312 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1309 1313 }; ··· 1340 1342 struct ext4_io_end *io_end) 1341 1343 { 1342 1344 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 1345 + /* Writeback has to have coversion transaction reserved */ 1346 + WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && 1347 + !(io_end->flag & EXT4_IO_END_DIRECT)); 1343 1348 io_end->flag |= EXT4_IO_END_UNWRITTEN; 1344 1349 atomic_inc(&EXT4_I(inode)->i_unwritten); 1345 1350 } ··· 2000 1999 2001 2000 /* fsync.c */ 2002 2001 extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 2003 - extern int ext4_flush_unwritten_io(struct inode *); 2004 2002 2005 2003 /* hash.c */ 2006 2004 extern int ext4fs_dirhash(const char *name, int len, struct ··· 2088 2088 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 2089 2089 extern int ext4_can_truncate(struct inode *inode); 2090 2090 extern void ext4_truncate(struct inode *); 2091 - extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); 2091 + extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); 2092 2092 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 2093 2093 extern void ext4_set_inode_flags(struct inode *); 2094 2094 extern void ext4_get_inode_flags(struct ext4_inode_info *); ··· 2096 2096 extern void ext4_set_aops(struct inode *inode); 2097 2097 extern int ext4_writepage_trans_blocks(struct inode *); 2098 2098 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2099 - extern int ext4_discard_partial_page_buffers(handle_t *handle, 2100 - struct address_space *mapping, loff_t from, 2101 - loff_t length, int flags); 2099 + extern int ext4_block_truncate_page(handle_t *handle, 2100 + struct 
address_space *mapping, loff_t from); 2101 + extern int ext4_block_zero_page_range(handle_t *handle, 2102 + struct address_space *mapping, loff_t from, loff_t length); 2103 + extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2104 + loff_t lstart, loff_t lend); 2102 2105 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2103 2106 extern qsize_t *ext4_get_reserved_space(struct inode *inode); 2104 2107 extern void ext4_da_update_reserve_space(struct inode *inode, ··· 2114 2111 const struct iovec *iov, loff_t offset, 2115 2112 unsigned long nr_segs); 2116 2113 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2117 - extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); 2114 + extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2118 2115 extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2119 2116 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 2120 2117 ext4_lblk_t first, ext4_lblk_t stop); ··· 2169 2166 ext4_group_t ngroup); 2170 2167 extern const char *ext4_decode_error(struct super_block *sb, int errno, 2171 2168 char nbuf[16]); 2169 + 2172 2170 extern __printf(4, 5) 2173 2171 void __ext4_error(struct super_block *, const char *, unsigned int, 2174 2172 const char *, ...); 2175 - #define ext4_error(sb, message...) 
__ext4_error(sb, __func__, \ 2176 - __LINE__, ## message) 2177 2173 extern __printf(5, 6) 2178 - void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, 2174 + void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, 2179 2175 const char *, ...); 2180 2176 extern __printf(5, 6) 2181 - void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, 2177 + void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, 2182 2178 const char *, ...); 2183 2179 extern void __ext4_std_error(struct super_block *, const char *, 2184 2180 unsigned int, int); 2185 2181 extern __printf(4, 5) 2186 2182 void __ext4_abort(struct super_block *, const char *, unsigned int, 2187 2183 const char *, ...); 2188 - #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ 2189 - __LINE__, ## message) 2190 2184 extern __printf(4, 5) 2191 2185 void __ext4_warning(struct super_block *, const char *, unsigned int, 2192 2186 const char *, ...); 2193 - #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ 2194 - __LINE__, ## message) 2195 2187 extern __printf(3, 4) 2196 - void ext4_msg(struct super_block *, const char *, const char *, ...); 2188 + void __ext4_msg(struct super_block *, const char *, const char *, ...); 2197 2189 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 2198 2190 const char *, unsigned int, const char *); 2199 - #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ 2200 - __LINE__, msg) 2201 2191 extern __printf(7, 8) 2202 2192 void __ext4_grp_locked_error(const char *, unsigned int, 2203 2193 struct super_block *, ext4_group_t, 2204 2194 unsigned long, ext4_fsblk_t, 2205 2195 const char *, ...); 2206 - #define ext4_grp_locked_error(sb, grp, message...) 
\ 2207 - __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) 2196 + 2197 + #ifdef CONFIG_PRINTK 2198 + 2199 + #define ext4_error_inode(inode, func, line, block, fmt, ...) \ 2200 + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) 2201 + #define ext4_error_file(file, func, line, block, fmt, ...) \ 2202 + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) 2203 + #define ext4_error(sb, fmt, ...) \ 2204 + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 2205 + #define ext4_abort(sb, fmt, ...) \ 2206 + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 2207 + #define ext4_warning(sb, fmt, ...) \ 2208 + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 2209 + #define ext4_msg(sb, level, fmt, ...) \ 2210 + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) 2211 + #define dump_mmp_msg(sb, mmp, msg) \ 2212 + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) 2213 + #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ 2214 + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ 2215 + fmt, ##__VA_ARGS__) 2216 + 2217 + #else 2218 + 2219 + #define ext4_error_inode(inode, func, line, block, fmt, ...) \ 2220 + do { \ 2221 + no_printk(fmt, ##__VA_ARGS__); \ 2222 + __ext4_error_inode(inode, "", 0, block, " "); \ 2223 + } while (0) 2224 + #define ext4_error_file(file, func, line, block, fmt, ...) \ 2225 + do { \ 2226 + no_printk(fmt, ##__VA_ARGS__); \ 2227 + __ext4_error_file(file, "", 0, block, " "); \ 2228 + } while (0) 2229 + #define ext4_error(sb, fmt, ...) \ 2230 + do { \ 2231 + no_printk(fmt, ##__VA_ARGS__); \ 2232 + __ext4_error(sb, "", 0, " "); \ 2233 + } while (0) 2234 + #define ext4_abort(sb, fmt, ...) \ 2235 + do { \ 2236 + no_printk(fmt, ##__VA_ARGS__); \ 2237 + __ext4_abort(sb, "", 0, " "); \ 2238 + } while (0) 2239 + #define ext4_warning(sb, fmt, ...) 
\ 2240 + do { \ 2241 + no_printk(fmt, ##__VA_ARGS__); \ 2242 + __ext4_warning(sb, "", 0, " "); \ 2243 + } while (0) 2244 + #define ext4_msg(sb, level, fmt, ...) \ 2245 + do { \ 2246 + no_printk(fmt, ##__VA_ARGS__); \ 2247 + __ext4_msg(sb, "", " "); \ 2248 + } while (0) 2249 + #define dump_mmp_msg(sb, mmp, msg) \ 2250 + __dump_mmp_msg(sb, mmp, "", 0, "") 2251 + #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ 2252 + do { \ 2253 + no_printk(fmt, ##__VA_ARGS__); \ 2254 + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ 2255 + } while (0) 2256 + 2257 + #endif 2258 + 2208 2259 extern void ext4_update_dynamic_rev(struct super_block *sb); 2209 2260 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 2210 2261 __u32 compat); ··· 2369 2312 { 2370 2313 struct ext4_group_info ***grp_info; 2371 2314 long indexv, indexh; 2315 + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); 2372 2316 grp_info = EXT4_SB(sb)->s_group_info; 2373 2317 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); 2374 2318 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); ··· 2656 2598 2657 2599 extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2658 2600 extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 2659 - extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 2660 - int chunk); 2601 + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); 2661 2602 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2662 2603 struct ext4_map_blocks *map, int flags); 2663 2604 extern void ext4_ext_truncate(handle_t *, struct inode *); ··· 2666 2609 extern void ext4_ext_release(struct super_block *); 2667 2610 extern long ext4_fallocate(struct file *file, int mode, loff_t offset, 2668 2611 loff_t len); 2669 - extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2670 - ssize_t len); 2612 + extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode 
*inode, 2613 + loff_t offset, ssize_t len); 2671 2614 extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 2672 2615 struct ext4_map_blocks *map, int flags); 2673 2616 extern int ext4_ext_calc_metadata_amount(struct inode *inode, ··· 2707 2650 2708 2651 /* page-io.c */ 2709 2652 extern int __init ext4_init_pageio(void); 2710 - extern void ext4_add_complete_io(ext4_io_end_t *io_end); 2711 2653 extern void ext4_exit_pageio(void); 2712 - extern void ext4_ioend_shutdown(struct inode *); 2713 - extern void ext4_free_io_end(ext4_io_end_t *io); 2714 2654 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2715 - extern void ext4_end_io_work(struct work_struct *work); 2655 + extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); 2656 + extern int ext4_put_io_end(ext4_io_end_t *io_end); 2657 + extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); 2658 + extern void ext4_io_submit_init(struct ext4_io_submit *io, 2659 + struct writeback_control *wbc); 2660 + extern void ext4_end_io_rsv_work(struct work_struct *work); 2661 + extern void ext4_end_io_unrsv_work(struct work_struct *work); 2716 2662 extern void ext4_io_submit(struct ext4_io_submit *io); 2717 2663 extern int ext4_bio_write_page(struct ext4_io_submit *io, 2718 2664 struct page *page, ··· 2728 2668 extern int ext4_mmp_csum_verify(struct super_block *sb, 2729 2669 struct mmp_struct *mmp); 2730 2670 2731 - /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2671 + /* 2672 + * Note that these flags will never ever appear in a buffer_head's state flag. 2673 + * See EXT4_MAP_... to see where this is used. 2674 + */ 2732 2675 enum ext4_state_bits { 2733 2676 BH_Uninit /* blocks are allocated but uninitialized on disk */ 2734 - = BH_JBDPrivateStart, 2677 + = BH_JBDPrivateStart, 2735 2678 BH_AllocFromCluster, /* allocated blocks were part of already 2736 - * allocated cluster. 
Note that this flag will 2737 - * never, ever appear in a buffer_head's state 2738 - * flag. See EXT4_MAP_FROM_CLUSTER to see where 2739 - * this is used. */ 2679 + * allocated cluster. */ 2740 2680 }; 2741 - 2742 - BUFFER_FNS(Uninit, uninit) 2743 - TAS_BUFFER_FNS(Uninit, uninit) 2744 2681 2745 2682 /* 2746 2683 * Add new method to test whether block and inode bitmaps are properly
+47 -11
fs/ext4/ext4_jbd2.c
···
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
···
 	if (err)
 		__ext4_std_error(sb, where, line, err);
 	return err;
+}
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
 }
 
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
+23 -6
fs/ext4/ext4_jbd2.h
···
 #define EXT4_HT_MIGRATE			 8
 #define EXT4_HT_MOVE_EXTENTS		 9
 #define EXT4_HT_XATTR			10
-#define EXT4_HT_MAX			11
+#define EXT4_HT_EXT_CONVERT		11
+#define EXT4_HT_MAX			12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
···
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
···
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
+
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
 
 static inline handle_t *ext4_journal_current_handle(void)
 {
+115 -78
fs/ext4/extents.c
···
 	next_del = ext4_find_delayed_extent(inode, &es);
 	if (!exists && next_del) {
 		exists = 1;
-		flags |= FIEMAP_EXTENT_DELALLOC;
+		flags |= (FIEMAP_EXTENT_DELALLOC |
+			  FIEMAP_EXTENT_UNKNOWN);
 	}
 	up_read(&EXT4_I(inode)->i_data_sem);
 
···
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
 */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
···
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
···
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
···
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
···
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
···
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:   The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
···
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
···
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
···
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
···
 				sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
···
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
···
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
···
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
···
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
···
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
···
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
···
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				   ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
···
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
···
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
···
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
···
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
+55 -20
fs/ext4/extents_status.c
···
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
···
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
···
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
···
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
···
 					EXTENT_STATUS_WRITTEN);
 }
 
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
···
 	if (!nr_to_scan)
 		return ret;
 
-	INIT_LIST_HEAD(&scanned);
-
 	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
···
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
···
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
···
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 
+3 -2
fs/ext4/extents_status.h
···
 				     EXTENT_STATUS_DELAYED | \
 				     EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
···
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
+7 -7
fs/ext4/file.c
···
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
···
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
···
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
···
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
···
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
···
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
···
 							      &map, &holeoff);
 			if (!unwritten) {
 				last += ret;
-				holeoff = last << blkbits;
+				holeoff = (loff_t)last << blkbits;
 				continue;
 			}
 		}
+12 -40
fs/ext4/fsync.c
···
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode:	inode to sync
- * @datasync:	only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking.  This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly.  The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
···
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
···
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
-		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
+		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
···
 		if (!ret)
 			ret = err;
 	}
-out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
+2 -1
fs/ext4/ialloc.c
···
 	if (!handle) {
 		BUG_ON(nblocks <= 0);
 		handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-						 handle_type, nblocks);
+						 handle_type, nblocks,
+						 0);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
 			ext4_std_error(sb, err);
+14 -26
fs/ext4/indirect.c
···
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
···
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
···
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
···
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
+2 -2
fs/ext4/inline.c
···
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
···
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
+794 -975
fs/ext4/inode.c
···
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
···
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
···
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
···
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t	index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 				       struct inode *inode,
···
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	ext4_es_lru_add(inode);
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
···
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
···
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
···
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
 
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
+
+		if (next_off > stop)
+			break;
 
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
···
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
1448 1474 */ 1449 - index = mpd->first_page; 1450 - end = mpd->next_page - 1; 1475 + struct ext4_map_blocks map; 1476 + struct ext4_io_submit io_submit; /* IO submission data */ 1477 + }; 1451 1478 1452 - pagevec_init(&pvec, 0); 1453 - while (index <= end) { 1454 - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1455 - if (nr_pages == 0) 1456 - break; 1457 - for (i = 0; i < nr_pages; i++) { 1458 - int skip_page = 0; 1459 - struct page *page = pvec.pages[i]; 1460 - 1461 - index = page->index; 1462 - if (index > end) 1463 - break; 1464 - 1465 - if (index == size >> PAGE_CACHE_SHIFT) 1466 - len = size & ~PAGE_CACHE_MASK; 1467 - else 1468 - len = PAGE_CACHE_SIZE; 1469 - if (map) { 1470 - cur_logical = index << (PAGE_CACHE_SHIFT - 1471 - inode->i_blkbits); 1472 - pblock = map->m_pblk + (cur_logical - 1473 - map->m_lblk); 1474 - } 1475 - index++; 1476 - 1477 - BUG_ON(!PageLocked(page)); 1478 - BUG_ON(PageWriteback(page)); 1479 - 1480 - bh = page_bufs = page_buffers(page); 1481 - block_start = 0; 1482 - do { 1483 - if (map && (cur_logical >= map->m_lblk) && 1484 - (cur_logical <= (map->m_lblk + 1485 - (map->m_len - 1)))) { 1486 - if (buffer_delay(bh)) { 1487 - clear_buffer_delay(bh); 1488 - bh->b_blocknr = pblock; 1489 - } 1490 - if (buffer_unwritten(bh) || 1491 - buffer_mapped(bh)) 1492 - BUG_ON(bh->b_blocknr != pblock); 1493 - if (map->m_flags & EXT4_MAP_UNINIT) 1494 - set_buffer_uninit(bh); 1495 - clear_buffer_unwritten(bh); 1496 - } 1497 - 1498 - /* 1499 - * skip page if block allocation undone and 1500 - * block is dirty 1501 - */ 1502 - if (ext4_bh_delay_or_unwritten(NULL, bh)) 1503 - skip_page = 1; 1504 - bh = bh->b_this_page; 1505 - block_start += bh->b_size; 1506 - cur_logical++; 1507 - pblock++; 1508 - } while (bh != page_bufs); 1509 - 1510 - if (skip_page) { 1511 - unlock_page(page); 1512 - continue; 1513 - } 1514 - 1515 - clear_page_dirty_for_io(page); 1516 - err = ext4_bio_write_page(&io_submit, page, len, 1517 - mpd->wbc); 1518 - if (!err) 
1519 - mpd->pages_written++; 1520 - /* 1521 - * In error case, we have to continue because 1522 - * remaining pages are still locked 1523 - */ 1524 - if (ret == 0) 1525 - ret = err; 1526 - } 1527 - pagevec_release(&pvec); 1528 - } 1529 - ext4_io_submit(&io_submit); 1530 - return ret; 1531 - } 1532 - 1533 - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 1479 + static void mpage_release_unused_pages(struct mpage_da_data *mpd, 1480 + bool invalidate) 1534 1481 { 1535 1482 int nr_pages, i; 1536 1483 pgoff_t index, end; 1537 1484 struct pagevec pvec; 1538 1485 struct inode *inode = mpd->inode; 1539 1486 struct address_space *mapping = inode->i_mapping; 1540 - ext4_lblk_t start, last; 1487 + 1488 + /* This is necessary when next_page == 0. */ 1489 + if (mpd->first_page >= mpd->next_page) 1490 + return; 1541 1491 1542 1492 index = mpd->first_page; 1543 1493 end = mpd->next_page - 1; 1544 - 1545 - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1546 - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1547 - ext4_es_remove_extent(inode, start, last - start + 1); 1494 + if (invalidate) { 1495 + ext4_lblk_t start, last; 1496 + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1497 + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1498 + ext4_es_remove_extent(inode, start, last - start + 1); 1499 + } 1548 1500 1549 1501 pagevec_init(&pvec, 0); 1550 1502 while (index <= end) { ··· 1460 1606 break; 1461 1607 BUG_ON(!PageLocked(page)); 1462 1608 BUG_ON(PageWriteback(page)); 1463 - block_invalidatepage(page, 0); 1464 - ClearPageUptodate(page); 1609 + if (invalidate) { 1610 + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); 1611 + ClearPageUptodate(page); 1612 + } 1465 1613 unlock_page(page); 1466 1614 } 1467 1615 index = pvec.pages[nr_pages - 1]->index + 1; 1468 1616 pagevec_release(&pvec); 1469 1617 } 1470 - return; 1471 1618 } 1472 1619 1473 1620 static void ext4_print_free_blocks(struct inode *inode) ··· 1494 1639 
ei->i_reserved_meta_blocks); 1495 1640 ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", 1496 1641 ei->i_allocated_meta_blocks); 1497 - return; 1498 - } 1499 - 1500 - /* 1501 - * mpage_da_map_and_submit - go through given space, map them 1502 - * if necessary, and then submit them for I/O 1503 - * 1504 - * @mpd - bh describing space 1505 - * 1506 - * The function skips space we know is already mapped to disk blocks. 1507 - * 1508 - */ 1509 - static void mpage_da_map_and_submit(struct mpage_da_data *mpd) 1510 - { 1511 - int err, blks, get_blocks_flags; 1512 - struct ext4_map_blocks map, *mapp = NULL; 1513 - sector_t next = mpd->b_blocknr; 1514 - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 1515 - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 1516 - handle_t *handle = NULL; 1517 - 1518 - /* 1519 - * If the blocks are mapped already, or we couldn't accumulate 1520 - * any blocks, then proceed immediately to the submission stage. 1521 - */ 1522 - if ((mpd->b_size == 0) || 1523 - ((mpd->b_state & (1 << BH_Mapped)) && 1524 - !(mpd->b_state & (1 << BH_Delay)) && 1525 - !(mpd->b_state & (1 << BH_Unwritten)))) 1526 - goto submit_io; 1527 - 1528 - handle = ext4_journal_current_handle(); 1529 - BUG_ON(!handle); 1530 - 1531 - /* 1532 - * Call ext4_map_blocks() to allocate any delayed allocation 1533 - * blocks, or to convert an uninitialized extent to be 1534 - * initialized (in the case where we have written into 1535 - * one or more preallocated blocks). 1536 - * 1537 - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 1538 - * indicate that we are on the delayed allocation path. This 1539 - * affects functions in many different parts of the allocation 1540 - * call path. This flag exists primarily because we don't 1541 - * want to change *many* call functions, so ext4_map_blocks() 1542 - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the 1543 - * inode's allocation semaphore is taken. 
1544 - * 1545 - * If the blocks in questions were delalloc blocks, set 1546 - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 1547 - * variables are updated after the blocks have been allocated. 1548 - */ 1549 - map.m_lblk = next; 1550 - map.m_len = max_blocks; 1551 - /* 1552 - * We're in delalloc path and it is possible that we're going to 1553 - * need more metadata blocks than previously reserved. However 1554 - * we must not fail because we're in writeback and there is 1555 - * nothing we can do about it so it might result in data loss. 1556 - * So use reserved blocks to allocate metadata if possible. 1557 - */ 1558 - get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 1559 - EXT4_GET_BLOCKS_METADATA_NOFAIL; 1560 - if (ext4_should_dioread_nolock(mpd->inode)) 1561 - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 1562 - if (mpd->b_state & (1 << BH_Delay)) 1563 - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 1564 - 1565 - 1566 - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 1567 - if (blks < 0) { 1568 - struct super_block *sb = mpd->inode->i_sb; 1569 - 1570 - err = blks; 1571 - /* 1572 - * If get block returns EAGAIN or ENOSPC and there 1573 - * appears to be free blocks we will just let 1574 - * mpage_da_submit_io() unlock all of the pages. 1575 - */ 1576 - if (err == -EAGAIN) 1577 - goto submit_io; 1578 - 1579 - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { 1580 - mpd->retval = err; 1581 - goto submit_io; 1582 - } 1583 - 1584 - /* 1585 - * get block failure will cause us to loop in 1586 - * writepages, because a_ops->writepage won't be able 1587 - * to make progress. The page will be redirtied by 1588 - * writepage and writepages will again try to write 1589 - * the same. 
1590 - */ 1591 - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { 1592 - ext4_msg(sb, KERN_CRIT, 1593 - "delayed block allocation failed for inode %lu " 1594 - "at logical offset %llu with max blocks %zd " 1595 - "with error %d", mpd->inode->i_ino, 1596 - (unsigned long long) next, 1597 - mpd->b_size >> mpd->inode->i_blkbits, err); 1598 - ext4_msg(sb, KERN_CRIT, 1599 - "This should not happen!! Data will be lost"); 1600 - if (err == -ENOSPC) 1601 - ext4_print_free_blocks(mpd->inode); 1602 - } 1603 - /* invalidate all the pages */ 1604 - ext4_da_block_invalidatepages(mpd); 1605 - 1606 - /* Mark this page range as having been completed */ 1607 - mpd->io_done = 1; 1608 - return; 1609 - } 1610 - BUG_ON(blks == 0); 1611 - 1612 - mapp = &map; 1613 - if (map.m_flags & EXT4_MAP_NEW) { 1614 - struct block_device *bdev = mpd->inode->i_sb->s_bdev; 1615 - int i; 1616 - 1617 - for (i = 0; i < map.m_len; i++) 1618 - unmap_underlying_metadata(bdev, map.m_pblk + i); 1619 - } 1620 - 1621 - /* 1622 - * Update on-disk size along with block allocation. 
1623 - */ 1624 - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 1625 - if (disksize > i_size_read(mpd->inode)) 1626 - disksize = i_size_read(mpd->inode); 1627 - if (disksize > EXT4_I(mpd->inode)->i_disksize) { 1628 - ext4_update_i_disksize(mpd->inode, disksize); 1629 - err = ext4_mark_inode_dirty(handle, mpd->inode); 1630 - if (err) 1631 - ext4_error(mpd->inode->i_sb, 1632 - "Failed to mark inode %lu dirty", 1633 - mpd->inode->i_ino); 1634 - } 1635 - 1636 - submit_io: 1637 - mpage_da_submit_io(mpd, mapp); 1638 - mpd->io_done = 1; 1639 - } 1640 - 1641 - #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1642 - (1 << BH_Delay) | (1 << BH_Unwritten)) 1643 - 1644 - /* 1645 - * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1646 - * 1647 - * @mpd->lbh - extent of blocks 1648 - * @logical - logical number of the block in the file 1649 - * @b_state - b_state of the buffer head added 1650 - * 1651 - * the function is used to collect contig. blocks in same state 1652 - */ 1653 - static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, 1654 - unsigned long b_state) 1655 - { 1656 - sector_t next; 1657 - int blkbits = mpd->inode->i_blkbits; 1658 - int nrblocks = mpd->b_size >> blkbits; 1659 - 1660 - /* 1661 - * XXX Don't go larger than mballoc is willing to allocate 1662 - * This is a stopgap solution. We eventually need to fold 1663 - * mpage_da_submit_io() into this function and then call 1664 - * ext4_map_blocks() multiple times in a loop 1665 - */ 1666 - if (nrblocks >= (8*1024*1024 >> blkbits)) 1667 - goto flush_it; 1668 - 1669 - /* check if the reserved journal credits might overflow */ 1670 - if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { 1671 - if (nrblocks >= EXT4_MAX_TRANS_DATA) { 1672 - /* 1673 - * With non-extent format we are limited by the journal 1674 - * credit available. Total credit needed to insert 1675 - * nrblocks contiguous blocks is dependent on the 1676 - * nrblocks. 
So limit nrblocks. 1677 - */ 1678 - goto flush_it; 1679 - } 1680 - } 1681 - /* 1682 - * First block in the extent 1683 - */ 1684 - if (mpd->b_size == 0) { 1685 - mpd->b_blocknr = logical; 1686 - mpd->b_size = 1 << blkbits; 1687 - mpd->b_state = b_state & BH_FLAGS; 1688 - return; 1689 - } 1690 - 1691 - next = mpd->b_blocknr + nrblocks; 1692 - /* 1693 - * Can we merge the block to our big extent? 1694 - */ 1695 - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 1696 - mpd->b_size += 1 << blkbits; 1697 - return; 1698 - } 1699 - 1700 - flush_it: 1701 - /* 1702 - * We couldn't merge the block to our extent, so we 1703 - * need to flush current extent and start new one 1704 - */ 1705 - mpage_da_map_and_submit(mpd); 1706 1642 return; 1707 1643 } 1708 1644 ··· 1528 1882 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," 1529 1883 "logical block %lu\n", inode->i_ino, map->m_len, 1530 1884 (unsigned long) map->m_lblk); 1885 + 1886 + ext4_es_lru_add(inode); 1531 1887 1532 1888 /* Lookup extent status tree firstly */ 1533 1889 if (ext4_es_lookup_extent(inode, iblock, &es)) { ··· 1804 2156 * lock so we have to do some magic. 1805 2157 * 1806 2158 * This function can get called via... 
1807 - * - ext4_da_writepages after taking page lock (have journal handle) 2159 + * - ext4_writepages after taking page lock (have journal handle) 1808 2160 * - journal_submit_inode_data_buffers (no journal handle) 1809 2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1810 2162 * - grab_page_cache when doing write_begin (have journal handle) ··· 1882 2234 */ 1883 2235 return __ext4_journalled_writepage(page, len); 1884 2236 1885 - memset(&io_submit, 0, sizeof(io_submit)); 2237 + ext4_io_submit_init(&io_submit, wbc); 2238 + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2239 + if (!io_submit.io_end) { 2240 + redirty_page_for_writepage(wbc, page); 2241 + unlock_page(page); 2242 + return -ENOMEM; 2243 + } 1886 2244 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1887 2245 ext4_io_submit(&io_submit); 2246 + /* Drop io_end reference we got from init */ 2247 + ext4_put_io_end_defer(io_submit.io_end); 1888 2248 return ret; 1889 2249 } 1890 2250 2251 + #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) 2252 + 1891 2253 /* 1892 - * This is called via ext4_da_writepages() to 1893 - * calculate the total number of credits to reserve to fit 1894 - * a single extent allocation into a single transaction, 1895 - * ext4_da_writpeages() will loop calling this before 1896 - * the block allocation. 2254 + * mballoc gives us at most this number of blocks... 2255 + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 2256 + * The rest of mballoc seems to handle chunks up to full group size. 1897 2257 */ 2258 + #define MAX_WRITEPAGES_EXTENT_LEN 2048 1898 2259 1899 - static int ext4_da_writepages_trans_blocks(struct inode *inode) 2260 + /* 2261 + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map 2262 + * 2263 + * @mpd - extent of blocks 2264 + * @lblk - logical number of the block in the file 2265 + * @b_state - b_state of the buffer head added 2266 + * 2267 + * the function is used to collect contig. 
blocks in same state 2268 + */ 2269 + static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 2270 + unsigned long b_state) 1900 2271 { 1901 - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2272 + struct ext4_map_blocks *map = &mpd->map; 1902 2273 1903 - /* 1904 - * With non-extent format the journal credit needed to 1905 - * insert nrblocks contiguous block is dependent on 1906 - * number of contiguous block. So we will limit 1907 - * number of contiguous block to a sane value 1908 - */ 1909 - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 1910 - (max_blocks > EXT4_MAX_TRANS_DATA)) 1911 - max_blocks = EXT4_MAX_TRANS_DATA; 2274 + /* Don't go larger than mballoc is willing to allocate */ 2275 + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 2276 + return 0; 1912 2277 1913 - return ext4_chunk_trans_blocks(inode, max_blocks); 2278 + /* First block in the extent? */ 2279 + if (map->m_len == 0) { 2280 + map->m_lblk = lblk; 2281 + map->m_len = 1; 2282 + map->m_flags = b_state & BH_FLAGS; 2283 + return 1; 2284 + } 2285 + 2286 + /* Can we merge the block to our big extent? */ 2287 + if (lblk == map->m_lblk + map->m_len && 2288 + (b_state & BH_FLAGS) == map->m_flags) { 2289 + map->m_len++; 2290 + return 1; 2291 + } 2292 + return 0; 2293 + } 2294 + 2295 + static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, 2296 + struct buffer_head *head, 2297 + struct buffer_head *bh, 2298 + ext4_lblk_t lblk) 2299 + { 2300 + struct inode *inode = mpd->inode; 2301 + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) 2302 + >> inode->i_blkbits; 2303 + 2304 + do { 2305 + BUG_ON(buffer_locked(bh)); 2306 + 2307 + if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2308 + (!buffer_delay(bh) && !buffer_unwritten(bh)) || 2309 + lblk >= blocks) { 2310 + /* Found extent to map? 
*/ 2311 + if (mpd->map.m_len) 2312 + return false; 2313 + if (lblk >= blocks) 2314 + return true; 2315 + continue; 2316 + } 2317 + if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) 2318 + return false; 2319 + } while (lblk++, (bh = bh->b_this_page) != head); 2320 + return true; 2321 + } 2322 + 2323 + static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2324 + { 2325 + int len; 2326 + loff_t size = i_size_read(mpd->inode); 2327 + int err; 2328 + 2329 + BUG_ON(page->index != mpd->first_page); 2330 + if (page->index == size >> PAGE_CACHE_SHIFT) 2331 + len = size & ~PAGE_CACHE_MASK; 2332 + else 2333 + len = PAGE_CACHE_SIZE; 2334 + clear_page_dirty_for_io(page); 2335 + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); 2336 + if (!err) 2337 + mpd->wbc->nr_to_write--; 2338 + mpd->first_page++; 2339 + 2340 + return err; 1914 2341 } 1915 2342 1916 2343 /* 1917 - * write_cache_pages_da - walk the list of dirty pages of the given 1918 - * address space and accumulate pages that need writing, and call 1919 - * mpage_da_map_and_submit to map a single contiguous memory region 1920 - * and then write them. 2344 + * mpage_map_buffers - update buffers corresponding to changed extent and 2345 + * submit fully mapped pages for IO 2346 + * 2347 + * @mpd - description of extent to map, on return next extent to map 2348 + * 2349 + * Scan buffers corresponding to changed extent (we expect corresponding pages 2350 + * to be already locked) and update buffer state according to new extent state. 2351 + * We map delalloc buffers to their physical location, clear unwritten bits, 2352 + * and mark buffers as uninit when we perform writes to uninitialized extents 2353 + * and do extent conversion after IO is finished. If the last page is not fully 2354 + * mapped, we update @map to the next extent in the last page that needs 2355 + * mapping. Otherwise we submit the page for IO. 
1921 2356 */ 1922 - static int write_cache_pages_da(handle_t *handle, 1923 - struct address_space *mapping, 1924 - struct writeback_control *wbc, 1925 - struct mpage_da_data *mpd, 1926 - pgoff_t *done_index) 2357 + static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) 1927 2358 { 1928 - struct buffer_head *bh, *head; 1929 - struct inode *inode = mapping->host; 1930 - struct pagevec pvec; 1931 - unsigned int nr_pages; 1932 - sector_t logical; 1933 - pgoff_t index, end; 1934 - long nr_to_write = wbc->nr_to_write; 1935 - int i, tag, ret = 0; 2359 + struct pagevec pvec; 2360 + int nr_pages, i; 2361 + struct inode *inode = mpd->inode; 2362 + struct buffer_head *head, *bh; 2363 + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; 2364 + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) 2365 + >> inode->i_blkbits; 2366 + pgoff_t start, end; 2367 + ext4_lblk_t lblk; 2368 + sector_t pblock; 2369 + int err; 1936 2370 1937 - memset(mpd, 0, sizeof(struct mpage_da_data)); 1938 - mpd->wbc = wbc; 1939 - mpd->inode = inode; 2371 + start = mpd->map.m_lblk >> bpp_bits; 2372 + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; 2373 + lblk = start << bpp_bits; 2374 + pblock = mpd->map.m_pblk; 2375 + 1940 2376 pagevec_init(&pvec, 0); 1941 - index = wbc->range_start >> PAGE_CACHE_SHIFT; 1942 - end = wbc->range_end >> PAGE_CACHE_SHIFT; 2377 + while (start <= end) { 2378 + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, 2379 + PAGEVEC_SIZE); 2380 + if (nr_pages == 0) 2381 + break; 2382 + for (i = 0; i < nr_pages; i++) { 2383 + struct page *page = pvec.pages[i]; 1943 2384 1944 - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2385 + if (page->index > end) 2386 + break; 2387 + /* Upto 'end' pages must be contiguous */ 2388 + BUG_ON(page->index != start); 2389 + bh = head = page_buffers(page); 2390 + do { 2391 + if (lblk < mpd->map.m_lblk) 2392 + continue; 2393 + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { 2394 + /* 2395 + 
* Buffer after end of mapped extent. 2396 + * Find next buffer in the page to map. 2397 + */ 2398 + mpd->map.m_len = 0; 2399 + mpd->map.m_flags = 0; 2400 + add_page_bufs_to_extent(mpd, head, bh, 2401 + lblk); 2402 + pagevec_release(&pvec); 2403 + return 0; 2404 + } 2405 + if (buffer_delay(bh)) { 2406 + clear_buffer_delay(bh); 2407 + bh->b_blocknr = pblock++; 2408 + } 2409 + clear_buffer_unwritten(bh); 2410 + } while (++lblk < blocks && 2411 + (bh = bh->b_this_page) != head); 2412 + 2413 + /* 2414 + * FIXME: This is going to break if dioread_nolock 2415 + * supports blocksize < pagesize as we will try to 2416 + * convert potentially unmapped parts of inode. 2417 + */ 2418 + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; 2419 + /* Page fully mapped - let IO run! */ 2420 + err = mpage_submit_page(mpd, page); 2421 + if (err < 0) { 2422 + pagevec_release(&pvec); 2423 + return err; 2424 + } 2425 + start++; 2426 + } 2427 + pagevec_release(&pvec); 2428 + } 2429 + /* Extent fully mapped and matches with page boundary. We are done. */ 2430 + mpd->map.m_len = 0; 2431 + mpd->map.m_flags = 0; 2432 + return 0; 2433 + } 2434 + 2435 + static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) 2436 + { 2437 + struct inode *inode = mpd->inode; 2438 + struct ext4_map_blocks *map = &mpd->map; 2439 + int get_blocks_flags; 2440 + int err; 2441 + 2442 + trace_ext4_da_write_pages_extent(inode, map); 2443 + /* 2444 + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or 2445 + * to convert an uninitialized extent to be initialized (in the case 2446 + * where we have written into one or more preallocated blocks). It is 2447 + * possible that we're going to need more metadata blocks than 2448 + * previously reserved. However we must not fail because we're in 2449 + * writeback and there is nothing we can do about it so it might result 2450 + * in data loss. So use reserved blocks to allocate metadata if 2451 + * possible. 
2452 + * 2453 + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks 2454 + * in question are delalloc blocks. This affects functions in many 2455 + * different parts of the allocation call path. This flag exists 2456 + * primarily because we don't want to change *many* call functions, so 2457 + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag 2458 + * once the inode's allocation semaphore is taken. 2459 + */ 2460 + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2461 + EXT4_GET_BLOCKS_METADATA_NOFAIL; 2462 + if (ext4_should_dioread_nolock(inode)) 2463 + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2464 + if (map->m_flags & (1 << BH_Delay)) 2465 + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2466 + 2467 + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); 2468 + if (err < 0) 2469 + return err; 2470 + if (map->m_flags & EXT4_MAP_UNINIT) { 2471 + if (!mpd->io_submit.io_end->handle && 2472 + ext4_handle_valid(handle)) { 2473 + mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2474 + handle->h_rsv_handle = NULL; 2475 + } 2476 + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); 2477 + } 2478 + 2479 + BUG_ON(map->m_len == 0); 2480 + if (map->m_flags & EXT4_MAP_NEW) { 2481 + struct block_device *bdev = inode->i_sb->s_bdev; 2482 + int i; 2483 + 2484 + for (i = 0; i < map->m_len; i++) 2485 + unmap_underlying_metadata(bdev, map->m_pblk + i); 2486 + } 2487 + return 0; 2488 + } 2489 + 2490 + /* 2491 + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length 2492 + * mpd->len and submit pages underlying it for IO 2493 + * 2494 + * @handle - handle for journal operations 2495 + * @mpd - extent to map 2496 + * 2497 + * The function maps extent starting at mpd->lblk of length mpd->len. If it is 2498 + * delayed, blocks are allocated, if it is unwritten, we may need to convert 2499 + * them to initialized or split the described range from larger unwritten 2500 + * extent. 
Note that we need not map all the described range since allocation 2501 + * can return fewer blocks or the range is covered by more unwritten extents. We 2502 + * cannot map more because we are limited by reserved transaction credits. On 2503 + * the other hand we always make sure that the last touched page is fully 2504 + * mapped so that it can be written out (and thus forward progress is 2505 + * guaranteed). After mapping we submit all mapped pages for IO. 2506 + */ 2507 + static int mpage_map_and_submit_extent(handle_t *handle, 2508 + struct mpage_da_data *mpd, 2509 + bool *give_up_on_write) 2510 + { 2511 + struct inode *inode = mpd->inode; 2512 + struct ext4_map_blocks *map = &mpd->map; 2513 + int err; 2514 + loff_t disksize; 2515 + 2516 + mpd->io_submit.io_end->offset = 2517 + ((loff_t)map->m_lblk) << inode->i_blkbits; 2518 + while (map->m_len) { 2519 + err = mpage_map_one_extent(handle, mpd); 2520 + if (err < 0) { 2521 + struct super_block *sb = inode->i_sb; 2522 + 2523 + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 2524 + goto invalidate_dirty_pages; 2525 + /* 2526 + * Let the upper layers retry transient errors. 2527 + * In the case of ENOSPC, if ext4_count_free_blocks() 2528 + * is non-zero, a commit should free up blocks. 2529 + */ 2530 + if ((err == -ENOMEM) || 2531 + (err == -ENOSPC && ext4_count_free_clusters(sb))) 2532 + return err; 2533 + ext4_msg(sb, KERN_CRIT, 2534 + "Delayed block allocation failed for " 2535 + "inode %lu at logical offset %llu with" 2536 + " max blocks %u with error %d", 2537 + inode->i_ino, 2538 + (unsigned long long)map->m_lblk, 2539 + (unsigned)map->m_len, -err); 2540 + ext4_msg(sb, KERN_CRIT, 2541 + "This should not happen!! 
Data will " 2542 + "be lost\n"); 2543 + if (err == -ENOSPC) 2544 + ext4_print_free_blocks(inode); 2545 + invalidate_dirty_pages: 2546 + *give_up_on_write = true; 2547 + return err; 2548 + } 2549 + /* 2550 + * Update buffer state, submit mapped pages, and get us new 2551 + * extent to map 2552 + */ 2553 + err = mpage_map_and_submit_buffers(mpd); 2554 + if (err < 0) 2555 + return err; 2556 + } 2557 + 2558 + /* Update on-disk size after IO is submitted */ 2559 + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; 2560 + if (disksize > i_size_read(inode)) 2561 + disksize = i_size_read(inode); 2562 + if (disksize > EXT4_I(inode)->i_disksize) { 2563 + int err2; 2564 + 2565 + ext4_update_i_disksize(inode, disksize); 2566 + err2 = ext4_mark_inode_dirty(handle, inode); 2567 + if (err2) 2568 + ext4_error(inode->i_sb, 2569 + "Failed to mark inode %lu dirty", 2570 + inode->i_ino); 2571 + if (!err) 2572 + err = err2; 2573 + } 2574 + return err; 2575 + } 2576 + 2577 + /* 2578 + * Calculate the total number of credits to reserve for one writepages 2579 + * iteration. This is called from ext4_writepages(). We map an extent of 2580 + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2581 + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2582 + * bpp - 1 blocks in bpp different extents. 2583 + */ 2584 + static int ext4_da_writepages_trans_blocks(struct inode *inode) 2585 + { 2586 + int bpp = ext4_journal_blocks_per_page(inode); 2587 + 2588 + return ext4_meta_trans_blocks(inode, 2589 + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2590 + } 2591 + 2592 + /* 2593 + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages 2594 + * and underlying extent to map 2595 + * 2596 + * @mpd - where to look for pages 2597 + * 2598 + * Walk dirty pages in the mapping. If they are fully mapped, submit them for 2599 + * IO immediately. 
When we find a page which isn't mapped we start accumulating 2600 + * an extent of buffers underlying these pages that needs mapping (formed by 2601 + * either delayed or unwritten buffers). We also lock the pages containing 2602 + * these buffers. The extent found is returned in @mpd structure (starting at 2603 + * mpd->lblk with length mpd->len blocks). 2604 + * 2605 + * Note that this function can attach bios to one io_end structure which are 2606 + * neither logically nor physically contiguous. Although it may seem like an 2607 + * unnecessary complication, it is actually inevitable in blocksize < pagesize 2608 + * case as we need to track IO to all buffers underlying a page in one io_end. 2609 + */ 2610 + static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) 2611 + { 2612 + struct address_space *mapping = mpd->inode->i_mapping; 2613 + struct pagevec pvec; 2614 + unsigned int nr_pages; 2615 + pgoff_t index = mpd->first_page; 2616 + pgoff_t end = mpd->last_page; 2617 + int tag; 2618 + int i, err = 0; 2619 + int blkbits = mpd->inode->i_blkbits; 2620 + ext4_lblk_t lblk; 2621 + struct buffer_head *head; 2622 + 2623 + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) 1945 2624 tag = PAGECACHE_TAG_TOWRITE; 1946 2625 else 1947 2626 tag = PAGECACHE_TAG_DIRTY; 1948 2627 1949 - *done_index = index; 2628 + pagevec_init(&pvec, 0); 2629 + mpd->map.m_len = 0; 2630 + mpd->next_page = index; 1950 2631 while (index <= end) { 1951 2632 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 1952 2633 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1953 2634 if (nr_pages == 0) 1954 - return 0; 2635 + goto out; 1955 2636 1956 2637 for (i = 0; i < nr_pages; i++) { 1957 2638 struct page *page = pvec.pages[i]; ··· 2295 2318 if (page->index > end) 2296 2319 goto out; 2297 2320 2298 - *done_index = page->index + 1; 2299 - 2300 - /* 2301 - * If we can't merge this page, and we have 2302 - * accumulated an contiguous region, write it 2303 - */ 2304 - if 
((mpd->next_page != page->index) && 2305 - (mpd->next_page != mpd->first_page)) { 2306 - mpage_da_map_and_submit(mpd); 2307 - goto ret_extent_tail; 2308 - } 2321 + /* If we can't merge this page, we are done. */ 2322 + if (mpd->map.m_len > 0 && mpd->next_page != page->index) 2323 + goto out; 2309 2324 2310 2325 lock_page(page); 2311 - 2312 2326 /* 2313 - * If the page is no longer dirty, or its 2314 - * mapping no longer corresponds to inode we 2315 - * are writing (which means it has been 2316 - * truncated or invalidated), or the page is 2317 - * already under writeback and we are not 2318 - * doing a data integrity writeback, skip the page 2327 + * If the page is no longer dirty, or its mapping no 2328 + * longer corresponds to inode we are writing (which 2329 + * means it has been truncated or invalidated), or the 2330 + * page is already under writeback and we are not doing 2331 + * a data integrity writeback, skip the page 2319 2332 */ 2320 2333 if (!PageDirty(page) || 2321 2334 (PageWriteback(page) && 2322 - (wbc->sync_mode == WB_SYNC_NONE)) || 2335 + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || 2323 2336 unlikely(page->mapping != mapping)) { 2324 2337 unlock_page(page); 2325 2338 continue; ··· 2318 2351 wait_on_page_writeback(page); 2319 2352 BUG_ON(PageWriteback(page)); 2320 2353 2321 - /* 2322 - * If we have inline data and arrive here, it means that 2323 - * we will soon create the block for the 1st page, so 2324 - * we'd better clear the inline data here. 
2325 - */ 2326 - if (ext4_has_inline_data(inode)) { 2327 - BUG_ON(ext4_test_inode_state(inode, 2328 - EXT4_STATE_MAY_INLINE_DATA)); 2329 - ext4_destroy_inline_data(handle, inode); 2330 - } 2331 - 2332 - if (mpd->next_page != page->index) 2354 + if (mpd->map.m_len == 0) 2333 2355 mpd->first_page = page->index; 2334 2356 mpd->next_page = page->index + 1; 2335 - logical = (sector_t) page->index << 2336 - (PAGE_CACHE_SHIFT - inode->i_blkbits); 2337 - 2338 2357 /* Add all dirty buffers to mpd */ 2358 + lblk = ((ext4_lblk_t)page->index) << 2359 + (PAGE_CACHE_SHIFT - blkbits); 2339 2360 head = page_buffers(page); 2340 - bh = head; 2341 - do { 2342 - BUG_ON(buffer_locked(bh)); 2343 - /* 2344 - * We need to try to allocate unmapped blocks 2345 - * in the same page. Otherwise we won't make 2346 - * progress with the page in ext4_writepage 2347 - */ 2348 - if (ext4_bh_delay_or_unwritten(NULL, bh)) { 2349 - mpage_add_bh_to_extent(mpd, logical, 2350 - bh->b_state); 2351 - if (mpd->io_done) 2352 - goto ret_extent_tail; 2353 - } else if (buffer_dirty(bh) && 2354 - buffer_mapped(bh)) { 2355 - /* 2356 - * mapped dirty buffer. We need to 2357 - * update the b_state because we look 2358 - * at b_state in mpage_da_map_blocks. 2359 - * We don't update b_size because if we 2360 - * find an unmapped buffer_head later 2361 - * we need to use the b_state flag of 2362 - * that buffer_head. 2363 - */ 2364 - if (mpd->b_size == 0) 2365 - mpd->b_state = 2366 - bh->b_state & BH_FLAGS; 2367 - } 2368 - logical++; 2369 - } while ((bh = bh->b_this_page) != head); 2370 - 2371 - if (nr_to_write > 0) { 2372 - nr_to_write--; 2373 - if (nr_to_write == 0 && 2374 - wbc->sync_mode == WB_SYNC_NONE) 2375 - /* 2376 - * We stop writing back only if we are 2377 - * not doing integrity sync. 
In case of 2378 - * integrity sync we have to keep going 2379 - * because someone may be concurrently 2380 - * dirtying pages, and we might have 2381 - * synced a lot of newly appeared dirty 2382 - * pages, but have not synced all of the 2383 - * old dirty pages. 2384 - */ 2361 + if (!add_page_bufs_to_extent(mpd, head, head, lblk)) 2362 + goto out; 2363 + /* So far everything mapped? Submit the page for IO. */ 2364 + if (mpd->map.m_len == 0) { 2365 + err = mpage_submit_page(mpd, page); 2366 + if (err < 0) 2385 2367 goto out; 2386 2368 } 2369 + 2370 + /* 2371 + * Accumulated enough dirty pages? This doesn't apply 2372 + * to WB_SYNC_ALL mode. For integrity sync we have to 2373 + * keep going because someone may be concurrently 2374 + * dirtying pages, and we might have synced a lot of 2375 + * newly appeared dirty pages, but have not synced all 2376 + * of the old dirty pages. 2377 + */ 2378 + if (mpd->wbc->sync_mode == WB_SYNC_NONE && 2379 + mpd->next_page - mpd->first_page >= 2380 + mpd->wbc->nr_to_write) 2381 + goto out; 2387 2382 } 2388 2383 pagevec_release(&pvec); 2389 2384 cond_resched(); 2390 2385 } 2391 2386 return 0; 2392 - ret_extent_tail: 2393 - ret = MPAGE_DA_EXTENT_TAIL; 2394 2387 out: 2395 2388 pagevec_release(&pvec); 2396 - cond_resched(); 2389 + return err; 2390 + } 2391 + 2392 + static int __writepage(struct page *page, struct writeback_control *wbc, 2393 + void *data) 2394 + { 2395 + struct address_space *mapping = data; 2396 + int ret = ext4_writepage(page, wbc); 2397 + mapping_set_error(mapping, ret); 2397 2398 return ret; 2398 2399 } 2399 2400 2400 - 2401 - static int ext4_da_writepages(struct address_space *mapping, 2402 - struct writeback_control *wbc) 2401 + static int ext4_writepages(struct address_space *mapping, 2402 + struct writeback_control *wbc) 2403 2403 { 2404 - pgoff_t index; 2404 + pgoff_t writeback_index = 0; 2405 + long nr_to_write = wbc->nr_to_write; 2405 2406 int range_whole = 0; 2407 + int cycled = 1; 2406 2408 handle_t 
*handle = NULL; 2407 2409 struct mpage_da_data mpd; 2408 2410 struct inode *inode = mapping->host; 2409 - int pages_written = 0; 2410 - unsigned int max_pages; 2411 - int range_cyclic, cycled = 1, io_done = 0; 2412 - int needed_blocks, ret = 0; 2413 - long desired_nr_to_write, nr_to_writebump = 0; 2414 - loff_t range_start = wbc->range_start; 2411 + int needed_blocks, rsv_blocks = 0, ret = 0; 2415 2412 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2416 - pgoff_t done_index = 0; 2417 - pgoff_t end; 2413 + bool done; 2418 2414 struct blk_plug plug; 2415 + bool give_up_on_write = false; 2419 2416 2420 - trace_ext4_da_writepages(inode, wbc); 2417 + trace_ext4_writepages(inode, wbc); 2421 2418 2422 2419 /* 2423 2420 * No pages to write? This is mainly a kludge to avoid starting ··· 2391 2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2392 2461 return 0; 2393 2462 2463 + if (ext4_should_journal_data(inode)) { 2464 + struct blk_plug plug; 2465 + int ret; 2466 + 2467 + blk_start_plug(&plug); 2468 + ret = write_cache_pages(mapping, wbc, __writepage, mapping); 2469 + blk_finish_plug(&plug); 2470 + return ret; 2471 + } 2472 + 2394 2473 /* 2395 2474 * If the filesystem has aborted, it is read-only, so return 2396 2475 * right away instead of dumping stack traces later on that 2397 2476 * will obscure the real source of the problem. We test 2398 2477 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2399 2478 * the latter could be true if the filesystem is mounted 2400 - * read-only, and in that case, ext4_da_writepages should 2479 + * read-only, and in that case, ext4_writepages should 2401 2480 * *never* be called, so if that ever happens, we would want 2402 2481 * the stack trace. 
2403 2482 */ 2404 2483 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2405 2484 return -EROFS; 2406 2485 2407 - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2408 - range_whole = 1; 2409 - 2410 - range_cyclic = wbc->range_cyclic; 2411 - if (wbc->range_cyclic) { 2412 - index = mapping->writeback_index; 2413 - if (index) 2414 - cycled = 0; 2415 - wbc->range_start = index << PAGE_CACHE_SHIFT; 2416 - wbc->range_end = LLONG_MAX; 2417 - wbc->range_cyclic = 0; 2418 - end = -1; 2419 - } else { 2420 - index = wbc->range_start >> PAGE_CACHE_SHIFT; 2421 - end = wbc->range_end >> PAGE_CACHE_SHIFT; 2486 + if (ext4_should_dioread_nolock(inode)) { 2487 + /* 2488 + * We may need to convert upto one extent per block in 2489 + * the page and we may dirty the inode. 2490 + */ 2491 + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); 2422 2492 } 2423 2493 2424 2494 /* 2425 - * This works around two forms of stupidity. The first is in 2426 - * the writeback code, which caps the maximum number of pages 2427 - * written to be 1024 pages. This is wrong on multiple 2428 - * levels; different architectues have a different page size, 2429 - * which changes the maximum amount of data which gets 2430 - * written. Secondly, 4 megabytes is way too small. XFS 2431 - * forces this value to be 16 megabytes by multiplying 2432 - * nr_to_write parameter by four, and then relies on its 2433 - * allocator to allocate larger extents to make them 2434 - * contiguous. Unfortunately this brings us to the second 2435 - * stupidity, which is that ext4's mballoc code only allocates 2436 - * at most 2048 blocks. So we force contiguous writes up to 2437 - * the number of dirty blocks in the inode, or 2438 - * sbi->max_writeback_mb_bump whichever is smaller. 2495 + * If we have inline data and arrive here, it means that 2496 + * we will soon create the block for the 1st page, so 2497 + * we'd better clear the inline data here. 
2439 2498 */ 2440 - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2441 - if (!range_cyclic && range_whole) { 2442 - if (wbc->nr_to_write == LONG_MAX) 2443 - desired_nr_to_write = wbc->nr_to_write; 2444 - else 2445 - desired_nr_to_write = wbc->nr_to_write * 8; 2446 - } else 2447 - desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2448 - max_pages); 2449 - if (desired_nr_to_write > max_pages) 2450 - desired_nr_to_write = max_pages; 2451 - 2452 - if (wbc->nr_to_write < desired_nr_to_write) { 2453 - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; 2454 - wbc->nr_to_write = desired_nr_to_write; 2499 + if (ext4_has_inline_data(inode)) { 2500 + /* Just inode will be modified... */ 2501 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 2502 + if (IS_ERR(handle)) { 2503 + ret = PTR_ERR(handle); 2504 + goto out_writepages; 2505 + } 2506 + BUG_ON(ext4_test_inode_state(inode, 2507 + EXT4_STATE_MAY_INLINE_DATA)); 2508 + ext4_destroy_inline_data(handle, inode); 2509 + ext4_journal_stop(handle); 2455 2510 } 2456 2511 2512 + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2513 + range_whole = 1; 2514 + 2515 + if (wbc->range_cyclic) { 2516 + writeback_index = mapping->writeback_index; 2517 + if (writeback_index) 2518 + cycled = 0; 2519 + mpd.first_page = writeback_index; 2520 + mpd.last_page = -1; 2521 + } else { 2522 + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; 2523 + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; 2524 + } 2525 + 2526 + mpd.inode = inode; 2527 + mpd.wbc = wbc; 2528 + ext4_io_submit_init(&mpd.io_submit, wbc); 2457 2529 retry: 2458 2530 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2459 - tag_pages_for_writeback(mapping, index, end); 2460 - 2531 + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); 2532 + done = false; 2461 2533 blk_start_plug(&plug); 2462 - while (!ret && wbc->nr_to_write > 0) { 2534 + while (!done && mpd.first_page <= mpd.last_page) { 2535 + /* For 
each extent of pages we use new io_end */ 2536 + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2537 + if (!mpd.io_submit.io_end) { 2538 + ret = -ENOMEM; 2539 + break; 2540 + } 2463 2541 2464 2542 /* 2465 - * we insert one extent at a time. So we need 2466 - * credit needed for single extent allocation. 2467 - * journalled mode is currently not supported 2468 - * by delalloc 2543 + * We have two constraints: We find one extent to map and we 2544 + * must always write out whole page (makes a difference when 2545 + * blocksize < pagesize) so that we don't block on IO when we 2546 + * try to write out the rest of the page. Journalled mode is 2547 + * not supported by delalloc. 2469 2548 */ 2470 2549 BUG_ON(ext4_should_journal_data(inode)); 2471 2550 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2472 2551 2473 - /* start a new transaction*/ 2474 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2475 - needed_blocks); 2552 + /* start a new transaction */ 2553 + handle = ext4_journal_start_with_reserve(inode, 2554 + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); 2476 2555 if (IS_ERR(handle)) { 2477 2556 ret = PTR_ERR(handle); 2478 2557 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2479 2558 "%ld pages, ino %lu; err %d", __func__, 2480 2559 wbc->nr_to_write, inode->i_ino, ret); 2481 - blk_finish_plug(&plug); 2482 - goto out_writepages; 2560 + /* Release allocated io_end */ 2561 + ext4_put_io_end(mpd.io_submit.io_end); 2562 + break; 2483 2563 } 2484 2564 2485 - /* 2486 - * Now call write_cache_pages_da() to find the next 2487 - * contiguous region of logical blocks that need 2488 - * blocks to be allocated by ext4 and submit them. 2489 - */ 2490 - ret = write_cache_pages_da(handle, mapping, 2491 - wbc, &mpd, &done_index); 2492 - /* 2493 - * If we have a contiguous extent of pages and we 2494 - * haven't done the I/O yet, map the blocks and submit 2495 - * them for I/O. 
2496 - */ 2497 - if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2498 - mpage_da_map_and_submit(&mpd); 2499 - ret = MPAGE_DA_EXTENT_TAIL; 2565 + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); 2566 + ret = mpage_prepare_extent_to_map(&mpd); 2567 + if (!ret) { 2568 + if (mpd.map.m_len) 2569 + ret = mpage_map_and_submit_extent(handle, &mpd, 2570 + &give_up_on_write); 2571 + else { 2572 + /* 2573 + * We scanned the whole range (or exhausted 2574 + * nr_to_write), submitted what was mapped and 2575 + * didn't find anything needing mapping. We are 2576 + * done. 2577 + */ 2578 + done = true; 2579 + } 2500 2580 } 2501 - trace_ext4_da_write_pages(inode, &mpd); 2502 - wbc->nr_to_write -= mpd.pages_written; 2503 - 2504 2581 ext4_journal_stop(handle); 2582 + /* Submit prepared bio */ 2583 + ext4_io_submit(&mpd.io_submit); 2584 + /* Unlock pages we didn't use */ 2585 + mpage_release_unused_pages(&mpd, give_up_on_write); 2586 + /* Drop our io_end reference we got from init */ 2587 + ext4_put_io_end(mpd.io_submit.io_end); 2505 2588 2506 - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2507 - /* commit the transaction which would 2589 + if (ret == -ENOSPC && sbi->s_journal) { 2590 + /* 2591 + * Commit the transaction which would 2508 2592 * free blocks released in the transaction 2509 2593 * and try again 2510 2594 */ 2511 2595 jbd2_journal_force_commit_nested(sbi->s_journal); 2512 2596 ret = 0; 2513 - } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2514 - /* 2515 - * Got one extent now try with rest of the pages. 2516 - * If mpd.retval is set -EIO, journal is aborted. 2517 - * So we don't need to write any more. 2518 - */ 2519 - pages_written += mpd.pages_written; 2520 - ret = mpd.retval; 2521 - io_done = 1; 2522 - } else if (wbc->nr_to_write) 2523 - /* 2524 - * There is no more writeout needed 2525 - * or we requested for a noblocking writeout 2526 - * and we found the device congested 2527 - */ 2597 + continue; 2598 + } 2599 + /* Fatal error - ENOMEM, EIO... 
*/ 2600 + if (ret) 2528 2601 break; 2529 2602 } 2530 2603 blk_finish_plug(&plug); 2531 - if (!io_done && !cycled) { 2604 + if (!ret && !cycled) { 2532 2605 cycled = 1; 2533 - index = 0; 2534 - wbc->range_start = index << PAGE_CACHE_SHIFT; 2535 - wbc->range_end = mapping->writeback_index - 1; 2606 + mpd.last_page = writeback_index - 1; 2607 + mpd.first_page = 0; 2536 2608 goto retry; 2537 2609 } 2538 2610 2539 2611 /* Update index */ 2540 - wbc->range_cyclic = range_cyclic; 2541 2612 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2542 2613 /* 2543 - * set the writeback_index so that range_cyclic 2614 + * Set the writeback_index so that range_cyclic 2544 2615 * mode will write it back later 2545 2616 */ 2546 - mapping->writeback_index = done_index; 2617 + mapping->writeback_index = mpd.first_page; 2547 2618 2548 2619 out_writepages: 2549 - wbc->nr_to_write -= nr_to_writebump; 2550 - wbc->range_start = range_start; 2551 - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2620 + trace_ext4_writepages_result(inode, wbc, ret, 2621 + nr_to_write - wbc->nr_to_write); 2552 2622 return ret; 2553 2623 } 2554 2624 ··· 2761 2829 return ret ? ret : copied; 2762 2830 } 2763 2831 2764 - static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2832 + static void ext4_da_invalidatepage(struct page *page, unsigned int offset, 2833 + unsigned int length) 2765 2834 { 2766 2835 /* 2767 2836 * Drop reserved blocks ··· 2771 2838 if (!page_has_buffers(page)) 2772 2839 goto out; 2773 2840 2774 - ext4_da_page_release_reservation(page, offset); 2841 + ext4_da_page_release_reservation(page, offset, length); 2775 2842 2776 2843 out: 2777 - ext4_invalidatepage(page, offset); 2844 + ext4_invalidatepage(page, offset, length); 2778 2845 2779 2846 return; 2780 2847 } ··· 2797 2864 * laptop_mode, not even desirable). 
However, to do otherwise 2798 2865 * would require replicating code paths in: 2799 2866 * 2800 - * ext4_da_writepages() -> 2867 + * ext4_writepages() -> 2801 2868 * write_cache_pages() ---> (via passed in callback function) 2802 2869 * __mpage_da_writepage() --> 2803 2870 * mpage_add_bh_to_extent() ··· 2922 2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2923 2990 } 2924 2991 2925 - static void ext4_invalidatepage(struct page *page, unsigned long offset) 2992 + static void ext4_invalidatepage(struct page *page, unsigned int offset, 2993 + unsigned int length) 2926 2994 { 2927 - trace_ext4_invalidatepage(page, offset); 2995 + trace_ext4_invalidatepage(page, offset, length); 2928 2996 2929 2997 /* No journalling happens on data buffers when this function is used */ 2930 2998 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2931 2999 2932 - block_invalidatepage(page, offset); 3000 + block_invalidatepage(page, offset, length); 2933 3001 } 2934 3002 2935 3003 static int __ext4_journalled_invalidatepage(struct page *page, 2936 - unsigned long offset) 3004 + unsigned int offset, 3005 + unsigned int length) 2937 3006 { 2938 3007 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2939 3008 2940 - trace_ext4_journalled_invalidatepage(page, offset); 3009 + trace_ext4_journalled_invalidatepage(page, offset, length); 2941 3010 2942 3011 /* 2943 3012 * If it's a full truncate we just forget about the pending dirtying 2944 3013 */ 2945 - if (offset == 0) 3014 + if (offset == 0 && length == PAGE_CACHE_SIZE) 2946 3015 ClearPageChecked(page); 2947 3016 2948 - return jbd2_journal_invalidatepage(journal, page, offset); 3017 + return jbd2_journal_invalidatepage(journal, page, offset, length); 2949 3018 } 2950 3019 2951 3020 /* Wrapper for aops... 
*/ 2952 3021 static void ext4_journalled_invalidatepage(struct page *page, 2953 - unsigned long offset) 3022 + unsigned int offset, 3023 + unsigned int length) 2954 3024 { 2955 - WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 3025 + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); 2956 3026 } 2957 3027 2958 3028 static int ext4_releasepage(struct page *page, gfp_t wait) ··· 3003 3067 struct inode *inode = file_inode(iocb->ki_filp); 3004 3068 ext4_io_end_t *io_end = iocb->private; 3005 3069 3006 - /* if not async direct IO or dio with 0 bytes write, just return */ 3007 - if (!io_end || !size) 3008 - goto out; 3070 + /* if not async direct IO just return */ 3071 + if (!io_end) { 3072 + inode_dio_done(inode); 3073 + if (is_async) 3074 + aio_complete(iocb, ret, 0); 3075 + return; 3076 + } 3009 3077 3010 3078 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3011 3079 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", ··· 3017 3077 size); 3018 3078 3019 3079 iocb->private = NULL; 3020 - 3021 - /* if not aio dio with unwritten extents, just free io and return */ 3022 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3023 - ext4_free_io_end(io_end); 3024 - out: 3025 - inode_dio_done(inode); 3026 - if (is_async) 3027 - aio_complete(iocb, ret, 0); 3028 - return; 3029 - } 3030 - 3031 3080 io_end->offset = offset; 3032 3081 io_end->size = size; 3033 3082 if (is_async) { 3034 3083 io_end->iocb = iocb; 3035 3084 io_end->result = ret; 3036 3085 } 3037 - 3038 - ext4_add_complete_io(io_end); 3086 + ext4_put_io_end_defer(io_end); 3039 3087 } 3040 3088 3041 3089 /* ··· 3057 3129 get_block_t *get_block_func = NULL; 3058 3130 int dio_flags = 0; 3059 3131 loff_t final_size = offset + count; 3132 + ext4_io_end_t *io_end = NULL; 3060 3133 3061 3134 /* Use the old path for reads and writes beyond i_size. 
*/ 3062 3135 if (rw != WRITE || final_size > inode->i_size) ··· 3065 3136 3066 3137 BUG_ON(iocb->private == NULL); 3067 3138 3139 + /* 3140 + * Make all waiters for direct IO properly wait also for extent 3141 + * conversion. This also disallows race between truncate() and 3142 + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 3143 + */ 3144 + if (rw == WRITE) 3145 + atomic_inc(&inode->i_dio_count); 3146 + 3068 3147 /* If we do a overwrite dio, i_mutex locking can be released */ 3069 3148 overwrite = *((int *)iocb->private); 3070 3149 3071 3150 if (overwrite) { 3072 - atomic_inc(&inode->i_dio_count); 3073 3151 down_read(&EXT4_I(inode)->i_data_sem); 3074 3152 mutex_unlock(&inode->i_mutex); 3075 3153 } ··· 3103 3167 iocb->private = NULL; 3104 3168 ext4_inode_aio_set(inode, NULL); 3105 3169 if (!is_sync_kiocb(iocb)) { 3106 - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3170 + io_end = ext4_init_io_end(inode, GFP_NOFS); 3107 3171 if (!io_end) { 3108 3172 ret = -ENOMEM; 3109 3173 goto retake_lock; 3110 3174 } 3111 3175 io_end->flag |= EXT4_IO_END_DIRECT; 3112 - iocb->private = io_end; 3176 + /* 3177 + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() 3178 + */ 3179 + iocb->private = ext4_get_io_end(io_end); 3113 3180 /* 3114 3181 * we save the io structure for current async direct 3115 3182 * IO, so that later ext4_map_blocks() could flag the ··· 3136 3197 NULL, 3137 3198 dio_flags); 3138 3199 3139 - if (iocb->private) 3140 - ext4_inode_aio_set(inode, NULL); 3141 3200 /* 3142 - * The io_end structure takes a reference to the inode, that 3143 - * structure needs to be destroyed and the reference to the 3144 - * inode need to be dropped, when IO is complete, even with 0 3145 - * byte write, or failed. 3146 - * 3147 - * In the successful AIO DIO case, the io_end structure will 3148 - * be destroyed and the reference to the inode will be dropped 3149 - * after the end_io call back function is called. 
3150 - * 3151 - * In the case there is 0 byte write, or error case, since VFS 3152 - * direct IO won't invoke the end_io call back function, we 3153 - * need to free the end_io structure here. 3201 + * Put our reference to io_end. This can free the io_end structure e.g. 3202 + * in sync IO case or in case of error. It can even perform extent 3203 + * conversion if all bios we submitted finished before we got here. 3204 + * Note that in that case iocb->private can be already set to NULL 3205 + * here. 3154 3206 */ 3155 - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3156 - ext4_free_io_end(iocb->private); 3157 - iocb->private = NULL; 3158 - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3207 + if (io_end) { 3208 + ext4_inode_aio_set(inode, NULL); 3209 + ext4_put_io_end(io_end); 3210 + /* 3211 + * When no IO was submitted ext4_end_io_dio() was not 3212 + * called so we have to put iocb's reference. 3213 + */ 3214 + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { 3215 + WARN_ON(iocb->private != io_end); 3216 + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 3217 + WARN_ON(io_end->iocb); 3218 + /* 3219 + * Generic code already did inode_dio_done() so we 3220 + * have to clear EXT4_IO_END_DIRECT to not do it for 3221 + * the second time. 
3222 + */ 3223 + io_end->flag = 0; 3224 + ext4_put_io_end(io_end); 3225 + iocb->private = NULL; 3226 + } 3227 + } 3228 + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3159 3229 EXT4_STATE_DIO_UNWRITTEN)) { 3160 3230 int err; 3161 3231 /* 3162 3232 * for non AIO case, since the IO is already 3163 3233 * completed, we could do the conversion right here 3164 3234 */ 3165 - err = ext4_convert_unwritten_extents(inode, 3235 + err = ext4_convert_unwritten_extents(NULL, inode, 3166 3236 offset, ret); 3167 3237 if (err < 0) 3168 3238 ret = err; ··· 3179 3231 } 3180 3232 3181 3233 retake_lock: 3234 + if (rw == WRITE) 3235 + inode_dio_done(inode); 3182 3236 /* take i_mutex locking again if we do a ovewrite dio */ 3183 3237 if (overwrite) { 3184 - inode_dio_done(inode); 3185 3238 up_read(&EXT4_I(inode)->i_data_sem); 3186 3239 mutex_lock(&inode->i_mutex); 3187 3240 } ··· 3241 3292 .readpage = ext4_readpage, 3242 3293 .readpages = ext4_readpages, 3243 3294 .writepage = ext4_writepage, 3295 + .writepages = ext4_writepages, 3244 3296 .write_begin = ext4_write_begin, 3245 3297 .write_end = ext4_write_end, 3246 3298 .bmap = ext4_bmap, ··· 3257 3307 .readpage = ext4_readpage, 3258 3308 .readpages = ext4_readpages, 3259 3309 .writepage = ext4_writepage, 3310 + .writepages = ext4_writepages, 3260 3311 .write_begin = ext4_write_begin, 3261 3312 .write_end = ext4_journalled_write_end, 3262 3313 .set_page_dirty = ext4_journalled_set_page_dirty, ··· 3273 3322 .readpage = ext4_readpage, 3274 3323 .readpages = ext4_readpages, 3275 3324 .writepage = ext4_writepage, 3276 - .writepages = ext4_da_writepages, 3325 + .writepages = ext4_writepages, 3277 3326 .write_begin = ext4_da_write_begin, 3278 3327 .write_end = ext4_da_write_end, 3279 3328 .bmap = ext4_bmap, ··· 3306 3355 inode->i_mapping->a_ops = &ext4_aops; 3307 3356 } 3308 3357 3358 + /* 3359 + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3360 + * up to the end of the block which corresponds to 
`from'. 3361 + * This required during truncate. We need to physically zero the tail end 3362 + * of that block so it doesn't yield old data if the file is later grown. 3363 + */ 3364 + int ext4_block_truncate_page(handle_t *handle, 3365 + struct address_space *mapping, loff_t from) 3366 + { 3367 + unsigned offset = from & (PAGE_CACHE_SIZE-1); 3368 + unsigned length; 3369 + unsigned blocksize; 3370 + struct inode *inode = mapping->host; 3371 + 3372 + blocksize = inode->i_sb->s_blocksize; 3373 + length = blocksize - (offset & (blocksize - 1)); 3374 + 3375 + return ext4_block_zero_page_range(handle, mapping, from, length); 3376 + } 3309 3377 3310 3378 /* 3311 - * ext4_discard_partial_page_buffers() 3312 - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3313 - * This function finds and locks the page containing the offset 3314 - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3315 - * Calling functions that already have the page locked should call 3316 - * ext4_discard_partial_page_buffers_no_lock directly. 3379 + * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3380 + * starting from file offset 'from'. The range to be zero'd must 3381 + * be contained with in one block. 
If the specified range exceeds 3382 + * the end of the block it will be shortened to end of the block 3383 + * that cooresponds to 'from' 3317 3384 */ 3318 - int ext4_discard_partial_page_buffers(handle_t *handle, 3319 - struct address_space *mapping, loff_t from, 3320 - loff_t length, int flags) 3385 + int ext4_block_zero_page_range(handle_t *handle, 3386 + struct address_space *mapping, loff_t from, loff_t length) 3321 3387 { 3388 + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3389 + unsigned offset = from & (PAGE_CACHE_SIZE-1); 3390 + unsigned blocksize, max, pos; 3391 + ext4_lblk_t iblock; 3322 3392 struct inode *inode = mapping->host; 3393 + struct buffer_head *bh; 3323 3394 struct page *page; 3324 3395 int err = 0; 3325 3396 ··· 3350 3377 if (!page) 3351 3378 return -ENOMEM; 3352 3379 3353 - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, 3354 - from, length, flags); 3355 - 3356 - unlock_page(page); 3357 - page_cache_release(page); 3358 - return err; 3359 - } 3360 - 3361 - /* 3362 - * ext4_discard_partial_page_buffers_no_lock() 3363 - * Zeros a page range of length 'length' starting from offset 'from'. 3364 - * Buffer heads that correspond to the block aligned regions of the 3365 - * zeroed range will be unmapped. Unblock aligned regions 3366 - * will have the corresponding buffer head mapped if needed so that 3367 - * that region of the page can be updated with the partial zero out. 3368 - * 3369 - * This function assumes that the page has already been locked. The 3370 - * The range to be discarded must be contained with in the given page. 3371 - * If the specified range exceeds the end of the page it will be shortened 3372 - * to the end of the page that corresponds to 'from'. This function is 3373 - * appropriate for updating a page and it buffer heads to be unmapped and 3374 - * zeroed for blocks that have been either released, or are going to be 3375 - * released. 
- *
- * handle: The journal handle
- * inode:  The files inode
- * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the beginning of the file)
- *         to begin discarding
- * len:    The length of bytes to discard
- * flags:  Optional flags that may be used:
- *
- *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- *         Only zero the regions of the page whose buffer heads
- *         have already been unmapped.  This flag is appropriate
- *         for updating the contents of a page whose blocks may
- *         have already been released, and we only want to zero
- *         out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
- */
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-        struct inode *inode, struct page *page, loff_t from,
-        loff_t length, int flags)
-{
-        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-        unsigned int offset = from & (PAGE_CACHE_SIZE-1);
-        unsigned int blocksize, max, pos;
-        ext4_lblk_t iblock;
-        struct buffer_head *bh;
-        int err = 0;
-
         blocksize = inode->i_sb->s_blocksize;
-        max = PAGE_CACHE_SIZE - offset;
-
-        if (index != page->index)
-                return -EINVAL;
+        max = blocksize - (offset & (blocksize - 1));

         /*
          * correct length if it does not fall between
-         * 'from' and the end of the page
+         * 'from' and the end of the block
          */
         if (length > max || length < 0)
                 length = max;
···
                 iblock++;
                 pos += blocksize;
         }
-
-        pos = offset;
-        while (pos < offset + length) {
-                unsigned int end_of_block, range_to_discard;
-
-                err = 0;
-
-                /* The length of space left to zero and unmap */
-                range_to_discard = offset + length - pos;
-
-                /* The length of space until the end of the block */
-                end_of_block = blocksize - (pos & (blocksize-1));
-
-                /*
-                 * Do not unmap or zero past end of block
-                 * for this buffer head
-                 */
-                if (range_to_discard > end_of_block)
-                        range_to_discard = end_of_block;
-
-
-                /*
-                 * Skip this buffer head if we are only zeroing unampped
-                 * regions of the page
-                 */
-                if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
-                        buffer_mapped(bh))
-                        goto next;
-
-                /* If the range is block aligned, unmap */
-                if (range_to_discard == blocksize) {
-                        clear_buffer_dirty(bh);
-                        bh->b_bdev = NULL;
-                        clear_buffer_mapped(bh);
-                        clear_buffer_req(bh);
-                        clear_buffer_new(bh);
-                        clear_buffer_delay(bh);
-                        clear_buffer_unwritten(bh);
-                        clear_buffer_uptodate(bh);
-                        zero_user(page, pos, range_to_discard);
-                        BUFFER_TRACE(bh, "Buffer discarded");
-                        goto next;
-                }
-
-                /*
-                 * If this block is not completely contained in the range
-                 * to be discarded, then it is not going to be released. Because
-                 * we need to keep this block, we need to make sure this part
-                 * of the page is uptodate before we modify it by writeing
-                 * partial zeros on it.
-                 */
+        if (buffer_freed(bh)) {
+                BUFFER_TRACE(bh, "freed: skip");
+                goto unlock;
+        }
+        if (!buffer_mapped(bh)) {
+                BUFFER_TRACE(bh, "unmapped");
+                ext4_get_block(inode, iblock, bh, 0);
+                /* unmapped? It's a hole - nothing to do */
                 if (!buffer_mapped(bh)) {
-                        /*
-                         * Buffer head must be mapped before we can read
-                         * from the block
-                         */
-                        BUFFER_TRACE(bh, "unmapped");
-                        ext4_get_block(inode, iblock, bh, 0);
-                        /* unmapped? It's a hole - nothing to do */
-                        if (!buffer_mapped(bh)) {
-                                BUFFER_TRACE(bh, "still unmapped");
-                                goto next;
-                        }
+                        BUFFER_TRACE(bh, "still unmapped");
+                        goto unlock;
                 }
-
-                /* Ok, it's mapped. Make sure it's up-to-date */
-                if (PageUptodate(page))
-                        set_buffer_uptodate(bh);
-
-                if (!buffer_uptodate(bh)) {
-                        err = -EIO;
-                        ll_rw_block(READ, 1, &bh);
-                        wait_on_buffer(bh);
-                        /* Uhhuh. Read error. Complain and punt.*/
-                        if (!buffer_uptodate(bh))
-                                goto next;
-                }
-
-                if (ext4_should_journal_data(inode)) {
-                        BUFFER_TRACE(bh, "get write access");
-                        err = ext4_journal_get_write_access(handle, bh);
-                        if (err)
-                                goto next;
-                }
-
-                zero_user(page, pos, range_to_discard);
-
-                err = 0;
-                if (ext4_should_journal_data(inode)) {
-                        err = ext4_handle_dirty_metadata(handle, inode, bh);
-                } else
-                        mark_buffer_dirty(bh);
-
-                BUFFER_TRACE(bh, "Partial buffer zeroed");
-        next:
-                bh = bh->b_this_page;
-                iblock++;
-                pos += range_to_discard;
         }

+        /* Ok, it's mapped. Make sure it's up-to-date */
+        if (PageUptodate(page))
+                set_buffer_uptodate(bh);
+
+        if (!buffer_uptodate(bh)) {
+                err = -EIO;
+                ll_rw_block(READ, 1, &bh);
+                wait_on_buffer(bh);
+                /* Uhhuh. Read error. Complain and punt. */
+                if (!buffer_uptodate(bh))
+                        goto unlock;
+        }
+        if (ext4_should_journal_data(inode)) {
+                BUFFER_TRACE(bh, "get write access");
+                err = ext4_journal_get_write_access(handle, bh);
+                if (err)
+                        goto unlock;
+        }
+        zero_user(page, offset, length);
+        BUFFER_TRACE(bh, "zeroed end of block");
+
+        if (ext4_should_journal_data(inode)) {
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
+        } else {
+                err = 0;
+                mark_buffer_dirty(bh);
+                if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+                        err = ext4_jbd2_file_inode(handle, inode);
+        }
+
+unlock:
+        unlock_page(page);
+        page_cache_release(page);
+        return err;
+}
+
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+                             loff_t lstart, loff_t length)
+{
+        struct super_block *sb = inode->i_sb;
+        struct address_space *mapping = inode->i_mapping;
+        unsigned partial_start, partial_end;
+        ext4_fsblk_t start, end;
+        loff_t byte_end = (lstart + length - 1);
+        int err = 0;
+
+        partial_start = lstart & (sb->s_blocksize - 1);
+        partial_end = byte_end & (sb->s_blocksize - 1);
+
+        start = lstart >> sb->s_blocksize_bits;
+        end = byte_end >> sb->s_blocksize_bits;
+
+        /* Handle partial zero within the single block */
+        if (start == end &&
+            (partial_start || (partial_end != sb->s_blocksize - 1))) {
+                err = ext4_block_zero_page_range(handle, mapping,
+                                                 lstart, length);
+                return err;
+        }
+        /* Handle partial zero out on the start of the range */
+        if (partial_start) {
+                err = ext4_block_zero_page_range(handle, mapping,
+                                                 lstart, sb->s_blocksize);
+                if (err)
+                        return err;
+        }
+        /* Handle partial zero out on the end of the range */
+        if (partial_end != sb->s_blocksize - 1)
+                err = ext4_block_zero_page_range(handle, mapping,
+                                                 byte_end - partial_end,
+                                                 partial_end + 1);
         return err;
 }

···
  * Returns: 0 on success or negative on failure
  */

-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 {
-        struct inode *inode = file_inode(file);
         struct super_block *sb = inode->i_sb;
         ext4_lblk_t first_block, stop_block;
         struct address_space *mapping = inode->i_mapping;
-        loff_t first_page, last_page, page_len;
-        loff_t first_page_offset, last_page_offset;
+        loff_t first_block_offset, last_block_offset;
         handle_t *handle;
         unsigned int credits;
         int ret = 0;
···
                         offset;
         }

-        first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+        first_block_offset = round_up(offset, sb->s_blocksize);
+        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

-        first_page_offset = first_page << PAGE_CACHE_SHIFT;
-        last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-        /* Now release the pages */
-        if (last_page_offset > first_page_offset) {
-                truncate_pagecache_range(inode, first_page_offset,
-                                         last_page_offset - 1);
-        }
+        /* Now release the pages and zero block aligned part of pages*/
+        if (last_block_offset > first_block_offset)
+                truncate_pagecache_range(inode, first_block_offset,
+                                         last_block_offset);

         /* Wait all existing dio workers, newcomers will block on i_mutex */
         ext4_inode_block_unlocked_dio(inode);
-        ret = ext4_flush_unwritten_io(inode);
-        if (ret)
-                goto out_dio;
         inode_dio_wait(inode);

         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
···
                 goto out_dio;
         }

-        /*
-         * Now we need to zero out the non-page-aligned data in the
-         * pages at the start and tail of the hole, and unmap the
-         * buffer heads for the block aligned regions of the page that
-         * were completely zeroed.
-         */
-        if (first_page > last_page) {
-                /*
-                 * If the file space being truncated is contained
-                 * within a page just zero out and unmap the middle of
-                 * that page
-                 */
-                ret = ext4_discard_partial_page_buffers(handle,
-                        mapping, offset, length, 0);
-
-                if (ret)
-                        goto out_stop;
-        } else {
-                /*
-                 * zero out and unmap the partial page that contains
-                 * the start of the hole
-                 */
-                page_len = first_page_offset - offset;
-                if (page_len > 0) {
-                        ret = ext4_discard_partial_page_buffers(handle, mapping,
-                                                offset, page_len, 0);
-                        if (ret)
-                                goto out_stop;
-                }
-
-                /*
-                 * zero out and unmap the partial page that contains
-                 * the end of the hole
-                 */
-                page_len = offset + length - last_page_offset;
-                if (page_len > 0) {
-                        ret = ext4_discard_partial_page_buffers(handle, mapping,
-                                        last_page_offset, page_len, 0);
-                        if (ret)
-                                goto out_stop;
-                }
-        }
-
-        /*
-         * If i_size is contained in the last page, we need to
-         * unmap and zero the partial page after i_size
-         */
-        if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-            inode->i_size % PAGE_CACHE_SIZE != 0) {
-                page_len = PAGE_CACHE_SIZE -
-                        (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-                if (page_len > 0) {
-                        ret = ext4_discard_partial_page_buffers(handle,
-                                        mapping, inode->i_size, page_len, 0);
-
-                        if (ret)
-                                goto out_stop;
-                }
-        }
+        ret = ext4_zero_partial_blocks(handle, inode, offset,
+                                       length);
+        if (ret)
+                goto out_stop;

         first_block = (offset + sb->s_blocksize - 1) >>
                 EXT4_BLOCK_SIZE_BITS(sb);
···
         unsigned int credits;
         handle_t *handle;
         struct address_space *mapping = inode->i_mapping;
-        loff_t page_len;

         /*
          * There is a possibility that we're either freeing the inode
···
                 return;
         }

-        /*
-         * finish any pending end_io work so we won't run the risk of
-         * converting any truncated blocks to initialized later
-         */
-        ext4_flush_unwritten_io(inode);
-
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 credits = ext4_writepage_trans_blocks(inode);
         else
···
                 return;
         }

-        if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-                page_len = PAGE_CACHE_SIZE -
-                        (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-                if (ext4_discard_partial_page_buffers(handle,
-                                mapping, inode->i_size, page_len, 0))
-                        goto out_stop;
-        }
+        if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+                ext4_block_truncate_page(handle, mapping, inode->i_size);

         /*
          * We add the inode to the orphan list, so that if this
···
                                       inode->i_size >> PAGE_CACHE_SHIFT);
         if (!page)
                 return;
-        ret = __ext4_journalled_invalidatepage(page, offset);
+        ret = __ext4_journalled_invalidatepage(page, offset,
+                                               PAGE_CACHE_SIZE - offset);
         unlock_page(page);
         page_cache_release(page);
         if (ret != -EBUSY)
···
                          struct kstat *stat)
 {
         struct inode *inode;
-        unsigned long delalloc_blocks;
+        unsigned long long delalloc_blocks;

         inode = dentry->d_inode;
         generic_fillattr(inode, stat);
···
                 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
                                 EXT4_I(inode)->i_reserved_data_blocks);

-        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+        stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
         return 0;
 }

-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+                                   int pextents)
 {
         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-                return ext4_ind_trans_blocks(inode, nrblocks, chunk);
-        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+                return ext4_ind_trans_blocks(inode, lblocks);
+        return ext4_ext_index_trans_blocks(inode, pextents);
 }

 /*
···
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+                                  int pextents)
 {
         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
         int gdpblocks;
···
         int ret = 0;

         /*
-         * How many index blocks need to touch to modify nrblocks?
-         * The "Chunk" flag indicating whether the nrblocks is
-         * physically contiguous on disk
-         *
-         * For Direct IO and fallocate, they calls get_block to allocate
-         * one single extent at a time, so they could set the "Chunk" flag
+         * How many index blocks need to touch to map @lblocks logical blocks
+         * to @pextents physical extents?
          */
-        idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

         ret = idxblocks;

···
          * Now let's see how many group bitmaps and group descriptors need
          * to account
          */
-        groups = idxblocks;
-        if (chunk)
-                groups += 1;
-        else
-                groups += nrblocks;
-
+        groups = idxblocks + pextents;
         gdpblocks = groups;
         if (groups > ngroups)
                 groups = ngroups;
···
         int bpp = ext4_journal_blocks_per_page(inode);
         int ret;

-        ret = ext4_meta_trans_blocks(inode, bpp, 0);
+        ret = ext4_meta_trans_blocks(inode, bpp, bpp);

         /* Account for data blocks for journalled mode */
         if (ext4_should_journal_data(inode))
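The new ext4_zero_partial_blocks() only zeroes the sub-block head and tail of the punched range; the block-aligned middle is dropped by the page cache truncation and block freeing. A minimal Python sketch of that arithmetic (names are illustrative, not kernel APIs; the kernel passes s_blocksize for the head and lets ext4_block_zero_page_range() cap it, here the capped length is computed directly):

```python
def partial_ranges(lstart, length, blocksize=1024):
    """Return the (offset, length) byte ranges that need explicit zeroing,
    mirroring the checks in ext4_zero_partial_blocks()."""
    bits = blocksize.bit_length() - 1          # s_blocksize_bits
    byte_end = lstart + length - 1
    partial_start = lstart & (blocksize - 1)
    partial_end = byte_end & (blocksize - 1)
    start, end = lstart >> bits, byte_end >> bits

    ranges = []
    if start == end:
        # hole contained in a single block: one partial zero, if any
        if partial_start or partial_end != blocksize - 1:
            ranges.append((lstart, length))
        return ranges
    if partial_start:                          # unaligned head of the range
        ranges.append((lstart, blocksize - partial_start))
    if partial_end != blocksize - 1:           # unaligned tail of the range
        ranges.append((byte_end - partial_end, partial_end + 1))
    return ranges
```

A block-aligned punch (e.g. bytes 1024..3071 with 1k blocks) yields no explicit zeroing at all, which is exactly why ext4_punch_hole() can now skip the old page-by-page discard logic.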
+13 -8
fs/ext4/mballoc.c
···
         group = ac->ac_g_ex.fe_group;

         for (i = 0; i < ngroups; group++, i++) {
+                cond_resched();
                 /*
                  * Artificially restricted ngroups for non-extent
                  * files makes group > ngroups possible on first loop.
···
 repeat:
                 /* allocate space in core */
                 *errp = ext4_mb_regular_allocator(ac);
+                if (*errp)
+                        goto discard_and_exit;
+
+                /* as we've just preallocated more space than
+                 * user requested originally, we store allocated
+                 * space in a special descriptor */
+                if (ac->ac_status == AC_STATUS_FOUND &&
+                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+                        *errp = ext4_mb_new_preallocation(ac);
                 if (*errp) {
+                discard_and_exit:
                         ext4_discard_allocated_blocks(ac);
                         goto errout;
                 }
-
-                /* as we've just preallocated more space than
-                 * user requested orinally, we store allocated
-                 * space in a special descriptor */
-                if (ac->ac_status == AC_STATUS_FOUND &&
-                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-                        ext4_mb_new_preallocation(ac);
         }
         if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
···
         BUG_ON(bh && (count > 1));

         for (i = 0; i < count; i++) {
+                cond_resched();
                 if (!bh)
                         tbh = sb_find_get_block(inode->i_sb,
                                                 block + i);
-                if (unlikely(!tbh))
+                if (!tbh)
                         continue;
                 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                             inode, tbh, block + i);
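The mballoc change routes the error return of ext4_mb_new_preallocation() (previously ignored) through the same discard path as a regular-allocator failure. A toy Python model of the reworked control flow (all callables are illustrative stand-ins, not kernel functions):

```python
def allocate_blocks(regular_allocator, new_preallocation, requested, discard):
    """Mirror the restructured ext4_mb_new_blocks() error path: a failure
    from either the regular allocator *or* the preallocation bookkeeping
    discards the just-allocated blocks."""
    err, found = regular_allocator()
    if not err and found > requested:
        # surplus preallocated space is recorded in a special descriptor
        err = new_preallocation(found - requested)
    if err:
        discard()   # the discard_and_exit: label in the patch
        return err
    return 0
```

Before the patch, a preallocation-descriptor failure left the allocated blocks behind; in this model that corresponds to returning `found` without ever calling `discard()`.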
-3
fs/ext4/move_extent.c
···
         struct page *pagep[2] = {NULL, NULL};
         handle_t *handle;
         ext4_lblk_t orig_blk_offset;
-        long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
         unsigned long blocksize = orig_inode->i_sb->s_blocksize;
         unsigned int w_flags = 0;
         unsigned int tmp_data_size, data_size, replaced_size;
···

         orig_blk_offset = orig_page_offset * blocks_per_page +
                 data_offset_in_page;
-
-        offs = (long long)orig_blk_offset << orig_inode->i_blkbits;

         /* Calculate data_size */
         if ((orig_blk_offset + block_len_in_page - 1) ==
+2 -5
fs/ext4/namei.c
···
                                            bh->b_data, bh->b_size,
                                            (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                                     + ((char *)de - bh->b_data))) {
-                        /* On error, skip the f_pos to the next block. */
-                        dir_file->f_pos = (dir_file->f_pos |
-                                        (dir->i_sb->s_blocksize - 1)) + 1;
-                        brelse(bh);
-                        return count;
+                        /* silently ignore the rest of the block */
+                        break;
                 }
                 ext4fs_dirhash(de->name, de->name_len, hinfo);
                 if ((hinfo->hash < start_hash) ||
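The namei.c change makes htree_dirblock_to_tree() stop at the first corrupt directory entry and ignore the remainder of that block, rather than bailing out of the whole scan with a skewed f_pos. The pattern can be sketched in Python over a deliberately simplified record layout (2-byte rec_len, 1-byte name_len, then the name; the real ext4_dir_entry_2 differs, so treat this purely as an illustration):

```python
def scan_block(block):
    """Collect names from one directory block; on the first corrupt
    entry, silently ignore the rest of the block, matching the
    patched break in htree_dirblock_to_tree()."""
    names, off = [], 0
    while off < len(block):
        rec_len = int.from_bytes(block[off:off + 2], "little")
        # simplified versions of the ext4_check_dir_entry() sanity checks:
        # rec_len must be sane, 4-byte aligned, and stay inside the block
        if rec_len < 4 or rec_len % 4 or off + rec_len > len(block):
            break   # corrupt: ignore the rest of the block
        name_len = block[off + 2]
        names.append(block[off + 3:off + 3 + name_len].decode())
        off += rec_len
    return names
```

Everything hashed before the corruption is still returned, so one bad entry no longer discards the valid entries that precede it.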
+246 -205
fs/ext4/page-io.c
···
 }

 /*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
- */
-void ext4_ioend_shutdown(struct inode *inode)
-{
-        wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
-        /*
-         * We need to make sure the work structure is finished being
-         * used before we let the inode get destroyed.
-         */
-        if (work_pending(&EXT4_I(inode)->i_unwritten_work))
-                cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
-}
-
-void ext4_free_io_end(ext4_io_end_t *io)
-{
-        BUG_ON(!io);
-        BUG_ON(!list_empty(&io->list));
-        BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
-
-        if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
-                wake_up_all(ext4_ioend_wq(io->inode));
-        kmem_cache_free(io_end_cachep, io);
-}
-
-/* check a range of space and convert unwritten extents to written. */
-static int ext4_end_io(ext4_io_end_t *io)
-{
-        struct inode *inode = io->inode;
-        loff_t offset = io->offset;
-        ssize_t size = io->size;
-        int ret = 0;
-
-        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                   "list->prev 0x%p\n",
-                   io, inode->i_ino, io->list.next, io->list.prev);
-
-        ret = ext4_convert_unwritten_extents(inode, offset, size);
-        if (ret < 0) {
-                ext4_msg(inode->i_sb, KERN_EMERG,
-                         "failed to convert unwritten extents to written "
-                         "extents -- potential data loss!  "
-                         "(inode %lu, offset %llu, size %zd, error %d)",
-                         inode->i_ino, offset, size, ret);
-        }
-        /* Wake up anyone waiting on unwritten extent conversion */
-        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
-                wake_up_all(ext4_ioend_wq(inode));
-        if (io->flag & EXT4_IO_END_DIRECT)
-                inode_dio_done(inode);
-        if (io->iocb)
-                aio_complete(io->iocb, io->result, 0);
-        return ret;
-}
-
-static void dump_completed_IO(struct inode *inode)
-{
-#ifdef EXT4FS_DEBUG
-        struct list_head *cur, *before, *after;
-        ext4_io_end_t *io, *io0, *io1;
-
-        if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
-                ext4_debug("inode %lu completed_io list is empty\n",
-                           inode->i_ino);
-                return;
-        }
-
-        ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
-        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
-                cur = &io->list;
-                before = cur->prev;
-                io0 = container_of(before, ext4_io_end_t, list);
-                after = cur->next;
-                io1 = container_of(after, ext4_io_end_t, list);
-
-                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-        }
-#endif
-}
-
-/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
-{
-        struct ext4_inode_info *ei = EXT4_I(io_end->inode);
-        struct workqueue_struct *wq;
-        unsigned long flags;
-
-        BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        if (list_empty(&ei->i_completed_io_list))
-                queue_work(wq, &ei->i_unwritten_work);
-        list_add_tail(&io_end->list, &ei->i_completed_io_list);
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-}
-
-static int ext4_do_flush_completed_IO(struct inode *inode)
-{
-        ext4_io_end_t *io;
-        struct list_head unwritten;
-        unsigned long flags;
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        int err, ret = 0;
-
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        dump_completed_IO(inode);
-        list_replace_init(&ei->i_completed_io_list, &unwritten);
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
-        while (!list_empty(&unwritten)) {
-                io = list_entry(unwritten.next, ext4_io_end_t, list);
-                BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-                list_del_init(&io->list);
-
-                err = ext4_end_io(io);
-                if (unlikely(!ret && err))
-                        ret = err;
-                io->flag &= ~EXT4_IO_END_UNWRITTEN;
-                ext4_free_io_end(io);
-        }
-        return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-void ext4_end_io_work(struct work_struct *work)
-{
-        struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
-                                                  i_unwritten_work);
-        ext4_do_flush_completed_IO(&ei->vfs_inode);
-}
-
-int ext4_flush_unwritten_io(struct inode *inode)
-{
-        int ret;
-        WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
-                     !(inode->i_state & I_FREEING));
-        ret = ext4_do_flush_completed_IO(inode);
-        ext4_unwritten_wait(inode);
-        return ret;
-}
-
-ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
-{
-        ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
-        if (io) {
-                atomic_inc(&EXT4_I(inode)->i_ioend_count);
-                io->inode = inode;
-                INIT_LIST_HEAD(&io->list);
-        }
-        return io;
-}
-
-/*
  * Print an buffer I/O error compatible with the fs/buffer.c. This
  * provides compatibility with dmesg scrapers that look for a specific
  * buffer I/O error message. We really need a unified error reporting
···
                    (unsigned long long)bh->b_blocknr);
 }

-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_finish_bio(struct bio *bio)
 {
-        ext4_io_end_t *io_end = bio->bi_private;
-        struct inode *inode;
         int i;
-        int blocksize;
-        sector_t bi_sector = bio->bi_sector;
+        int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);

-        BUG_ON(!io_end);
-        inode = io_end->inode;
-        blocksize = 1 << inode->i_blkbits;
-        bio->bi_private = NULL;
-        bio->bi_end_io = NULL;
-        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-                error = 0;
         for (i = 0; i < bio->bi_vcnt; i++) {
                 struct bio_vec *bvec = &bio->bi_io_vec[i];
                 struct page *page = bvec->bv_page;
···
                 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
                 do {
                         if (bh_offset(bh) < bio_start ||
-                            bh_offset(bh) + blocksize > bio_end) {
+                            bh_offset(bh) + bh->b_size > bio_end) {
                                 if (buffer_async_write(bh))
                                         under_io++;
                                 continue;
···
                 if (!under_io)
                         end_page_writeback(page);
         }
-        bio_put(bio);
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+        struct bio *bio, *next_bio;
+
+        BUG_ON(!list_empty(&io_end->list));
+        BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+        WARN_ON(io_end->handle);
+
+        if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+                wake_up_all(ext4_ioend_wq(io_end->inode));
+
+        for (bio = io_end->bio; bio; bio = next_bio) {
+                next_bio = bio->bi_private;
+                ext4_finish_bio(bio);
+                bio_put(bio);
+        }
+        if (io_end->flag & EXT4_IO_END_DIRECT)
+                inode_dio_done(io_end->inode);
+        if (io_end->iocb)
+                aio_complete(io_end->iocb, io_end->result, 0);
+        kmem_cache_free(io_end_cachep, io_end);
+}
+
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+        struct inode *inode = io_end->inode;
+
+        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+        /* Wake up anyone waiting on unwritten extent conversion */
+        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+                wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
+static int ext4_end_io(ext4_io_end_t *io)
+{
+        struct inode *inode = io->inode;
+        loff_t offset = io->offset;
+        ssize_t size = io->size;
+        handle_t *handle = io->handle;
+        int ret = 0;
+
+        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+                   "list->prev 0x%p\n",
+                   io, inode->i_ino, io->list.next, io->list.prev);
+
+        io->handle = NULL;      /* Following call will use up the handle */
+        ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+        if (ret < 0) {
+                ext4_msg(inode->i_sb, KERN_EMERG,
+                         "failed to convert unwritten extents to written "
+                         "extents -- potential data loss!  "
+                         "(inode %lu, offset %llu, size %zd, error %d)",
+                         inode->i_ino, offset, size, ret);
+        }
+        ext4_clear_io_unwritten_flag(io);
+        ext4_release_io_end(io);
+        return ret;
+}
+
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
+{
+#ifdef EXT4FS_DEBUG
+        struct list_head *cur, *before, *after;
+        ext4_io_end_t *io, *io0, *io1;
+
+        if (list_empty(head))
+                return;
+
+        ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+        list_for_each_entry(io, head, list) {
+                cur = &io->list;
+                before = cur->prev;
+                io0 = container_of(before, ext4_io_end_t, list);
+                after = cur->next;
+                io1 = container_of(after, ext4_io_end_t, list);
+
+                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+        }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+        struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+        struct workqueue_struct *wq;
+        unsigned long flags;
+
+        BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        if (io_end->handle) {
+                wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+                if (list_empty(&ei->i_rsv_conversion_list))
+                        queue_work(wq, &ei->i_rsv_conversion_work);
+                list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+        } else {
+                wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+                if (list_empty(&ei->i_unrsv_conversion_list))
+                        queue_work(wq, &ei->i_unrsv_conversion_work);
+                list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+        }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
+static int ext4_do_flush_completed_IO(struct inode *inode,
+                                      struct list_head *head)
+{
+        ext4_io_end_t *io;
+        struct list_head unwritten;
+        unsigned long flags;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        int err, ret = 0;
+
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        dump_completed_IO(inode, head);
+        list_replace_init(head, &unwritten);
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+        while (!list_empty(&unwritten)) {
+                io = list_entry(unwritten.next, ext4_io_end_t, list);
+                BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+                list_del_init(&io->list);
+
+                err = ext4_end_io(io);
+                if (unlikely(!ret && err))
+                        ret = err;
+        }
+        return ret;
+}
+
+/*
+ * work on completed IO, to convert unwritten extents to extents
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+        struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+                                                  i_rsv_conversion_work);
+        ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+void ext4_end_io_unrsv_work(struct work_struct *work)
+{
+        struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+                                                  i_unrsv_conversion_work);
+        ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+        ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
+        if (io) {
+                atomic_inc(&EXT4_I(inode)->i_ioend_count);
+                io->inode = inode;
+                INIT_LIST_HEAD(&io->list);
+                atomic_set(&io->count, 1);
+        }
+        return io;
+}
+
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+        if (atomic_dec_and_test(&io_end->count)) {
+                if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+                        ext4_release_io_end(io_end);
+                        return;
+                }
+                ext4_add_complete_io(io_end);
+        }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+        int err = 0;
+
+        if (atomic_dec_and_test(&io_end->count)) {
+                if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+                        err = ext4_convert_unwritten_extents(io_end->handle,
+                                                io_end->inode, io_end->offset,
+                                                io_end->size);
+                        io_end->handle = NULL;
+                        ext4_clear_io_unwritten_flag(io_end);
+                }
+                ext4_release_io_end(io_end);
+        }
+        return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+        atomic_inc(&io_end->count);
+        return io_end;
+}
+
+static void ext4_end_bio(struct bio *bio, int error)
+{
+        ext4_io_end_t *io_end = bio->bi_private;
+        sector_t bi_sector = bio->bi_sector;
+
+        BUG_ON(!io_end);
+        bio->bi_end_io = NULL;
+        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+                error = 0;
+
+        if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+                /*
+                 * Link bio into list hanging from io_end. We have to do it
+                 * atomically as bio completions can be racing against each
+                 * other.
+                 */
+                bio->bi_private = xchg(&io_end->bio, bio);
+        } else {
+                ext4_finish_bio(bio);
+                bio_put(bio);
+        }

         if (error) {
-                io_end->flag |= EXT4_IO_END_ERROR;
+                struct inode *inode = io_end->inode;
+
                 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
                              "(offset %llu size %ld starting block %llu)",
                              inode->i_ino,
···
                              (unsigned long long)
                              bi_sector >> (inode->i_blkbits - 9));
         }
-
-        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-                ext4_free_io_end(io_end);
-                return;
-        }
-
-        ext4_add_complete_io(io_end);
+        ext4_put_io_end_defer(io_end);
 }

 void ext4_io_submit(struct ext4_io_submit *io)
···
                 bio_put(io->io_bio);
         }
         io->io_bio = NULL;
-        io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+                         struct writeback_control *wbc)
+{
+        io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+        io->io_bio = NULL;
         io->io_end = NULL;
 }

-static int io_submit_init(struct ext4_io_submit *io,
-                          struct inode *inode,
-                          struct writeback_control *wbc,
-                          struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+                              struct buffer_head *bh)
 {
-        ext4_io_end_t *io_end;
-        struct page *page = bh->b_page;
         int nvecs = bio_get_nr_vecs(bh->b_bdev);
         struct bio *bio;

-        io_end = ext4_init_io_end(inode, GFP_NOFS);
-        if (!io_end)
-                return -ENOMEM;
         bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+        if (!bio)
+                return -ENOMEM;
         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
         bio->bi_bdev = bh->b_bdev;
-        bio->bi_private = io->io_end = io_end;
         bio->bi_end_io = ext4_end_bio;
-
-        io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+        bio->bi_private = ext4_get_io_end(io->io_end);
         io->io_bio = bio;
-        io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
         io->io_next_block = bh->b_blocknr;
         return 0;
 }

 static int io_submit_add_bh(struct ext4_io_submit *io,
                             struct inode *inode,
-                            struct writeback_control *wbc,
                             struct buffer_head *bh)
 {
-        ext4_io_end_t *io_end;
         int ret;

         if (io->io_bio && bh->b_blocknr != io->io_next_block) {
···
                 ext4_io_submit(io);
         }
         if (io->io_bio == NULL) {
-                ret = io_submit_init(io, inode, wbc, bh);
+                ret = io_submit_init_bio(io, bh);
                 if (ret)
                         return ret;
         }
-        io_end = io->io_end;
-        if (test_clear_buffer_uninit(bh))
-                ext4_set_io_unwritten_flag(inode, io_end);
-        io->io_end->size += bh->b_size;
-        io->io_next_block++;
         ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
         if (ret != bh->b_size)
                 goto submit_and_retry;
+        io->io_next_block++;
         return 0;
 }

···
                 do {
                         if (!buffer_async_write(bh))
                                 continue;
-                        ret = io_submit_add_bh(io, inode, wbc, bh);
+                        ret = io_submit_add_bh(io, inode, bh);
                         if (ret) {
                                 /*
                                  * We only get here on ENOMEM. Not much else
+13 -11
fs/ext4/resize.c
···
         ext4_fsblk_t end = start + input->blocks_count;
         ext4_group_t group = input->group;
         ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
-        unsigned overhead = ext4_group_overhead_blocks(sb, group);
-        ext4_fsblk_t metaend = start + overhead;
+        unsigned overhead;
+        ext4_fsblk_t metaend;
         struct buffer_head *bh = NULL;
         ext4_grpblk_t free_blocks_count, offset;
         int err = -EINVAL;

+        if (group != sbi->s_groups_count) {
+                ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+                             input->group, sbi->s_groups_count);
+                return -EINVAL;
+        }
+
+        overhead = ext4_group_overhead_blocks(sb, group);
+        metaend = start + overhead;
         input->free_blocks_count = free_blocks_count =
                 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;

···
                        free_blocks_count, input->reserved_blocks);

         ext4_get_group_no_and_offset(sb, start, NULL, &offset);
-        if (group != sbi->s_groups_count)
-                ext4_warning(sb, "Cannot add at group %u (only %u groups)",
-                             input->group, sbi->s_groups_count);
-        else if (offset != 0)
+        if (offset != 0)
                 ext4_warning(sb, "Last group not full");
         else if (input->reserved_blocks > input->blocks_count / 5)
                 ext4_warning(sb, "Reserved blocks too high (%u)",
···
         int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
                 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
         struct inode *inode = NULL;
-        int gdb_off, gdb_num;
+        int gdb_off;
         int err;
         __u16 bg_flags = 0;

-        gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
         gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);

         if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
···
                 err = err2;

         if (!err) {
-                ext4_fsblk_t first_block;
-                first_block = ext4_group_first_block_no(sb, 0);
                 if (test_opt(sb, DEBUG))
                         printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
                                "blocks\n", ext4_blocks_count(es));
-                update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+                update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
                                (char *)es, sizeof(struct ext4_super_block), 0);
         }
         return err;
+118 -37
fs/ext4/super.c
···
 static void ext4_clear_journal_err(struct super_block *sb,
 				   struct ext4_super_block *es);
 static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
···
 	}
 	if (test_opt(sb, ERRORS_RO)) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	if (test_opt(sb, ERRORS_PANIC))
···
 		ext4_handle_error(sb);
 }

-void ext4_error_inode(struct inode *inode, const char *function,
-		      unsigned int line, ext4_fsblk_t block,
-		      const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+			unsigned int line, ext4_fsblk_t block,
+			const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
···
 	ext4_handle_error(inode->i_sb);
 }

-void ext4_error_file(struct file *file, const char *function,
-		     unsigned int line, ext4_fsblk_t block,
-		     const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+		       unsigned int line, ext4_fsblk_t block,
+		       const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
···
 	if ((sb->s_flags & MS_RDONLY) == 0) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
 		if (EXT4_SB(sb)->s_journal)
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
···
 		panic("EXT4-fs panic from previous error\n");
 }

-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+		const char *prefix, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
···
 	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

-	flush_workqueue(sbi->dio_unwritten_wq);
-	destroy_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	destroy_workqueue(sbi->unrsv_conversion_wq);
+	destroy_workqueue(sbi->rsv_conversion_wq);

 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
···
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}

-	ext4_es_unregister_shrinker(sb);
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
···
 	rwlock_init(&ei->i_es_lock);
 	INIT_LIST_HEAD(&ei->i_es_lru);
 	ei->i_es_lru_nr = 0;
+	ei->i_touch_when = 0;
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
···
 	ei->i_reserved_quota = 0;
 #endif
 	ei->jinode = NULL;
-	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_ioend_count, 0);
 	atomic_set(&ei->i_unwritten, 0);
-	INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);

 	return &ei->vfs_inode;
 }
···
 	.dirty_inode	= ext4_dirty_inode,
 	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
+	.sync_fs	= ext4_sync_fs_nojournal,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
···
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
 	ext4_group_t flex_group;
-	unsigned int groups_per_flex = 0;
 	int i, err;

 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
···
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
-	groups_per_flex = 1U << sbi->s_log_groups_per_flex;

 	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
 	if (err)
···
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		dquot_initialize(inode);
 		if (inode->i_nlink) {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: truncating inode %lu to %lld bytes",
-				__func__, inode->i_ino, inode->i_size);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: truncating inode %lu to %lld bytes",
+					__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			mutex_lock(&inode->i_mutex);
+			truncate_inode_pages(inode->i_mapping, inode->i_size);
 			ext4_truncate(inode);
 			mutex_unlock(&inode->i_mutex);
 			nr_truncates++;
 		} else {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: deleting unreferenced inode %lu",
-				__func__, inode->i_ino);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: deleting unreferenced inode %lu",
+					__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
···
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
-	int offset;
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
 };

 static int parse_strtoull(const char *buf,
···
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);

 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 }
···
 			  struct ext4_sb_info *sbi,
 			  const char *buf, size_t count)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 	unsigned long t;
 	int ret;
···
 	return count;
 }

+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-	.offset = offsetof(struct ext4_sb_info, _elname),	\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
 }
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
···
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 #define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}

 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
···
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
···
 	sbi->s_err_report.data = (unsigned long) sb;

 	/* Register extent status tree shrinker */
-	ext4_es_register_shrinker(sb);
+	ext4_es_register_shrinker(sbi);

 	err = percpu_counter_init(&sbi->s_freeclusters_counter,
 			ext4_count_free_clusters(sb));
···
 	}

 	sbi->s_stripe = ext4_get_stripe_size(sbi);
-	sbi->s_max_writeback_mb_bump = 128;
 	sbi->s_extent_max_zeroout_kb = 32;

 	/*
···
 	 * The maximum number of concurrent works can be high and
 	 * concurrency isn't really necessary.  Limit it to 1.
 	 */
-	EXT4_SB(sb)->dio_unwritten_wq =
-		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-	if (!EXT4_SB(sb)->dio_unwritten_wq) {
-		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+	EXT4_SB(sb)->rsv_conversion_wq =
+		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->rsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
 		ret = -ENOMEM;
-		goto failed_mount_wq;
+		goto failed_mount4;
+	}
+
+	EXT4_SB(sb)->unrsv_conversion_wq =
+		alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
 	}

 	/*
···
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
-	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+	if (EXT4_SB(sb)->rsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	if (EXT4_SB(sb)->unrsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
-	ext4_es_unregister_shrinker(sb);
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
···
 {
 	int ret = 0;
 	tid_t target;
+	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

 	trace_ext4_sync_fs(sb, wait);
-	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
 	/*
 	 * Writeback quota in non-journalled quota case - journalled quota has
 	 * no dirty dquots
 	 */
 	dquot_writeback_dquots(sb, -1);
+	/*
+	 * Data writeback is possible w/o journal transaction, so barrier must
+	 * being sent at the end of the function. But we can skip it if
+	 * transaction_commit will do it for us.
+	 */
+	target = jbd2_get_latest_transaction(sbi->s_journal);
+	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+		needs_barrier = true;
+
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
-			jbd2_log_wait_commit(sbi->s_journal, target);
+			ret = jbd2_log_wait_commit(sbi->s_journal, target);
 	}
+	if (needs_barrier) {
+		int err;
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+	int ret = 0;
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+	dquot_writeback_dquots(sb, -1);
+	if (wait && test_opt(sb, BARRIER))
+		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
 	return ret;
 }
+2 -1
fs/f2fs/data.c
···
 			get_data_block_ro);
 }

-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+2 -1
fs/f2fs/node.c
···
 	return 0;
 }

-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+12 -5
fs/gfs2/aops.c
···
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
 	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		goto out;
 	}
 	return 1;
···

 	/* Is the page fully outside i_size? (truncate in progress) */
 	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0,
+						     PAGE_CACHE_SIZE);
 		unlock_page(page);
 		continue;
 	}
···
 	unlock_buffer(bh);
 }

-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	struct buffer_head *bh, *head;
 	unsigned long pos = 0;

 	BUG_ON(!PageLocked(page));
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 	if (!page_has_buffers(page))
 		goto out;

 	bh = head = page_buffers(page);
 	do {
+		if (pos + bh->b_size > stop)
+			return;
+
 		if (offset <= pos)
 			gfs2_discard(sdp, bh);
 		pos += bh->b_size;
 		bh = bh->b_this_page;
 	} while (bh != head);
out:
-	if (offset == 0)
+	if (!partial_page)
 		try_to_release_page(page, 0);
 }
+14 -5
fs/jbd/transaction.c
···
 /**
  * void journal_invalidatepage() - invalidate a journal page
  * @journal: journal to use for flush
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  offset of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
  */
 void journal_invalidatepage(journal_t *journal,
 			    struct page *page,
-			    unsigned long offset)
+			    unsigned int offset,
+			    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;

 	if (!PageLocked(page))
 		BUG();
 	if (!page_has_buffers(page))
 		return;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);

 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
···
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;

+		if (next_off > stop)
+			return;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
 			may_free &= journal_unmap_buffer(journal, bh,
-							 offset > 0);
+							 partial_page);
 			unlock_buffer(bh);
 		}
 		curr_off = next_off;
···

 	} while (bh != head);

-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
+3 -3
fs/jbd2/Kconfig
···

 config JBD2_DEBUG
 	bool "JBD2 (ext4) debugging support"
-	depends on JBD2 && DEBUG_FS
+	depends on JBD2
 	help
 	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
···
 	  By default, the debugging output will be turned off.

 	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
 	  number between 1 and 5. The higher the number, the more debugging
 	  output is generated. To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+	  "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
+12 -10
fs/jbd2/checkpoint.c
···
 	int nblocks, space_left;
 	/* assert_spin_locked(&journal->j_state_lock); */

-	nblocks = jbd_space_needed(journal);
-	while (__jbd2_log_space_left(journal) < nblocks) {
+	nblocks = jbd2_space_needed(journal);
+	while (jbd2_log_space_left(journal) < nblocks) {
 		if (journal->j_flags & JBD2_ABORT)
 			return;
 		write_unlock(&journal->j_state_lock);
···
 	 */
 	write_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
-	nblocks = jbd_space_needed(journal);
-	space_left = __jbd2_log_space_left(journal);
+	nblocks = jbd2_space_needed(journal);
+	space_left = jbd2_log_space_left(journal);
 	if (space_left < nblocks) {
 		int chkpt = journal->j_checkpoint_transactions != NULL;
 		tid_t tid = 0;
···
 			/* We were able to recover space; yay! */
 			;
 		} else if (tid) {
+			/*
+			 * jbd2_journal_commit_transaction() may want
+			 * to take the checkpoint_mutex if JBD2_FLUSHED
+			 * is set. So we need to temporarily drop it.
+			 */
+			mutex_unlock(&journal->j_checkpoint_mutex);
 			jbd2_log_wait_commit(journal, tid);
+			write_lock(&journal->j_state_lock);
+			continue;
 		} else {
 			printk(KERN_ERR "%s: needed %d blocks and "
 			       "only had %d space available\n",
···

 	__jbd2_journal_drop_transaction(journal, transaction);
 	jbd2_journal_free_transaction(transaction);
-
-	/* Just in case anybody was waiting for more transactions to be
-	   checkpointed... */
-	wake_up(&journal->j_wait_logspace);
 	ret = 1;
out:
 	return ret;
···
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
-	J_ASSERT(transaction->t_iobuf_list == NULL);
 	J_ASSERT(transaction->t_shadow_list == NULL);
-	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
+77 -107
fs/jbd2/commit.c
··· 30 30 #include <trace/events/jbd2.h> 31 31 32 32 /* 33 - * Default IO end handler for temporary BJ_IO buffer_heads. 33 + * IO end handler for temporary buffer_heads handling writes to the journal. 34 34 */ 35 35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 36 36 { 37 + struct buffer_head *orig_bh = bh->b_private; 38 + 37 39 BUFFER_TRACE(bh, ""); 38 40 if (uptodate) 39 41 set_buffer_uptodate(bh); 40 42 else 41 43 clear_buffer_uptodate(bh); 44 + if (orig_bh) { 45 + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); 46 + smp_mb__after_clear_bit(); 47 + wake_up_bit(&orig_bh->b_state, BH_Shadow); 48 + } 42 49 unlock_buffer(bh); 43 50 } 44 51 ··· 92 85 __brelse(bh); 93 86 } 94 87 95 - static void jbd2_commit_block_csum_set(journal_t *j, 96 - struct journal_head *descriptor) 88 + static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) 97 89 { 98 90 struct commit_header *h; 99 91 __u32 csum; ··· 100 94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 101 95 return; 102 96 103 - h = (struct commit_header *)(jh2bh(descriptor)->b_data); 97 + h = (struct commit_header *)(bh->b_data); 104 98 h->h_chksum_type = 0; 105 99 h->h_chksum_size = 0; 106 100 h->h_chksum[0] = 0; 107 - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 108 - j->j_blocksize); 101 + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); 109 102 h->h_chksum[0] = cpu_to_be32(csum); 110 103 } 111 104 ··· 121 116 struct buffer_head **cbh, 122 117 __u32 crc32_sum) 123 118 { 124 - struct journal_head *descriptor; 125 119 struct commit_header *tmp; 126 120 struct buffer_head *bh; 127 121 int ret; ··· 131 127 if (is_journal_aborted(journal)) 132 128 return 0; 133 129 134 - descriptor = jbd2_journal_get_descriptor_buffer(journal); 135 - if (!descriptor) 130 + bh = jbd2_journal_get_descriptor_buffer(journal); 131 + if (!bh) 136 132 return 1; 137 - 138 - bh = jh2bh(descriptor); 139 133 140 134 tmp = (struct commit_header 
*)bh->b_data; 141 135 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); ··· 148 146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 149 147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 150 148 } 151 - jbd2_commit_block_csum_set(journal, descriptor); 149 + jbd2_commit_block_csum_set(journal, bh); 152 150 153 - JBUFFER_TRACE(descriptor, "submit commit block"); 151 + BUFFER_TRACE(bh, "submit commit block"); 154 152 lock_buffer(bh); 155 153 clear_buffer_dirty(bh); 156 154 set_buffer_uptodate(bh); ··· 182 180 if (unlikely(!buffer_uptodate(bh))) 183 181 ret = -EIO; 184 182 put_bh(bh); /* One for getblk() */ 185 - jbd2_journal_put_journal_head(bh2jh(bh)); 186 183 187 184 return ret; 188 185 } ··· 322 321 } 323 322 324 323 static void jbd2_descr_block_csum_set(journal_t *j, 325 - struct journal_head *descriptor) 324 + struct buffer_head *bh) 326 325 { 327 326 struct jbd2_journal_block_tail *tail; 328 327 __u32 csum; ··· 330 329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 331 330 return; 332 331 333 - tail = (struct jbd2_journal_block_tail *) 334 - (jh2bh(descriptor)->b_data + j->j_blocksize - 332 + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 335 333 sizeof(struct jbd2_journal_block_tail)); 336 334 tail->t_checksum = 0; 337 - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 338 - j->j_blocksize); 335 + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); 339 336 tail->t_checksum = cpu_to_be32(csum); 340 337 } 341 338 ··· 342 343 { 343 344 struct page *page = bh->b_page; 344 345 __u8 *addr; 345 - __u32 csum; 346 + __u32 csum32; 346 347 347 348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 348 349 return; 349 350 350 351 sequence = cpu_to_be32(sequence); 351 352 addr = kmap_atomic(page); 352 - csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 353 - sizeof(sequence)); 354 - csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), 355 - bh->b_size); 353 + csum32 = 
jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 354 + sizeof(sequence)); 355 + csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), 356 + bh->b_size); 356 357 kunmap_atomic(addr); 357 358 358 - tag->t_checksum = cpu_to_be32(csum); 359 + /* We only have space to store the lower 16 bits of the crc32c. */ 360 + tag->t_checksum = cpu_to_be16(csum32); 359 361 } 360 362 /* 361 363 * jbd2_journal_commit_transaction ··· 368 368 { 369 369 struct transaction_stats_s stats; 370 370 transaction_t *commit_transaction; 371 - struct journal_head *jh, *new_jh, *descriptor; 371 + struct journal_head *jh; 372 + struct buffer_head *descriptor; 372 373 struct buffer_head **wbuf = journal->j_wbuf; 373 374 int bufs; 374 375 int flags; ··· 393 392 tid_t first_tid; 394 393 int update_tail; 395 394 int csum_size = 0; 395 + LIST_HEAD(io_bufs); 396 + LIST_HEAD(log_bufs); 396 397 397 398 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 398 399 csum_size = sizeof(struct jbd2_journal_block_tail); ··· 427 424 J_ASSERT(journal->j_committing_transaction == NULL); 428 425 429 426 commit_transaction = journal->j_running_transaction; 430 - J_ASSERT(commit_transaction->t_state == T_RUNNING); 431 427 432 428 trace_jbd2_start_commit(journal, commit_transaction); 433 429 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 434 430 commit_transaction->t_tid); 435 431 436 432 write_lock(&journal->j_state_lock); 433 + J_ASSERT(commit_transaction->t_state == T_RUNNING); 437 434 commit_transaction->t_state = T_LOCKED; 438 435 439 436 trace_jbd2_commit_locking(journal, commit_transaction); ··· 523 520 */ 524 521 jbd2_journal_switch_revoke_table(journal); 525 522 523 + /* 524 + * Reserved credits cannot be claimed anymore, free them 525 + */ 526 + atomic_sub(atomic_read(&journal->j_reserved_credits), 527 + &commit_transaction->t_outstanding_credits); 528 + 526 529 trace_jbd2_commit_flushing(journal, commit_transaction); 527 530 stats.run.rs_flushing = jiffies; 528 531 
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, ··· 542 533 wake_up(&journal->j_wait_transaction_locked); 543 534 write_unlock(&journal->j_state_lock); 544 535 545 - jbd_debug(3, "JBD2: commit phase 2\n"); 536 + jbd_debug(3, "JBD2: commit phase 2a\n"); 546 537 547 538 /* 548 539 * Now start flushing things to disk, in the order they appear ··· 554 545 555 546 blk_start_plug(&plug); 556 547 jbd2_journal_write_revoke_records(journal, commit_transaction, 557 - WRITE_SYNC); 548 + &log_bufs, WRITE_SYNC); 558 549 blk_finish_plug(&plug); 559 550 560 - jbd_debug(3, "JBD2: commit phase 2\n"); 551 + jbd_debug(3, "JBD2: commit phase 2b\n"); 561 552 562 553 /* 563 554 * Way to go: we have now written out all of the data for a ··· 580 571 atomic_read(&commit_transaction->t_outstanding_credits)); 581 572 582 573 err = 0; 583 - descriptor = NULL; 584 574 bufs = 0; 575 + descriptor = NULL; 585 576 blk_start_plug(&plug); 586 577 while (commit_transaction->t_buffers) { 587 578 ··· 613 604 record the metadata buffer. 
*/ 614 605 615 606 if (!descriptor) { 616 - struct buffer_head *bh; 617 - 618 607 J_ASSERT (bufs == 0); 619 608 620 609 jbd_debug(4, "JBD2: get descriptor\n"); ··· 623 616 continue; 624 617 } 625 618 626 - bh = jh2bh(descriptor); 627 619 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 628 - (unsigned long long)bh->b_blocknr, bh->b_data); 629 - header = (journal_header_t *)&bh->b_data[0]; 620 + (unsigned long long)descriptor->b_blocknr, 621 + descriptor->b_data); 622 + header = (journal_header_t *)descriptor->b_data; 630 623 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 631 624 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 632 625 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 633 626 634 - tagp = &bh->b_data[sizeof(journal_header_t)]; 635 - space_left = bh->b_size - sizeof(journal_header_t); 627 + tagp = &descriptor->b_data[sizeof(journal_header_t)]; 628 + space_left = descriptor->b_size - 629 + sizeof(journal_header_t); 636 630 first_tag = 1; 637 - set_buffer_jwrite(bh); 638 - set_buffer_dirty(bh); 639 - wbuf[bufs++] = bh; 631 + set_buffer_jwrite(descriptor); 632 + set_buffer_dirty(descriptor); 633 + wbuf[bufs++] = descriptor; 640 634 641 635 /* Record it so that we can wait for IO 642 636 completion later */ 643 - BUFFER_TRACE(bh, "ph3: file as descriptor"); 644 - jbd2_journal_file_buffer(descriptor, commit_transaction, 645 - BJ_LogCtl); 637 + BUFFER_TRACE(descriptor, "ph3: file as descriptor"); 638 + jbd2_file_log_bh(&log_bufs, descriptor); 646 639 } 647 640 648 641 /* Where is the buffer to be written? */ ··· 665 658 666 659 /* Bump b_count to prevent truncate from stumbling over 667 660 the shadowed buffer! @@@ This can go if we ever get 668 - rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 661 + rid of the shadow pairing of buffers. */ 669 662 atomic_inc(&jh2bh(jh)->b_count); 670 663 671 - /* Make a temporary IO buffer with which to write it out 672 - (this will requeue both the metadata buffer and the 673 - temporary IO buffer). 
new_bh goes on BJ_IO*/ 674 - 675 - set_bit(BH_JWrite, &jh2bh(jh)->b_state); 676 664 /* 677 - * akpm: jbd2_journal_write_metadata_buffer() sets 678 - * new_bh->b_transaction to commit_transaction. 679 - * We need to clean this up before we release new_bh 680 - * (which is of type BJ_IO) 665 + * Make a temporary IO buffer with which to write it out 666 + * (this will requeue the metadata buffer to BJ_Shadow). 681 667 */ 668 + set_bit(BH_JWrite, &jh2bh(jh)->b_state); 682 669 JBUFFER_TRACE(jh, "ph3: write metadata"); 683 670 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 684 - jh, &new_jh, blocknr); 671 + jh, &wbuf[bufs], blocknr); 685 672 if (flags < 0) { 686 673 jbd2_journal_abort(journal, flags); 687 674 continue; 688 675 } 689 - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 690 - wbuf[bufs++] = jh2bh(new_jh); 676 + jbd2_file_log_bh(&io_bufs, wbuf[bufs]); 691 677 692 678 /* Record the new block's tag in the current descriptor 693 679 buffer */ ··· 694 694 tag = (journal_block_tag_t *) tagp; 695 695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 696 696 tag->t_flags = cpu_to_be16(tag_flag); 697 - jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), 697 + jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 698 698 commit_transaction->t_tid); 699 699 tagp += tag_bytes; 700 700 space_left -= tag_bytes; 701 + bufs++; 701 702 702 703 if (first_tag) { 703 704 memcpy (tagp, journal->j_uuid, 16); ··· 810 809 the log. Before we can commit it, wait for the IO so far to 811 810 complete. Control buffers being written are on the 812 811 transaction's t_log_list queue, and metadata buffers are on 813 - the t_iobuf_list queue. 812 + the io_bufs list. 814 813 815 814 Wait for the buffers in reverse order. That way we are 816 815 less likely to be woken up until all IOs have completed, and ··· 819 818 820 819 jbd_debug(3, "JBD2: commit phase 3\n"); 821 820 822 - /* 823 - * akpm: these are BJ_IO, and j_list_lock is not needed. 
824 - * See __journal_try_to_free_buffer. 825 - */ 826 - wait_for_iobuf: 827 - while (commit_transaction->t_iobuf_list != NULL) { 828 - struct buffer_head *bh; 821 + while (!list_empty(&io_bufs)) { 822 + struct buffer_head *bh = list_entry(io_bufs.prev, 823 + struct buffer_head, 824 + b_assoc_buffers); 829 825 830 - jh = commit_transaction->t_iobuf_list->b_tprev; 831 - bh = jh2bh(jh); 832 - if (buffer_locked(bh)) { 833 - wait_on_buffer(bh); 834 - goto wait_for_iobuf; 835 - } 836 - if (cond_resched()) 837 - goto wait_for_iobuf; 826 + wait_on_buffer(bh); 827 + cond_resched(); 838 828 839 829 if (unlikely(!buffer_uptodate(bh))) 840 830 err = -EIO; 841 - 842 - clear_buffer_jwrite(bh); 843 - 844 - JBUFFER_TRACE(jh, "ph4: unfile after journal write"); 845 - jbd2_journal_unfile_buffer(journal, jh); 831 + jbd2_unfile_log_bh(bh); 846 832 847 833 /* 848 - * ->t_iobuf_list should contain only dummy buffer_heads 849 - * which were created by jbd2_journal_write_metadata_buffer(). 834 + * The list contains temporary buffer heads created by 835 + * jbd2_journal_write_metadata_buffer(). 850 836 */ 851 837 BUFFER_TRACE(bh, "dumping temporary bh"); 852 - jbd2_journal_put_journal_head(jh); 853 838 __brelse(bh); 854 839 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 855 840 free_buffer_head(bh); 856 841 857 - /* We also have to unlock and free the corresponding 858 - shadowed buffer */ 842 + /* We also have to refile the corresponding shadowed buffer */ 859 843 jh = commit_transaction->t_shadow_list->b_tprev; 860 844 bh = jh2bh(jh); 861 - clear_bit(BH_JWrite, &bh->b_state); 845 + clear_buffer_jwrite(bh); 862 846 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 847 + J_ASSERT_BH(bh, !buffer_shadow(bh)); 863 848 864 849 /* The metadata is now released for reuse, but we need 865 850 to remember it against this transaction so that when ··· 853 866 required. 
*/ 854 867 JBUFFER_TRACE(jh, "file as BJ_Forget"); 855 868 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 856 - /* 857 - * Wake up any transactions which were waiting for this IO to 858 - * complete. The barrier must be here so that changes by 859 - * jbd2_journal_file_buffer() take effect before wake_up_bit() 860 - * does the waitqueue check. 861 - */ 862 - smp_mb(); 863 - wake_up_bit(&bh->b_state, BH_Unshadow); 864 869 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 865 870 __brelse(bh); 866 871 } ··· 862 883 jbd_debug(3, "JBD2: commit phase 4\n"); 863 884 864 885 /* Here we wait for the revoke record and descriptor record buffers */ 865 - wait_for_ctlbuf: 866 - while (commit_transaction->t_log_list != NULL) { 886 + while (!list_empty(&log_bufs)) { 867 887 struct buffer_head *bh; 868 888 869 - jh = commit_transaction->t_log_list->b_tprev; 870 - bh = jh2bh(jh); 871 - if (buffer_locked(bh)) { 872 - wait_on_buffer(bh); 873 - goto wait_for_ctlbuf; 874 - } 875 - if (cond_resched()) 876 - goto wait_for_ctlbuf; 889 + bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); 890 + wait_on_buffer(bh); 891 + cond_resched(); 877 892 878 893 if (unlikely(!buffer_uptodate(bh))) 879 894 err = -EIO; 880 895 881 896 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 882 897 clear_buffer_jwrite(bh); 883 - jbd2_journal_unfile_buffer(journal, jh); 884 - jbd2_journal_put_journal_head(jh); 898 + jbd2_unfile_log_bh(bh); 885 899 __brelse(bh); /* One for getblk */ 886 900 /* AKPM: bforget here */ 887 901 } ··· 924 952 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 925 953 J_ASSERT(commit_transaction->t_buffers == NULL); 926 954 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 927 - J_ASSERT(commit_transaction->t_iobuf_list == NULL); 928 955 J_ASSERT(commit_transaction->t_shadow_list == NULL); 929 - J_ASSERT(commit_transaction->t_log_list == NULL); 930 956 931 957 restart_loop: 932 958 /*
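The commit-path hunks above replace the old BJ_IO/BJ_LogCtl journal-head lists with plain buffer_head lists (`io_bufs`, `log_bufs`) that are filed with `jbd2_file_log_bh()` and drained via `list_entry(head.prev, ...)`. The following is a minimal userspace sketch of that pattern only; `list_init`/`list_add_tail` are stand-ins for the kernel's `list.h`, and the `buffer_head` here is a hypothetical miniature with just the fields the sketch needs.

```c
#include <assert.h>
#include <stddef.h>

/* Minimal stand-ins for the kernel's list.h. */
struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static int list_empty(const struct list_head *h) { return h->next == h; }

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Hypothetical miniature buffer_head: only what this sketch needs. */
struct buffer_head {
	unsigned long long b_blocknr;
	struct list_head b_assoc_buffers;
};

/* Analogous to jbd2_file_log_bh(): file a buffer on io_bufs/log_bufs. */
static void file_log_bh(struct list_head *list, struct buffer_head *bh)
{
	list_add_tail(&bh->b_assoc_buffers, list);
}

/* Analogous to the wait loops above: take buffers from the list tail
 * (head.prev), i.e. in reverse submission order. */
static struct buffer_head *pop_log_bh(struct list_head *list)
{
	struct buffer_head *bh;

	if (list_empty(list))
		return NULL;
	bh = list_entry(list->prev, struct buffer_head, b_assoc_buffers);
	list_del(&bh->b_assoc_buffers);
	return bh;
}

/* Demo: file three buffers, then drain; returns 1 iff they come back
 * in reverse submission order and the list ends up empty. */
static int demo_reverse_drain(void)
{
	struct list_head io_bufs;
	struct buffer_head a = { 1, { 0, 0 } };
	struct buffer_head b = { 2, { 0, 0 } };
	struct buffer_head c = { 3, { 0, 0 } };

	list_init(&io_bufs);
	file_log_bh(&io_bufs, &a);
	file_log_bh(&io_bufs, &b);
	file_log_bh(&io_bufs, &c);
	return pop_log_bh(&io_bufs)->b_blocknr == 3 &&
	       pop_log_bh(&io_bufs)->b_blocknr == 2 &&
	       pop_log_bh(&io_bufs)->b_blocknr == 1 &&
	       list_empty(&io_bufs);
}
```

Draining from `head.prev` waits on buffers in reverse submission order, which is the property the commit code relies on to be "less likely to be woken up until all IOs have completed".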
fs/jbd2/journal.c (+91 -75)
··· 103 103 static void __journal_abort_soft (journal_t *journal, int errno); 104 104 static int jbd2_journal_create_slab(size_t slab_size); 105 105 106 + #ifdef CONFIG_JBD2_DEBUG 107 + void __jbd2_debug(int level, const char *file, const char *func, 108 + unsigned int line, const char *fmt, ...) 109 + { 110 + struct va_format vaf; 111 + va_list args; 112 + 113 + if (level > jbd2_journal_enable_debug) 114 + return; 115 + va_start(args, fmt); 116 + vaf.fmt = fmt; 117 + vaf.va = &args; 118 + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); 119 + va_end(args); 120 + } 121 + EXPORT_SYMBOL(__jbd2_debug); 122 + #endif 123 + 106 124 /* Checksumming functions */ 107 125 int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 108 126 { ··· 328 310 * 329 311 * If the source buffer has already been modified by a new transaction 330 312 * since we took the last commit snapshot, we use the frozen copy of 331 - * that data for IO. If we end up using the existing buffer_head's data 332 - * for the write, then we *have* to lock the buffer to prevent anyone 333 - * else from using and possibly modifying it while the IO is in 334 - * progress. 313 + * that data for IO. If we end up using the existing buffer_head's data 314 + * for the write, then we have to make sure nobody modifies it while the 315 + * IO is in progress. do_get_write_access() handles this. 335 316 * 336 - * The function returns a pointer to the buffer_heads to be used for IO. 337 - * 338 - * We assume that the journal has already been locked in this function. 317 + * The function returns a pointer to the buffer_head to be used for IO. 
318 + * 339 319 * 340 320 * Return value: 341 321 * <0: Error ··· 346 330 347 331 int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 348 332 struct journal_head *jh_in, 349 - struct journal_head **jh_out, 350 - unsigned long long blocknr) 333 + struct buffer_head **bh_out, 334 + sector_t blocknr) 351 335 { 352 336 int need_copy_out = 0; 353 337 int done_copy_out = 0; 354 338 int do_escape = 0; 355 339 char *mapped_data; 356 340 struct buffer_head *new_bh; 357 - struct journal_head *new_jh; 358 341 struct page *new_page; 359 342 unsigned int new_offset; 360 343 struct buffer_head *bh_in = jh2bh(jh_in); ··· 383 368 384 369 /* keep subsequent assertions sane */ 385 370 atomic_set(&new_bh->b_count, 1); 386 - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ 387 371 372 + jbd_lock_bh_state(bh_in); 373 + repeat: 388 374 /* 389 375 * If a new transaction has already done a buffer copy-out, then 390 376 * we use that version of the data for the commit. 391 377 */ 392 - jbd_lock_bh_state(bh_in); 393 - repeat: 394 378 if (jh_in->b_frozen_data) { 395 379 done_copy_out = 1; 396 380 new_page = virt_to_page(jh_in->b_frozen_data); ··· 429 415 jbd_unlock_bh_state(bh_in); 430 416 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 431 417 if (!tmp) { 432 - jbd2_journal_put_journal_head(new_jh); 418 + brelse(new_bh); 433 419 return -ENOMEM; 434 420 } 435 421 jbd_lock_bh_state(bh_in); ··· 440 426 441 427 jh_in->b_frozen_data = tmp; 442 428 mapped_data = kmap_atomic(new_page); 443 - memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 429 + memcpy(tmp, mapped_data + new_offset, bh_in->b_size); 444 430 kunmap_atomic(mapped_data); 445 431 446 432 new_page = virt_to_page(tmp); ··· 466 452 } 467 453 468 454 set_bh_page(new_bh, new_page, new_offset); 469 - new_jh->b_transaction = NULL; 470 - new_bh->b_size = jh2bh(jh_in)->b_size; 471 - new_bh->b_bdev = transaction->t_journal->j_dev; 455 + new_bh->b_size = bh_in->b_size; 456 + new_bh->b_bdev = journal->j_dev; 
472 457 new_bh->b_blocknr = blocknr; 458 + new_bh->b_private = bh_in; 473 459 set_buffer_mapped(new_bh); 474 460 set_buffer_dirty(new_bh); 475 461 476 - *jh_out = new_jh; 462 + *bh_out = new_bh; 477 463 478 464 /* 479 465 * The to-be-written buffer needs to get moved to the io queue, ··· 484 470 spin_lock(&journal->j_list_lock); 485 471 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 486 472 spin_unlock(&journal->j_list_lock); 473 + set_buffer_shadow(bh_in); 487 474 jbd_unlock_bh_state(bh_in); 488 - 489 - JBUFFER_TRACE(new_jh, "file as BJ_IO"); 490 - jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); 491 475 492 476 return do_escape | (done_copy_out << 1); 493 477 } ··· 494 482 * Allocation code for the journal file. Manage the space left in the 495 483 * journal, so that we can begin checkpointing when appropriate. 496 484 */ 497 - 498 - /* 499 - * __jbd2_log_space_left: Return the number of free blocks left in the journal. 500 - * 501 - * Called with the journal already locked. 502 - * 503 - * Called under j_state_lock 504 - */ 505 - 506 - int __jbd2_log_space_left(journal_t *journal) 507 - { 508 - int left = journal->j_free; 509 - 510 - /* assert_spin_locked(&journal->j_state_lock); */ 511 - 512 - /* 513 - * Be pessimistic here about the number of those free blocks which 514 - * might be required for log descriptor control blocks. 515 - */ 516 - 517 - #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ 518 - 519 - left -= MIN_LOG_RESERVED_BLOCKS; 520 - 521 - if (left <= 0) 522 - return 0; 523 - left -= (left >> 3); 524 - return left; 525 - } 526 485 527 486 /* 528 487 * Called with j_state_lock locked for writing. ··· 547 564 } 548 565 549 566 /* 550 - * Force and wait upon a commit if the calling process is not within 551 - * transaction. This is used for forcing out undo-protected data which contains 552 - * bitmaps, when the fs is running out of space. 
553 - * 554 - * We can only force the running transaction if we don't have an active handle; 555 - * otherwise, we will deadlock. 556 - * 557 - * Returns true if a transaction was started. 567 + * Force and wait any uncommitted transactions. We can only force the running 568 + * transaction if we don't have an active handle, otherwise, we will deadlock. 569 + * Returns: <0 in case of error, 570 + * 0 if nothing to commit, 571 + * 1 if transaction was successfully committed. 558 572 */ 559 - int jbd2_journal_force_commit_nested(journal_t *journal) 573 + static int __jbd2_journal_force_commit(journal_t *journal) 560 574 { 561 575 transaction_t *transaction = NULL; 562 576 tid_t tid; 563 - int need_to_start = 0; 577 + int need_to_start = 0, ret = 0; 564 578 565 579 read_lock(&journal->j_state_lock); 566 580 if (journal->j_running_transaction && !current->journal_info) { ··· 568 588 transaction = journal->j_committing_transaction; 569 589 570 590 if (!transaction) { 591 + /* Nothing to commit */ 571 592 read_unlock(&journal->j_state_lock); 572 - return 0; /* Nothing to retry */ 593 + return 0; 573 594 } 574 - 575 595 tid = transaction->t_tid; 576 596 read_unlock(&journal->j_state_lock); 577 597 if (need_to_start) 578 598 jbd2_log_start_commit(journal, tid); 579 - jbd2_log_wait_commit(journal, tid); 580 - return 1; 599 + ret = jbd2_log_wait_commit(journal, tid); 600 + if (!ret) 601 + ret = 1; 602 + 603 + return ret; 604 + } 605 + 606 + /** 607 + * Force and wait upon a commit if the calling process is not within 608 + * transaction. This is used for forcing out undo-protected data which contains 609 + * bitmaps, when the fs is running out of space. 610 + * 611 + * @journal: journal to force 612 + * Returns true if progress was made. 
613 + */ 614 + int jbd2_journal_force_commit_nested(journal_t *journal) 615 + { 616 + int ret; 617 + 618 + ret = __jbd2_journal_force_commit(journal); 619 + return ret > 0; 620 + } 621 + 622 + /** 623 + * int journal_force_commit() - force any uncommitted transactions 624 + * @journal: journal to force 625 + * 626 + * Caller want unconditional commit. We can only force the running transaction 627 + * if we don't have an active handle, otherwise, we will deadlock. 628 + */ 629 + int jbd2_journal_force_commit(journal_t *journal) 630 + { 631 + int ret; 632 + 633 + J_ASSERT(!current->journal_info); 634 + ret = __jbd2_journal_force_commit(journal); 635 + if (ret > 0) 636 + ret = 0; 637 + return ret; 581 638 } 582 639 583 640 /* ··· 815 798 * But we don't bother doing that, so there will be coherency problems with 816 799 * mmaps of blockdevs which hold live JBD-controlled filesystems. 817 800 */ 818 - struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 801 + struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 819 802 { 820 803 struct buffer_head *bh; 821 804 unsigned long long blocknr; ··· 834 817 set_buffer_uptodate(bh); 835 818 unlock_buffer(bh); 836 819 BUFFER_TRACE(bh, "return this buffer"); 837 - return jbd2_journal_add_journal_head(bh); 820 + return bh; 838 821 } 839 822 840 823 /* ··· 1079 1062 return NULL; 1080 1063 1081 1064 init_waitqueue_head(&journal->j_wait_transaction_locked); 1082 - init_waitqueue_head(&journal->j_wait_logspace); 1083 1065 init_waitqueue_head(&journal->j_wait_done_commit); 1084 - init_waitqueue_head(&journal->j_wait_checkpoint); 1085 1066 init_waitqueue_head(&journal->j_wait_commit); 1086 1067 init_waitqueue_head(&journal->j_wait_updates); 1068 + init_waitqueue_head(&journal->j_wait_reserved); 1087 1069 mutex_init(&journal->j_barrier); 1088 1070 mutex_init(&journal->j_checkpoint_mutex); 1089 1071 spin_lock_init(&journal->j_revoke_lock); ··· 1092 1076 journal->j_commit_interval = (HZ * 
JBD2_DEFAULT_MAX_COMMIT_AGE); 1093 1077 journal->j_min_batch_time = 0; 1094 1078 journal->j_max_batch_time = 15000; /* 15ms */ 1079 + atomic_set(&journal->j_reserved_credits, 0); 1095 1080 1096 1081 /* The journal is marked for error until we succeed with recovery! */ 1097 1082 journal->j_flags = JBD2_ABORT; ··· 1335 1318 static void jbd2_write_superblock(journal_t *journal, int write_op) 1336 1319 { 1337 1320 struct buffer_head *bh = journal->j_sb_buffer; 1321 + journal_superblock_t *sb = journal->j_superblock; 1338 1322 int ret; 1339 1323 1340 1324 trace_jbd2_write_superblock(journal, write_op); ··· 1357 1339 clear_buffer_write_io_error(bh); 1358 1340 set_buffer_uptodate(bh); 1359 1341 } 1342 + jbd2_superblock_csum_set(journal, sb); 1360 1343 get_bh(bh); 1361 1344 bh->b_end_io = end_buffer_write_sync; 1362 1345 ret = submit_bh(write_op, bh); ··· 1454 1435 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1455 1436 journal->j_errno); 1456 1437 sb->s_errno = cpu_to_be32(journal->j_errno); 1457 - jbd2_superblock_csum_set(journal, sb); 1458 1438 read_unlock(&journal->j_state_lock); 1459 1439 1460 1440 jbd2_write_superblock(journal, WRITE_SYNC); ··· 2343 2325 #ifdef CONFIG_JBD2_DEBUG 2344 2326 atomic_inc(&nr_journal_heads); 2345 2327 #endif 2346 - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2328 + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); 2347 2329 if (!ret) { 2348 2330 jbd_debug(1, "out of memory for journal_head\n"); 2349 2331 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2350 2332 while (!ret) { 2351 2333 yield(); 2352 - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2334 + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); 2353 2335 } 2354 2336 } 2355 2337 return ret; ··· 2411 2393 struct journal_head *new_jh = NULL; 2412 2394 2413 2395 repeat: 2414 - if (!buffer_jbd(bh)) { 2396 + if (!buffer_jbd(bh)) 2415 2397 new_jh = journal_alloc_journal_head(); 2416 - memset(new_jh, 0, 
sizeof(*new_jh)); 2417 - } 2418 2398 2419 2399 jbd_lock_bh_journal_head(bh); 2420 2400 if (buffer_jbd(bh)) {
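Among the journal.c changes above, `jbd2_journal_force_commit_nested()` and the new `jbd2_journal_force_commit()` are refactored around one shared helper whose return value (<0 error, 0 nothing to commit, 1 committed) each caller reinterprets. A toy model of that return-value convention, using a hypothetical stub `journal` in place of the real locking and tid tracking:

```c
#include <assert.h>

/* Hypothetical stub journal: just enough state for the sketch. */
struct journal {
	int running;	/* is there a running transaction? */
	int in_handle;	/* does the caller hold an active handle? */
	int io_error;	/* simulate failure while waiting for commit */
};

/* Models __jbd2_journal_force_commit(): <0 error, 0 nothing to
 * commit, 1 transaction successfully committed. */
static int force_commit_common(struct journal *j)
{
	/* We can only force the running transaction if we don't have
	 * an active handle, otherwise we would deadlock. */
	if (j->in_handle || !j->running)
		return 0;
	j->running = 0;			/* "commit" it */
	return j->io_error ? -5 /* -EIO */ : 1;
}

/* Models jbd2_journal_force_commit_nested(): true iff progress made. */
static int force_commit_nested(struct journal *j)
{
	return force_commit_common(j) > 0;
}

/* Models jbd2_journal_force_commit(): caller wants an unconditional
 * commit; success (1) is folded into 0, errors pass through. */
static int force_commit(struct journal *j)
{
	int ret = force_commit_common(j);

	return ret > 0 ? 0 : ret;
}
```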
fs/jbd2/recovery.c (+5 -6)
··· 399 399 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 400 400 void *buf, __u32 sequence) 401 401 { 402 - __u32 provided, calculated; 402 + __u32 csum32; 403 403 404 404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 405 405 return 1; 406 406 407 407 sequence = cpu_to_be32(sequence); 408 - calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 409 - sizeof(sequence)); 410 - calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); 411 - provided = be32_to_cpu(tag->t_checksum); 408 + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 409 + sizeof(sequence)); 410 + csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 412 411 413 - return provided == cpu_to_be32(calculated); 412 + return tag->t_checksum == cpu_to_be16(csum32); 414 413 } 415 414 416 415 static int do_one_pass(journal_t *journal,
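The recovery.c hunk above fixes a width mismatch: the tag's `t_checksum` field holds only 16 bits on disk, so the verifier must compare the stored big-endian 16-bit value against the low 16 bits of the computed crc32, as `cpu_to_be16(csum32)` does in the new code. A userspace sketch of that truncating comparison, using explicit big-endian bytes to stay host-endian neutral (the struct and helper names are hypothetical, and no real crc32c is computed):

```c
#include <assert.h>
#include <stdint.h>

/* On-disk tag checksum: two bytes, big-endian. */
struct tag { uint8_t t_checksum[2]; };

/* Like cpu_to_be16(csum32): only the low 16 bits survive. */
static void tag_csum_set(struct tag *t, uint32_t csum32)
{
	t->t_checksum[0] = (uint8_t)((csum32 >> 8) & 0xff);
	t->t_checksum[1] = (uint8_t)(csum32 & 0xff);
}

/* Like the fixed verifier: compare the on-disk big-endian 16-bit
 * field directly against the truncated 32-bit checksum. */
static int tag_csum_verify(const struct tag *t, uint32_t csum32)
{
	return t->t_checksum[0] == ((csum32 >> 8) & 0xff) &&
	       t->t_checksum[1] == (csum32 & 0xff);
}
```

Note that two checksums differing only above bit 15 verify identically: that is exactly the 16-bit truncation the fix makes explicit.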
fs/jbd2/revoke.c (+24 -25)
··· 122 122 123 123 #ifdef __KERNEL__ 124 124 static void write_one_revoke_record(journal_t *, transaction_t *, 125 - struct journal_head **, int *, 125 + struct list_head *, 126 + struct buffer_head **, int *, 126 127 struct jbd2_revoke_record_s *, int); 127 - static void flush_descriptor(journal_t *, struct journal_head *, int, int); 128 + static void flush_descriptor(journal_t *, struct buffer_head *, int, int); 128 129 #endif 129 130 130 131 /* Utility functions to maintain the revoke table */ ··· 532 531 */ 533 532 void jbd2_journal_write_revoke_records(journal_t *journal, 534 533 transaction_t *transaction, 534 + struct list_head *log_bufs, 535 535 int write_op) 536 536 { 537 - struct journal_head *descriptor; 537 + struct buffer_head *descriptor; 538 538 struct jbd2_revoke_record_s *record; 539 539 struct jbd2_revoke_table_s *revoke; 540 540 struct list_head *hash_list; ··· 555 553 while (!list_empty(hash_list)) { 556 554 record = (struct jbd2_revoke_record_s *) 557 555 hash_list->next; 558 - write_one_revoke_record(journal, transaction, 556 + write_one_revoke_record(journal, transaction, log_bufs, 559 557 &descriptor, &offset, 560 558 record, write_op); 561 559 count++; ··· 575 573 576 574 static void write_one_revoke_record(journal_t *journal, 577 575 transaction_t *transaction, 578 - struct journal_head **descriptorp, 576 + struct list_head *log_bufs, 577 + struct buffer_head **descriptorp, 579 578 int *offsetp, 580 579 struct jbd2_revoke_record_s *record, 581 580 int write_op) 582 581 { 583 582 int csum_size = 0; 584 - struct journal_head *descriptor; 583 + struct buffer_head *descriptor; 585 584 int offset; 586 585 journal_header_t *header; 587 586 ··· 612 609 descriptor = jbd2_journal_get_descriptor_buffer(journal); 613 610 if (!descriptor) 614 611 return; 615 - header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 612 + header = (journal_header_t *)descriptor->b_data; 616 613 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 617 614 
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); 618 615 header->h_sequence = cpu_to_be32(transaction->t_tid); 619 616 620 617 /* Record it so that we can wait for IO completion later */ 621 - JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 622 - jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); 618 + BUFFER_TRACE(descriptor, "file in log_bufs"); 619 + jbd2_file_log_bh(log_bufs, descriptor); 623 620 624 621 offset = sizeof(jbd2_journal_revoke_header_t); 625 622 *descriptorp = descriptor; 626 623 } 627 624 628 625 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { 629 - * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = 626 + * ((__be64 *)(&descriptor->b_data[offset])) = 630 627 cpu_to_be64(record->blocknr); 631 628 offset += 8; 632 629 633 630 } else { 634 - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 631 + * ((__be32 *)(&descriptor->b_data[offset])) = 635 632 cpu_to_be32(record->blocknr); 636 633 offset += 4; 637 634 } ··· 639 636 *offsetp = offset; 640 637 } 641 638 642 - static void jbd2_revoke_csum_set(journal_t *j, 643 - struct journal_head *descriptor) 639 + static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) 644 640 { 645 641 struct jbd2_journal_revoke_tail *tail; 646 642 __u32 csum; ··· 647 645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 648 646 return; 649 647 650 - tail = (struct jbd2_journal_revoke_tail *) 651 - (jh2bh(descriptor)->b_data + j->j_blocksize - 648 + tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - 652 649 sizeof(struct jbd2_journal_revoke_tail)); 653 650 tail->r_checksum = 0; 654 - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 655 - j->j_blocksize); 651 + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); 656 652 tail->r_checksum = cpu_to_be32(csum); 657 653 } 658 654 ··· 662 662 */ 663 663 664 664 static void flush_descriptor(journal_t *journal, 665 - struct journal_head *descriptor, 665 + 
struct buffer_head *descriptor, 666 666 int offset, int write_op) 667 667 { 668 668 jbd2_journal_revoke_header_t *header; 669 - struct buffer_head *bh = jh2bh(descriptor); 670 669 671 670 if (is_journal_aborted(journal)) { 672 - put_bh(bh); 671 + put_bh(descriptor); 673 672 return; 674 673 } 675 674 676 - header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 675 + header = (jbd2_journal_revoke_header_t *)descriptor->b_data; 677 676 header->r_count = cpu_to_be32(offset); 678 677 jbd2_revoke_csum_set(journal, descriptor); 679 678 680 - set_buffer_jwrite(bh); 681 - BUFFER_TRACE(bh, "write"); 682 - set_buffer_dirty(bh); 683 - write_dirty_buffer(bh, write_op); 679 + set_buffer_jwrite(descriptor); 680 + BUFFER_TRACE(descriptor, "write"); 681 + set_buffer_dirty(descriptor); 682 + write_dirty_buffer(descriptor, write_op); 684 683 } 685 684 #endif 686 685
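As the `write_one_revoke_record()` hunks above show, each revoked block number is packed into the descriptor buffer at an advancing offset: 8 big-endian bytes when the 64BIT incompat feature is set, otherwise 4. A hedged userspace sketch of just that packing step (function names are hypothetical, and `desc` stands in for the descriptor's `b_data`):

```c
#include <assert.h>
#include <stdint.h>

/* Store the low `bytes` bytes of v in big-endian order at p. */
static void put_be(uint8_t *p, uint64_t v, int bytes)
{
	int i;

	for (i = bytes - 1; i >= 0; i--) {
		p[i] = (uint8_t)(v & 0xff);
		v >>= 8;
	}
}

/* Record one revoked block number; returns the new offset, which the
 * caller stores back (cf. *offsetp in write_one_revoke_record()). */
static int record_revoke(uint8_t *desc, int offset, uint64_t blocknr,
			 int feature_64bit)
{
	int sz = feature_64bit ? 8 : 4;

	put_be(desc + offset, blocknr, sz);
	return offset + sz;
}
```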
fs/jbd2/transaction.c (+326 -200)
··· 89 89 transaction->t_expires = jiffies + journal->j_commit_interval; 90 90 spin_lock_init(&transaction->t_handle_lock); 91 91 atomic_set(&transaction->t_updates, 0); 92 - atomic_set(&transaction->t_outstanding_credits, 0); 92 + atomic_set(&transaction->t_outstanding_credits, 93 + atomic_read(&journal->j_reserved_credits)); 93 94 atomic_set(&transaction->t_handle_count, 0); 94 95 INIT_LIST_HEAD(&transaction->t_inode_list); 95 96 INIT_LIST_HEAD(&transaction->t_private_list); ··· 142 141 } 143 142 144 143 /* 144 + * Wait until running transaction passes T_LOCKED state. Also starts the commit 145 + * if needed. The function expects running transaction to exist and releases 146 + * j_state_lock. 147 + */ 148 + static void wait_transaction_locked(journal_t *journal) 149 + __releases(journal->j_state_lock) 150 + { 151 + DEFINE_WAIT(wait); 152 + int need_to_start; 153 + tid_t tid = journal->j_running_transaction->t_tid; 154 + 155 + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 156 + TASK_UNINTERRUPTIBLE); 157 + need_to_start = !tid_geq(journal->j_commit_request, tid); 158 + read_unlock(&journal->j_state_lock); 159 + if (need_to_start) 160 + jbd2_log_start_commit(journal, tid); 161 + schedule(); 162 + finish_wait(&journal->j_wait_transaction_locked, &wait); 163 + } 164 + 165 + static void sub_reserved_credits(journal_t *journal, int blocks) 166 + { 167 + atomic_sub(blocks, &journal->j_reserved_credits); 168 + wake_up(&journal->j_wait_reserved); 169 + } 170 + 171 + /* 172 + * Wait until we can add credits for handle to the running transaction. Called 173 + * with j_state_lock held for reading. Returns 0 if handle joined the running 174 + * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and 175 + * caller must retry. 
176 + */ 177 + static int add_transaction_credits(journal_t *journal, int blocks, 178 + int rsv_blocks) 179 + { 180 + transaction_t *t = journal->j_running_transaction; 181 + int needed; 182 + int total = blocks + rsv_blocks; 183 + 184 + /* 185 + * If the current transaction is locked down for commit, wait 186 + * for the lock to be released. 187 + */ 188 + if (t->t_state == T_LOCKED) { 189 + wait_transaction_locked(journal); 190 + return 1; 191 + } 192 + 193 + /* 194 + * If there is not enough space left in the log to write all 195 + * potential buffers requested by this operation, we need to 196 + * stall pending a log checkpoint to free some more log space. 197 + */ 198 + needed = atomic_add_return(total, &t->t_outstanding_credits); 199 + if (needed > journal->j_max_transaction_buffers) { 200 + /* 201 + * If the current transaction is already too large, 202 + * then start to commit it: we can then go back and 203 + * attach this handle to a new transaction. 204 + */ 205 + atomic_sub(total, &t->t_outstanding_credits); 206 + wait_transaction_locked(journal); 207 + return 1; 208 + } 209 + 210 + /* 211 + * The commit code assumes that it can get enough log space 212 + * without forcing a checkpoint. This is *critical* for 213 + * correctness: a checkpoint of a buffer which is also 214 + * associated with a committing transaction creates a deadlock, 215 + * so commit simply cannot force through checkpoints. 216 + * 217 + * We must therefore ensure the necessary space in the journal 218 + * *before* starting to dirty potentially checkpointed buffers 219 + * in the new transaction. 
220 + */ 221 + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { 222 + atomic_sub(total, &t->t_outstanding_credits); 223 + read_unlock(&journal->j_state_lock); 224 + write_lock(&journal->j_state_lock); 225 + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) 226 + __jbd2_log_wait_for_space(journal); 227 + write_unlock(&journal->j_state_lock); 228 + return 1; 229 + } 230 + 231 + /* No reservation? We are done... */ 232 + if (!rsv_blocks) 233 + return 0; 234 + 235 + needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); 236 + /* We allow at most half of a transaction to be reserved */ 237 + if (needed > journal->j_max_transaction_buffers / 2) { 238 + sub_reserved_credits(journal, rsv_blocks); 239 + atomic_sub(total, &t->t_outstanding_credits); 240 + read_unlock(&journal->j_state_lock); 241 + wait_event(journal->j_wait_reserved, 242 + atomic_read(&journal->j_reserved_credits) + rsv_blocks 243 + <= journal->j_max_transaction_buffers / 2); 244 + return 1; 245 + } 246 + return 0; 247 + } 248 + 249 + /* 145 250 * start_this_handle: Given a handle, deal with any locking or stalling 146 251 * needed to make sure that there is enough journal space for the handle 147 252 * to begin. 
Attach the handle to a transaction and set up the ··· 258 151 gfp_t gfp_mask) 259 152 { 260 153 transaction_t *transaction, *new_transaction = NULL; 261 - tid_t tid; 262 - int needed, need_to_start; 263 - int nblocks = handle->h_buffer_credits; 154 + int blocks = handle->h_buffer_credits; 155 + int rsv_blocks = 0; 264 156 unsigned long ts = jiffies; 265 157 266 - if (nblocks > journal->j_max_transaction_buffers) { 158 + /* 159 + * 1/2 of transaction can be reserved so we can practically handle 160 + * only 1/2 of maximum transaction size per operation 161 + */ 162 + if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { 267 163 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 268 - current->comm, nblocks, 269 - journal->j_max_transaction_buffers); 164 + current->comm, blocks, 165 + journal->j_max_transaction_buffers / 2); 270 166 return -ENOSPC; 271 167 } 168 + 169 + if (handle->h_rsv_handle) 170 + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; 272 171 273 172 alloc_transaction: 274 173 if (!journal->j_running_transaction) { ··· 312 199 return -EROFS; 313 200 } 314 201 315 - /* Wait on the journal's transaction barrier if necessary */ 316 - if (journal->j_barrier_count) { 202 + /* 203 + * Wait on the journal's transaction barrier if necessary. Specifically 204 + * we allow reserved handles to proceed because otherwise commit could 205 + * deadlock on page writeback not being able to complete. 
206 + */ 207 + if (!handle->h_reserved && journal->j_barrier_count) { 317 208 read_unlock(&journal->j_state_lock); 318 209 wait_event(journal->j_wait_transaction_locked, 319 210 journal->j_barrier_count == 0); ··· 330 213 goto alloc_transaction; 331 214 write_lock(&journal->j_state_lock); 332 215 if (!journal->j_running_transaction && 333 - !journal->j_barrier_count) { 216 + (handle->h_reserved || !journal->j_barrier_count)) { 334 217 jbd2_get_transaction(journal, new_transaction); 335 218 new_transaction = NULL; 336 219 } ··· 340 223 341 224 transaction = journal->j_running_transaction; 342 225 343 - /* 344 - * If the current transaction is locked down for commit, wait for the 345 - * lock to be released. 346 - */ 347 - if (transaction->t_state == T_LOCKED) { 348 - DEFINE_WAIT(wait); 349 - 350 - prepare_to_wait(&journal->j_wait_transaction_locked, 351 - &wait, TASK_UNINTERRUPTIBLE); 352 - read_unlock(&journal->j_state_lock); 353 - schedule(); 354 - finish_wait(&journal->j_wait_transaction_locked, &wait); 355 - goto repeat; 356 - } 357 - 358 - /* 359 - * If there is not enough space left in the log to write all potential 360 - * buffers requested by this operation, we need to stall pending a log 361 - * checkpoint to free some more log space. 362 - */ 363 - needed = atomic_add_return(nblocks, 364 - &transaction->t_outstanding_credits); 365 - 366 - if (needed > journal->j_max_transaction_buffers) { 226 + if (!handle->h_reserved) { 227 + /* We may have dropped j_state_lock - restart in that case */ 228 + if (add_transaction_credits(journal, blocks, rsv_blocks)) 229 + goto repeat; 230 + } else { 367 231 /* 368 - * If the current transaction is already too large, then start 369 - * to commit it: we can then go back and attach this handle to 370 - * a new transaction. 232 + * We have handle reserved so we are allowed to join T_LOCKED 233 + * transaction and we don't have to check for transaction size 234 + * and journal space. 
 	 */
-		DEFINE_WAIT(wait);
-
-		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
-				TASK_UNINTERRUPTIBLE);
-		tid = transaction->t_tid;
-		need_to_start = !tid_geq(journal->j_commit_request, tid);
-		read_unlock(&journal->j_state_lock);
-		if (need_to_start)
-			jbd2_log_start_commit(journal, tid);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * The commit code assumes that it can get enough log space
-	 * without forcing a checkpoint.  This is *critical* for
-	 * correctness: a checkpoint of a buffer which is also
-	 * associated with a committing transaction creates a deadlock,
-	 * so commit simply cannot force through checkpoints.
-	 *
-	 * We must therefore ensure the necessary space in the journal
-	 * *before* starting to dirty potentially checkpointed buffers
-	 * in the new transaction.
-	 *
-	 * The worst part is, any transaction currently committing can
-	 * reduce the free space arbitrarily.  Be careful to account for
-	 * those buffers when checkpointing.
-	 */
-
-	/*
-	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
-	 * a _lot_ of headroom: 1/4 of the journal plus the size of
-	 * the committing transaction.  Really, we only need to give it
-	 * committing_transaction->t_outstanding_credits plus "enough" for
-	 * the log control blocks.
-	 * Also, this test is inconsistent with the matching one in
-	 * jbd2_journal_extend().
-	 */
-	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
-		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		read_unlock(&journal->j_state_lock);
-		write_lock(&journal->j_state_lock);
-		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
-			__jbd2_log_wait_for_space(journal);
-		write_unlock(&journal->j_state_lock);
-		goto repeat;
+		sub_reserved_credits(journal, blocks);
+		handle->h_reserved = 0;
 	}

 	/* OK, account for the buffers that this operation expects to
···
 	 */
 	update_t_max_wait(transaction, ts);
 	handle->h_transaction = transaction;
-	handle->h_requested_credits = nblocks;
+	handle->h_requested_credits = blocks;
 	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks,
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
-		  __jbd2_log_space_left(journal));
+		  jbd2_log_space_left(journal));
 	read_unlock(&journal->j_state_lock);
+	current->journal_info = handle;

 	lock_map_acquire(&handle->h_lockdep_map);
 	jbd2_journal_free_transaction(new_transaction);
···
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
 */
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
-			      unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+			      gfp_t gfp_mask, unsigned int type,
+			      unsigned int line_no)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
···
 	handle = new_handle(nblocks);
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
+	if (rsv_blocks) {
+		handle_t *rsv_handle;

-	current->journal_info = handle;
+		rsv_handle = new_handle(rsv_blocks);
+		if (!rsv_handle) {
+			jbd2_free_handle(handle);
+			return ERR_PTR(-ENOMEM);
+		}
+		rsv_handle->h_reserved = 1;
+		rsv_handle->h_journal = journal;
+		handle->h_rsv_handle = rsv_handle;
+	}

 	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
+		if (handle->h_rsv_handle)
+			jbd2_free_handle(handle->h_rsv_handle);
 		jbd2_free_handle(handle);
-		current->journal_info = NULL;
 		return ERR_PTR(err);
 	}
 	handle->h_type = type;
···

 handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 {
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+	return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
 }
 EXPORT_SYMBOL(jbd2_journal_start);

+void jbd2_journal_free_reserved(handle_t *handle)
+{
+	journal_t *journal = handle->h_journal;
+
+	WARN_ON(!handle->h_reserved);
+	sub_reserved_credits(journal, handle->h_buffer_credits);
+	jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * no transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0)
+		jbd2_journal_free_reserved(handle);
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);

 /**
 * int jbd2_journal_extend() - extend buffer credits.
···
 int jbd2_journal_extend(handle_t *handle, int nblocks)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	int result;
 	int wanted;

-	result = -EIO;
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
+		return -EROFS;
+	journal = transaction->t_journal;

 	result = 1;

 	read_lock(&journal->j_state_lock);

 	/* Don't extend a locked-down transaction! */
-	if (handle->h_transaction->t_state != T_RUNNING) {
+	if (transaction->t_state != T_RUNNING) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}

 	spin_lock(&transaction->t_handle_lock);
-	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+	wanted = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);

 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}

-	if (wanted > __jbd2_log_space_left(journal)) {
+	if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+	    jbd2_log_space_left(journal)) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "insufficient log space\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}

 	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
-				 handle->h_transaction->t_tid,
+				 transaction->t_tid,
 				 handle->h_type, handle->h_line_no,
 				 handle->h_buffer_credits,
 				 nblocks);

 	handle->h_buffer_credits += nblocks;
 	handle->h_requested_credits += nblocks;
-	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	result = 0;

 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
···
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
 	read_unlock(&journal->j_state_lock);
-out:
 	return result;
 }

···
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
 */
 int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	tid_t tid;
 	int need_to_start, ret;

+	WARN_ON(!transaction);
 	/* If we've had an abort of any type, don't even think about
 	 * actually doing the restart! */
 	if (is_handle_aborted(handle))
 		return 0;
+	journal = transaction->t_journal;

 	/*
 	 * First unlink the handle from its current transaction, and start the
···
 	spin_lock(&transaction->t_handle_lock);
 	atomic_sub(handle->h_buffer_credits,
 		   &transaction->t_outstanding_credits);
+	if (handle->h_rsv_handle) {
+		sub_reserved_credits(journal,
+				     handle->h_rsv_handle->h_buffer_credits);
+	}
 	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
+	tid = transaction->t_tid;
 	spin_unlock(&transaction->t_handle_lock);
+	handle->h_transaction = NULL;
+	current->journal_info = NULL;

 	jbd_debug(2, "restarting handle %p\n", handle);
-	tid = transaction->t_tid;
 	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
···

 	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
+
+	/* Wait until there are no reserved handles */
+	if (atomic_read(&journal->j_reserved_credits)) {
+		write_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			   atomic_read(&journal->j_reserved_credits) == 0);
+		write_lock(&journal->j_state_lock);
+	}

 	/* Wait until there are no running updates */
 	while (1) {
···
 		bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }

+static int sleep_on_shadow_bh(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
···
			int force_copy)
 {
 	struct buffer_head *bh;
-	transaction_t *transaction;
+	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
 	int error;
 	char *frozen_buffer = NULL;
 	int need_copy = 0;
 	unsigned long start_lock, time_lock;

+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
 		return -EROFS;
-
-	transaction = handle->h_transaction;
 	journal = transaction->t_journal;

 	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
···
 	 * journaled.  If the primary copy is already going to
 	 * disk then we cannot do copy-out here. */

-	if (jh->b_jlist == BJ_Shadow) {
-		DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-		wait_queue_head_t *wqh;
-
-		wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+	if (buffer_shadow(bh)) {
 		JBUFFER_TRACE(jh, "on shadow: sleep");
 		jbd_unlock_bh_state(bh);
-		/* commit wakes up all shadow buffers after IO */
-		for ( ; ; ) {
-			prepare_to_wait(wqh, &wait.wait,
-					TASK_UNINTERRUPTIBLE);
-			if (jh->b_jlist != BJ_Shadow)
-				break;
-			schedule();
-		}
-		finish_wait(wqh, &wait.wait);
+		wait_on_bit(&bh->b_state, BH_Shadow,
+			    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
 		goto repeat;
 	}

-	/* Only do the copy if the currently-owning transaction
-	 * still needs it.  If it is on the Forget list, the
-	 * committing transaction is past that stage.  The
-	 * buffer had better remain locked during the kmalloc,
-	 * but that should be true --- we hold the journal lock
-	 * still and the buffer is already on the BUF_JOURNAL
-	 * list so won't be flushed.
+	/*
+	 * Only do the copy if the currently-owning transaction still
+	 * needs it. If buffer isn't on BJ_Metadata list, the
+	 * committing transaction is past that stage (here we use the
+	 * fact that BH_Shadow is set under bh_state lock together with
+	 * refiling to BJ_Shadow list and at this point we know the
+	 * buffer doesn't have BH_Shadow set).
 	 *
 	 * Subtle point, though: if this is a get_undo_access,
 	 * then we will be relying on the frozen_data to contain
 	 * the new value of the committed_data record after the
 	 * transaction, so we HAVE to force the frozen_data copy
-	 * in that case. */
-
-	if (jh->b_jlist != BJ_Forget || force_copy) {
+	 * in that case.
+	 */
+	if (jh->b_jlist == BJ_Metadata || force_copy) {
 		JBUFFER_TRACE(jh, "generate frozen data");
 		if (!frozen_buffer) {
 			JBUFFER_TRACE(jh, "allocate memory for buffer");
···
 int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;

 	jbd_debug(5, "journal_head %p\n", jh);
+	WARN_ON(!transaction);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
+	journal = transaction->t_journal;
 	err = 0;

 	JBUFFER_TRACE(jh, "entry");
···
 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh;
 	int ret = 0;

+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
+		return -EROFS;
+	journal = transaction->t_journal;
 	jh = jbd2_journal_grab_journal_head(bh);
 	if (!jh) {
 		ret = -EUCLEAN;
···

 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
 	spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
 	jbd_unlock_bh_state(bh);
···
 int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh;
 	int drop_reserve = 0;
 	int err = 0;
 	int was_modified = 0;
+
+	WARN_ON(!transaction);
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;

 	BUFFER_TRACE(bh, "entry");
···
 	 */
 	jh->b_modified = 0;

-	if (jh->b_transaction == handle->h_transaction) {
+	if (jh->b_transaction == transaction) {
 		J_ASSERT_JH(jh, !jh->b_frozen_data);

 		/* If we are forgetting a buffer which is already part
···
 int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int err, wait_for_commit = 0;
+	journal_t *journal;
+	int err = 0, wait_for_commit = 0;
 	tid_t tid;
 	pid_t pid;

+	if (!transaction)
+		goto free_and_exit;
+	journal = transaction->t_journal;

 	J_ASSERT(journal_current_handle() == handle);

 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else {
+	else
 		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-		err = 0;
-	}

 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
···

 	jbd_debug(4, "Handle %p going down\n", handle);
 	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
-				handle->h_transaction->t_tid,
+				transaction->t_tid,
				handle->h_type, handle->h_line_no,
				jiffies - handle->h_start_jiffies,
				handle->h_sync, handle->h_requested_credits,
···

 	lock_map_release(&handle->h_lockdep_map);

+	if (handle->h_rsv_handle)
+		jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
 	jbd2_free_handle(handle);
 	return err;
-}
-
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk.  May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
-	handle_t *handle;
-	int ret;
-
-	handle = jbd2_journal_start(journal, 1);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-	} else {
-		handle->h_sync = 1;
-		ret = jbd2_journal_stop(handle);
-	}
-	return ret;
 }

 /*
···
 * Remove a buffer from the appropriate transaction list.
 *
 * Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
- * of these pointers, it could go bad.  Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list.  If the caller is holding onto a copy of one of these
+ * pointers, it could go bad.  Generally the caller needs to re-read the
+ * pointer from the transaction_t.
 *
 * Called under j_list_lock.
 */
···
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
-		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
 		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
···
 * void jbd2_journal_invalidatepage()
 * @journal: journal to use for flush...
 * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  start of the range to invalidate
+ * @length:  length of the range to invalidate
 *
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
 */
 int jbd2_journal_invalidatepage(journal_t *journal,
				struct page *page,
-				unsigned long offset)
+				unsigned int offset,
+				unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 	int ret = 0;

···
 		BUG();
 	if (!page_has_buffers(page))
 		return 0;
+
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);

 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
···
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;

+		if (next_off > stop)
+			return 0;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
-			ret = journal_unmap_buffer(journal, bh, offset > 0);
+			ret = journal_unmap_buffer(journal, bh, partial_page);
 			unlock_buffer(bh);
 			if (ret < 0)
 				return ret;
···

 	} while (bh != head);

-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
···
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
-		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
 		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
···
 int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;

+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		return -EIO;
+		return -EROFS;
+	journal = transaction->t_journal;

 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
		  transaction->t_tid);
+3 -2
fs/jfs/jfs_metapage.c
···
 	return ret;
 }

-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
-	BUG_ON(offset);
+	BUG_ON(offset || length < PAGE_CACHE_SIZE);

 	BUG_ON(PageWriteback(page));
+2 -1
fs/logfs/file.c
···
 	return __logfs_writepage(page);
 }

-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct logfs_block *block = logfs_block(page);
+2 -1
fs/logfs/segment.c
···
 	return area;
 }

-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+			       unsigned int l)
 {
 	return;
 }
+5 -3
fs/nfs/file.c
···
 * - Called if either PG_private or PG_fscache is set on the page
 * - Caller holds page lock
 */
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+				unsigned int length)
 {
-	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+		 page, offset, length);

-	if (offset != 0)
+	if (offset != 0 || length < PAGE_CACHE_SIZE)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page_file_mapping(page)->host, page);
+1 -1
fs/ntfs/aops.c
···
 	 * The page may have dirty, unmapped buffers.  Make them
 	 * freeable here, so the page does not leak.
 	 */
-	block_invalidatepage(page, 0);
+	block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	unlock_page(page);
 	ntfs_debug("Write outside i_size - truncated?");
 	return 0;
+3 -2
fs/ocfs2/aops.c
···
 * from ext3.  PageChecked() bits have been removed as OCFS2 does not
 * do journalled data.
 */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;

-	jbd2_journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset, length);
 }

 static int ocfs2_releasepage(struct page *page, gfp_t wait)
+9 -3
fs/reiserfs/inode.c
···
 }

 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int ret = 1;

 	BUG_ON(!PageLocked(page));

-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);

 	if (!page_has_buffers(page))
···
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;

+		if (next_off > stop)
+			goto out;

 		/*
 		 * is this block fully invalidated?
···
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret) {
+	if (!partial_page && ret) {
 		ret = try_to_release_page(page, 0);
 		/* maybe should BUG_ON(!ret); - neilb */
 	}
+3 -2
fs/ubifs/file.c
···
 	return err;
 }

-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;

 	ubifs_assert(PagePrivate(page));
-	if (offset)
+	if (offset || length < PAGE_CACHE_SIZE)
 		/* Partial page remains dirty */
 		return;
+8 -6
fs/xfs/xfs_aops.c
···
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
-	unsigned long		offset)
+	unsigned int		offset,
+	unsigned int		length)
 {
-	trace_xfs_invalidatepage(page->mapping->host, page, offset);
-	block_invalidatepage(page, offset);
+	trace_xfs_invalidatepage(page->mapping->host, page, offset,
+				 length);
+	block_invalidatepage(page, offset, length);
 }

 /*
···
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_invalidate:
-	xfs_vm_invalidatepage(page, 0);
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	return;
 }
···
 	int			count = 0;
 	int			nonblocking = 0;

-	trace_xfs_writepage(inode, page, 0);
+	trace_xfs_writepage(inode, page, 0, 0);

 	ASSERT(page_has_buffers(page));
···
 {
 	int			delalloc, unwritten;

-	trace_xfs_releasepage(page->mapping->host, page, 0);
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

 	xfs_count_page_state(page, &delalloc, &unwritten);
+10 -5
fs/xfs/xfs_trace.h
···
 DEFINE_RW_EVENT(xfs_file_splice_write);

 DECLARE_EVENT_CLASS(xfs_page_class,
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-	TP_ARGS(inode, page, off),
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+		 unsigned int len),
+	TP_ARGS(inode, page, off, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(pgoff_t, pgoff)
 		__field(loff_t, size)
 		__field(unsigned long, offset)
+		__field(unsigned int, length)
 		__field(int, delalloc)
 		__field(int, unwritten)
 	),
···
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
+		__entry->length = len;
 		__entry->delalloc = delalloc;
 		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "delalloc %d unwritten %d",
+		  "length %x delalloc %d unwritten %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
+		  __entry->length,
 		  __entry->delalloc,
 		  __entry->unwritten)
 )

 #define DEFINE_PAGE_EVENT(name) \
 DEFINE_EVENT(xfs_page_class, name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off))
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+		 unsigned int len), \
+	TP_ARGS(inode, page, off, len))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
+2 -1
include/linux/buffer_head.h
···
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 */
-void block_invalidatepage(struct page *page, unsigned long offset);
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length);
 int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc);
 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+1 -1
include/linux/fs.h
···
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
-	void (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+26 -2
include/linux/jbd.h
···
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/lockdep.h>
···
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
 #include <linux/jbd_common.h>

 #define J_ASSERT(assert)	BUG_ON(!(assert))
···
 extern int	 journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
 extern void	 journal_invalidatepage(journal_t *,
-				struct page *, unsigned long);
+				struct page *, unsigned int, unsigned int);
 extern int	 journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 journal_stop(handle_t *);
 extern int	 journal_flush (journal_t *);
+119 -56
include/linux/jbd2.h
···
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
···
 */
 #define JBD2_EXPENSIVE_CHECKING
 extern ushort jbd2_journal_enable_debug;
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...);

-#define jbd_debug(n, f, a...)						\
-	do {								\
-		if ((n) <= jbd2_journal_enable_debug) {			\
-			printk (KERN_DEBUG "(%s, %d): %s: ",		\
-				__FILE__, __LINE__, __func__);		\
-			printk (f, ## a);				\
-		}							\
-	} while (0)
+#define jbd_debug(n, fmt, a...) \
+	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(f, a...)	/**/
+#define jbd_debug(n, fmt, a...)	/**/
 #endif

 extern void *jbd2_alloc(size_t size, gfp_t flags);
···

 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Shadow,		/* IO on shadow buffer is running */
+	BH_Verified,		/* Metadata block has been verified ok */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Shadow, shadow)
+BUFFER_FNS(Verified, verified)
+
 #include <linux/jbd_common.h>

 #define J_ASSERT(assert)	BUG_ON(!(assert))
···

 struct jbd2_journal_handle
 {
-	/* Which compound transaction is this update a part of? */
-	transaction_t		*h_transaction;
+	union {
+		/* Which compound transaction is this update a part of? */
+		transaction_t	*h_transaction;
+		/* Which journal handle belongs to - used iff h_reserved set */
+		journal_t	*h_journal;
+	};
+
+	/* Handle reserved for finishing the logical operation */
+	handle_t		*h_rsv_handle;

 	/* Number of remaining buffers we are allowed to dirty: */
 	int			h_buffer_credits;
···
 	/* Flags [no locking] */
 	unsigned int	h_sync:		1;	/* sync-on-close */
 	unsigned int	h_jdata:	1;	/* force data journaling */
+	unsigned int	h_reserved:	1;	/* handle with reserved credits */
 	unsigned int	h_aborted:	1;	/* fatal error on handle */
 	unsigned int	h_type:		8;	/* for handle statistics */
 	unsigned int	h_line_no:	16;	/* for handle statistics */
···
 	struct journal_head	*t_checkpoint_io_list;

 	/*
-	 * Doubly-linked circular list of temporary buffers currently undergoing
-	 * IO in the log [j_list_lock]
-	 */
-	struct journal_head	*t_iobuf_list;
-
-	/*
 	 * Doubly-linked circular list of metadata buffers being shadowed by log
 	 * IO.  The IO buffers on the iobuf list and the shadow buffers on this
 	 * list match each other one for one at all times. [j_list_lock]
 	 */
 	struct journal_head	*t_shadow_list;
-
-	/*
-	 * Doubly-linked circular list of control buffers being written to the
-	 * log. [j_list_lock]
-	 */
-	struct journal_head	*t_log_list;

 	/*
 	 * List of inodes whose data we've modified in data=ordered mode.
···
 *     waiting for checkpointing
 * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
 *	to start committing, or for a barrier lock to be released
- * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
 * @j_wait_done_commit: Wait queue for waiting for commit to complete
- * @j_wait_checkpoint:  Wait queue to trigger checkpointing
 * @j_wait_commit: Wait queue to trigger commit
 * @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_reserved: Wait queue to wait for reserved buffer credits to drop
 * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
 * @j_head: Journal head - identifies the first unused block in the journal
 * @j_tail: Journal tail - identifies the oldest still-used block in the
···
 *	journal
 * @j_fs_dev: Device which holds the client fs.  For internal journal this will
 *	be equal to j_dev
+ * @j_reserved_credits: Number of buffers reserved from the running transaction
 * @j_maxlen: Total maximum capacity of the journal region on disk.
 * @j_list_lock: Protects the buffer lists and internal buffer state.
 * @j_inode: Optional inode where we store the journal.
If present, all journal ··· 797 778 */ 798 779 wait_queue_head_t j_wait_transaction_locked; 799 780 800 - /* Wait queue for waiting for checkpointing to complete */ 801 - wait_queue_head_t j_wait_logspace; 802 - 803 781 /* Wait queue for waiting for commit to complete */ 804 782 wait_queue_head_t j_wait_done_commit; 805 - 806 - /* Wait queue to trigger checkpointing */ 807 - wait_queue_head_t j_wait_checkpoint; 808 783 809 784 /* Wait queue to trigger commit */ 810 785 wait_queue_head_t j_wait_commit; 811 786 812 787 /* Wait queue to wait for updates to complete */ 813 788 wait_queue_head_t j_wait_updates; 789 + 790 + /* Wait queue to wait for reserved buffer credits to drop */ 791 + wait_queue_head_t j_wait_reserved; 814 792 815 793 /* Semaphore for locking against concurrent checkpoints */ 816 794 struct mutex j_checkpoint_mutex; ··· 862 846 863 847 /* Total maximum capacity of the journal region on disk. */ 864 848 unsigned int j_maxlen; 849 + 850 + /* Number of buffers reserved from the running transaction */ 851 + atomic_t j_reserved_credits; 865 852 866 853 /* 867 854 * Protects the buffer lists and internal buffer state. 
··· 1010 991 extern void __journal_free_buffer(struct journal_head *bh); 1011 992 extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); 1012 993 extern void __journal_clean_data_list(transaction_t *transaction); 994 + static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) 995 + { 996 + list_add_tail(&bh->b_assoc_buffers, head); 997 + } 998 + static inline void jbd2_unfile_log_bh(struct buffer_head *bh) 999 + { 1000 + list_del_init(&bh->b_assoc_buffers); 1001 + } 1013 1002 1014 1003 /* Log buffer allocation */ 1015 - extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *); 1004 + struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); 1016 1005 int jbd2_journal_next_log_block(journal_t *, unsigned long long *); 1017 1006 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, 1018 1007 unsigned long *block); ··· 1066 1039 struct jbd2_buffer_trigger_type *triggers); 1067 1040 1068 1041 /* Buffer IO */ 1069 - extern int 1070 - jbd2_journal_write_metadata_buffer(transaction_t *transaction, 1071 - struct journal_head *jh_in, 1072 - struct journal_head **jh_out, 1073 - unsigned long long blocknr); 1042 + extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 1043 + struct journal_head *jh_in, 1044 + struct buffer_head **bh_out, 1045 + sector_t blocknr); 1074 1046 1075 1047 /* Transaction locking */ 1076 1048 extern void __wait_on_journal (journal_t *); ··· 1102 1076 */ 1103 1077 1104 1078 extern handle_t *jbd2_journal_start(journal_t *, int nblocks); 1105 - extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask, 1106 - unsigned int type, unsigned int line_no); 1079 + extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, 1080 + gfp_t gfp_mask, unsigned int type, 1081 + unsigned int line_no); 1107 1082 extern int jbd2_journal_restart(handle_t *, int nblocks); 1108 1083 extern int 
jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); 1084 + extern int jbd2_journal_start_reserved(handle_t *handle, 1085 + unsigned int type, unsigned int line_no); 1086 + extern void jbd2_journal_free_reserved(handle_t *handle); 1109 1087 extern int jbd2_journal_extend (handle_t *, int nblocks); 1110 1088 extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); 1111 1089 extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); ··· 1120 1090 extern int jbd2_journal_forget (handle_t *, struct buffer_head *); 1121 1091 extern void journal_sync_buffer (struct buffer_head *); 1122 1092 extern int jbd2_journal_invalidatepage(journal_t *, 1123 - struct page *, unsigned long); 1093 + struct page *, unsigned int, unsigned int); 1124 1094 extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); 1125 1095 extern int jbd2_journal_stop(handle_t *); 1126 1096 extern int jbd2_journal_flush (journal_t *); ··· 1155 1125 extern int jbd2_journal_clear_err (journal_t *); 1156 1126 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1157 1127 extern int jbd2_journal_force_commit(journal_t *); 1128 + extern int jbd2_journal_force_commit_nested(journal_t *); 1158 1129 extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); 1159 1130 extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, 1160 1131 struct jbd2_inode *inode, loff_t new_size); ··· 1209 1178 extern void jbd2_journal_destroy_revoke(journal_t *); 1210 1179 extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); 1211 1180 extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); 1212 - extern void jbd2_journal_write_revoke_records(journal_t *, 1213 - transaction_t *, int); 1181 + extern void jbd2_journal_write_revoke_records(journal_t *journal, 1182 + transaction_t *transaction, 1183 + struct list_head *log_bufs, 1184 + int write_op); 1214 1185 
1215 1186 /* Recovery revoke support */ 1216 1187 extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); ··· 1228 1195 * transitions on demand. 1229 1196 */ 1230 1197 1231 - int __jbd2_log_space_left(journal_t *); /* Called with journal locked */ 1232 1198 int jbd2_log_start_commit(journal_t *journal, tid_t tid); 1233 1199 int __jbd2_log_start_commit(journal_t *journal, tid_t tid); 1234 1200 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); 1235 - int jbd2_journal_force_commit_nested(journal_t *journal); 1236 1201 int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1237 1202 int jbd2_complete_transaction(journal_t *journal, tid_t tid); 1238 1203 int jbd2_log_do_checkpoint(journal_t *journal); ··· 1266 1235 1267 1236 static inline int is_handle_aborted(handle_t *handle) 1268 1237 { 1269 - if (handle->h_aborted) 1238 + if (handle->h_aborted || !handle->h_transaction) 1270 1239 return 1; 1271 1240 return is_journal_aborted(handle->h_transaction->t_journal); 1272 1241 } ··· 1297 1266 extern size_t journal_tag_bytes(journal_t *journal); 1298 1267 1299 1268 /* 1269 + * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for 1270 + * transaction control blocks. 1271 + */ 1272 + #define JBD2_CONTROL_BLOCKS_SHIFT 5 1273 + 1274 + /* 1300 1275 * Return the minimum number of blocks which must be free in the journal 1301 1276 * before a new transaction may be started. Must be called under j_state_lock. 1302 1277 */ 1303 - static inline int jbd_space_needed(journal_t *journal) 1278 + static inline int jbd2_space_needed(journal_t *journal) 1304 1279 { 1305 1280 int nblocks = journal->j_max_transaction_buffers; 1306 - if (journal->j_committing_transaction) 1307 - nblocks += atomic_read(&journal->j_committing_transaction-> 1308 - t_outstanding_credits); 1309 - return nblocks; 1281 + return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT); 1282 + } 1283 + 1284 + /* 1285 + * Return number of free blocks in the log. 
Must be called under j_state_lock. 1286 + */ 1287 + static inline unsigned long jbd2_log_space_left(journal_t *journal) 1288 + { 1289 + /* Allow for rounding errors */ 1290 + unsigned long free = journal->j_free - 32; 1291 + 1292 + if (journal->j_committing_transaction) { 1293 + unsigned long committing = atomic_read(&journal-> 1294 + j_committing_transaction->t_outstanding_credits); 1295 + 1296 + /* Transaction + control blocks */ 1297 + free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT); 1298 + } 1299 + return free; 1310 1300 } 1311 1301 1312 1302 /* ··· 1338 1286 #define BJ_None 0 /* Not journaled */ 1339 1287 #define BJ_Metadata 1 /* Normal journaled metadata */ 1340 1288 #define BJ_Forget 2 /* Buffer superseded by this transaction */ 1341 - #define BJ_IO 3 /* Buffer is for temporary IO use */ 1342 - #define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ 1343 - #define BJ_LogCtl 5 /* Buffer contains log descriptors */ 1344 - #define BJ_Reserved 6 /* Buffer is reserved for access by journal */ 1345 - #define BJ_Types 7 1289 + #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ 1290 + #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ 1291 + #define BJ_Types 5 1346 1292 1347 1293 extern int jbd_blocks_per_page(struct inode *inode); 1348 1294 ··· 1367 1317 BUG_ON(err); 1368 1318 1369 1319 return *(u32 *)desc.ctx; 1320 + } 1321 + 1322 + /* Return most recent uncommitted transaction */ 1323 + static inline tid_t jbd2_get_latest_transaction(journal_t *journal) 1324 + { 1325 + tid_t tid; 1326 + 1327 + read_lock(&journal->j_state_lock); 1328 + tid = journal->j_commit_request; 1329 + if (journal->j_running_transaction) 1330 + tid = journal->j_running_transaction->t_tid; 1331 + read_unlock(&journal->j_state_lock); 1332 + return tid; 1370 1333 } 1371 1334 1372 1335 #ifdef __KERNEL__
+1 -25
include/linux/jbd_common.h
···
 #ifndef _LINUX_JBD_STATE_H
 #define _LINUX_JBD_STATE_H

-enum jbd_state_bits {
-	BH_JBD			/* Has an attached ext3 journal_head */
-		= BH_PrivateStart,
-	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
-	BH_Freed,		/* Has been freed (truncated) */
-	BH_Revoked,		/* Has been revoked from the log */
-	BH_RevokeValid,		/* Revoked flag is valid */
-	BH_JBDDirty,		/* Is dirty but journaled */
-	BH_State,		/* Pins most journal_head state */
-	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
-	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
-	BH_Verified,		/* Metadata block has been verified ok */
-	BH_JBDPrivateStart,	/* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-BUFFER_FNS(Verified, verified)
+#include <linux/bit_spinlock.h>

 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {
+2 -1
include/linux/mm.h
···
 struct page *get_dump_page(unsigned long addr);

 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
-extern void do_invalidatepage(struct page *page, unsigned long offset);
+extern void do_invalidatepage(struct page *page, unsigned int offset,
+			      unsigned int length);

 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
+7 -5
include/trace/events/ext3.h
··· 290 290 ); 291 291 292 292 TRACE_EVENT(ext3_invalidatepage, 293 - TP_PROTO(struct page *page, unsigned long offset), 293 + TP_PROTO(struct page *page, unsigned int offset, unsigned int length), 294 294 295 - TP_ARGS(page, offset), 295 + TP_ARGS(page, offset, length), 296 296 297 297 TP_STRUCT__entry( 298 298 __field( pgoff_t, index ) 299 - __field( unsigned long, offset ) 299 + __field( unsigned int, offset ) 300 + __field( unsigned int, length ) 300 301 __field( ino_t, ino ) 301 302 __field( dev_t, dev ) 302 303 ··· 306 305 TP_fast_assign( 307 306 __entry->index = page->index; 308 307 __entry->offset = offset; 308 + __entry->length = length; 309 309 __entry->ino = page->mapping->host->i_ino; 310 310 __entry->dev = page->mapping->host->i_sb->s_dev; 311 311 ), 312 312 313 - TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", 313 + TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", 314 314 MAJOR(__entry->dev), MINOR(__entry->dev), 315 315 (unsigned long) __entry->ino, 316 - __entry->index, __entry->offset) 316 + __entry->index, __entry->offset, __entry->length) 317 317 ); 318 318 319 319 TRACE_EVENT(ext3_discard_blocks,
+206 -98
include/trace/events/ext4.h
··· 19 19 20 20 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 21 21 22 + #define show_mballoc_flags(flags) __print_flags(flags, "|", \ 23 + { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ 24 + { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ 25 + { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ 26 + { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ 27 + { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ 28 + { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ 29 + { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ 30 + { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ 31 + { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ 32 + { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ 33 + { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ 34 + { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ 35 + { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ 36 + { EXT4_MB_USE_RESERVED, "USE_RESV" }) 37 + 38 + #define show_map_flags(flags) __print_flags(flags, "|", \ 39 + { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ 40 + { EXT4_GET_BLOCKS_UNINIT_EXT, "UNINIT" }, \ 41 + { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ 42 + { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ 43 + { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ 44 + { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ 45 + { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ 46 + { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ 47 + { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }, \ 48 + { EXT4_GET_BLOCKS_NO_PUT_HOLE, "NO_PUT_HOLE" }) 49 + 50 + #define show_mflags(flags) __print_flags(flags, "", \ 51 + { EXT4_MAP_NEW, "N" }, \ 52 + { EXT4_MAP_MAPPED, "M" }, \ 53 + { EXT4_MAP_UNWRITTEN, "U" }, \ 54 + { EXT4_MAP_BOUNDARY, "B" }, \ 55 + { EXT4_MAP_UNINIT, "u" }, \ 56 + { EXT4_MAP_FROM_CLUSTER, "C" }) 57 + 58 + #define show_free_flags(flags) __print_flags(flags, "|", \ 59 + { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ 60 + { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ 61 + { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ 62 + { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ 63 + { 
EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ 64 + { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) 65 + 66 + #define show_extent_status(status) __print_flags(status, "", \ 67 + { (1 << 3), "W" }, \ 68 + { (1 << 2), "U" }, \ 69 + { (1 << 1), "D" }, \ 70 + { (1 << 0), "H" }) 71 + 72 + 22 73 TRACE_EVENT(ext4_free_inode, 23 74 TP_PROTO(struct inode *inode), 24 75 ··· 332 281 TP_ARGS(inode, pos, len, copied) 333 282 ); 334 283 335 - TRACE_EVENT(ext4_da_writepages, 284 + TRACE_EVENT(ext4_writepages, 336 285 TP_PROTO(struct inode *inode, struct writeback_control *wbc), 337 286 338 287 TP_ARGS(inode, wbc), ··· 375 324 ); 376 325 377 326 TRACE_EVENT(ext4_da_write_pages, 378 - TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), 327 + TP_PROTO(struct inode *inode, pgoff_t first_page, 328 + struct writeback_control *wbc), 379 329 380 - TP_ARGS(inode, mpd), 330 + TP_ARGS(inode, first_page, wbc), 381 331 382 332 TP_STRUCT__entry( 383 333 __field( dev_t, dev ) 384 334 __field( ino_t, ino ) 385 - __field( __u64, b_blocknr ) 386 - __field( __u32, b_size ) 387 - __field( __u32, b_state ) 388 - __field( unsigned long, first_page ) 389 - __field( int, io_done ) 390 - __field( int, pages_written ) 391 - __field( int, sync_mode ) 335 + __field( pgoff_t, first_page ) 336 + __field( long, nr_to_write ) 337 + __field( int, sync_mode ) 392 338 ), 393 339 394 340 TP_fast_assign( 395 341 __entry->dev = inode->i_sb->s_dev; 396 342 __entry->ino = inode->i_ino; 397 - __entry->b_blocknr = mpd->b_blocknr; 398 - __entry->b_size = mpd->b_size; 399 - __entry->b_state = mpd->b_state; 400 - __entry->first_page = mpd->first_page; 401 - __entry->io_done = mpd->io_done; 402 - __entry->pages_written = mpd->pages_written; 403 - __entry->sync_mode = mpd->wbc->sync_mode; 343 + __entry->first_page = first_page; 344 + __entry->nr_to_write = wbc->nr_to_write; 345 + __entry->sync_mode = wbc->sync_mode; 404 346 ), 405 347 406 - TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 
0x%04x " 407 - "first_page %lu io_done %d pages_written %d sync_mode %d", 348 + TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " 349 + "sync_mode %d", 408 350 MAJOR(__entry->dev), MINOR(__entry->dev), 409 - (unsigned long) __entry->ino, 410 - __entry->b_blocknr, __entry->b_size, 411 - __entry->b_state, __entry->first_page, 412 - __entry->io_done, __entry->pages_written, 413 - __entry->sync_mode 414 - ) 351 + (unsigned long) __entry->ino, __entry->first_page, 352 + __entry->nr_to_write, __entry->sync_mode) 415 353 ); 416 354 417 - TRACE_EVENT(ext4_da_writepages_result, 355 + TRACE_EVENT(ext4_da_write_pages_extent, 356 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), 357 + 358 + TP_ARGS(inode, map), 359 + 360 + TP_STRUCT__entry( 361 + __field( dev_t, dev ) 362 + __field( ino_t, ino ) 363 + __field( __u64, lblk ) 364 + __field( __u32, len ) 365 + __field( __u32, flags ) 366 + ), 367 + 368 + TP_fast_assign( 369 + __entry->dev = inode->i_sb->s_dev; 370 + __entry->ino = inode->i_ino; 371 + __entry->lblk = map->m_lblk; 372 + __entry->len = map->m_len; 373 + __entry->flags = map->m_flags; 374 + ), 375 + 376 + TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", 377 + MAJOR(__entry->dev), MINOR(__entry->dev), 378 + (unsigned long) __entry->ino, __entry->lblk, __entry->len, 379 + show_mflags(__entry->flags)) 380 + ); 381 + 382 + TRACE_EVENT(ext4_writepages_result, 418 383 TP_PROTO(struct inode *inode, struct writeback_control *wbc, 419 384 int ret, int pages_written), 420 385 ··· 511 444 ); 512 445 513 446 DECLARE_EVENT_CLASS(ext4_invalidatepage_op, 514 - TP_PROTO(struct page *page, unsigned long offset), 447 + TP_PROTO(struct page *page, unsigned int offset, unsigned int length), 515 448 516 - TP_ARGS(page, offset), 449 + TP_ARGS(page, offset, length), 517 450 518 451 TP_STRUCT__entry( 519 452 __field( dev_t, dev ) 520 453 __field( ino_t, ino ) 521 454 __field( pgoff_t, index ) 522 - __field( unsigned long, offset ) 523 - 455 + __field( 
unsigned int, offset ) 456 + __field( unsigned int, length ) 524 457 ), 525 458 526 459 TP_fast_assign( ··· 528 461 __entry->ino = page->mapping->host->i_ino; 529 462 __entry->index = page->index; 530 463 __entry->offset = offset; 464 + __entry->length = length; 531 465 ), 532 466 533 - TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", 467 + TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", 534 468 MAJOR(__entry->dev), MINOR(__entry->dev), 535 469 (unsigned long) __entry->ino, 536 - (unsigned long) __entry->index, __entry->offset) 470 + (unsigned long) __entry->index, 471 + __entry->offset, __entry->length) 537 472 ); 538 473 539 474 DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, 540 - TP_PROTO(struct page *page, unsigned long offset), 475 + TP_PROTO(struct page *page, unsigned int offset, unsigned int length), 541 476 542 - TP_ARGS(page, offset) 477 + TP_ARGS(page, offset, length) 543 478 ); 544 479 545 480 DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, 546 - TP_PROTO(struct page *page, unsigned long offset), 481 + TP_PROTO(struct page *page, unsigned int offset, unsigned int length), 547 482 548 - TP_ARGS(page, offset) 483 + TP_ARGS(page, offset, length) 549 484 ); 550 485 551 486 TRACE_EVENT(ext4_discard_blocks, ··· 742 673 __entry->flags = ar->flags; 743 674 ), 744 675 745 - TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " 676 + TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " 746 677 "lleft %u lright %u pleft %llu pright %llu ", 747 678 MAJOR(__entry->dev), MINOR(__entry->dev), 748 - (unsigned long) __entry->ino, __entry->flags, 679 + (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), 749 680 __entry->len, __entry->logical, __entry->goal, 750 681 __entry->lleft, __entry->lright, __entry->pleft, 751 682 __entry->pright) ··· 784 715 __entry->flags = ar->flags; 785 716 ), 786 717 787 - TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " 718 + 
TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " 788 719 "goal %llu lleft %u lright %u pleft %llu pright %llu", 789 720 MAJOR(__entry->dev), MINOR(__entry->dev), 790 - (unsigned long) __entry->ino, __entry->flags, 721 + (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), 791 722 __entry->len, __entry->block, __entry->logical, 792 723 __entry->goal, __entry->lleft, __entry->lright, 793 724 __entry->pleft, __entry->pright) ··· 817 748 __entry->mode = inode->i_mode; 818 749 ), 819 750 820 - TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", 751 + TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", 821 752 MAJOR(__entry->dev), MINOR(__entry->dev), 822 753 (unsigned long) __entry->ino, 823 754 __entry->mode, __entry->block, __entry->count, 824 - __entry->flags) 755 + show_free_flags(__entry->flags)) 825 756 ); 826 757 827 758 TRACE_EVENT(ext4_sync_file_enter, ··· 972 903 ), 973 904 974 905 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " 975 - "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " 906 + "result %u/%d/%u@%u blks %u grps %u cr %u flags %s " 976 907 "tail %u broken %u", 977 908 MAJOR(__entry->dev), MINOR(__entry->dev), 978 909 (unsigned long) __entry->ino, ··· 983 914 __entry->result_group, __entry->result_start, 984 915 __entry->result_len, __entry->result_logical, 985 916 __entry->found, __entry->groups, __entry->cr, 986 - __entry->flags, __entry->tail, 917 + show_mballoc_flags(__entry->flags), __entry->tail, 987 918 __entry->buddy ? 
1 << __entry->buddy : 0) 988 919 ); 989 920 ··· 1597 1528 __entry->flags = flags; 1598 1529 ), 1599 1530 1600 - TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", 1531 + TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", 1601 1532 MAJOR(__entry->dev), MINOR(__entry->dev), 1602 1533 (unsigned long) __entry->ino, 1603 - __entry->lblk, __entry->len, __entry->flags) 1534 + __entry->lblk, __entry->len, show_map_flags(__entry->flags)) 1604 1535 ); 1605 1536 1606 1537 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, ··· 1618 1549 ); 1619 1550 1620 1551 DECLARE_EVENT_CLASS(ext4__map_blocks_exit, 1621 - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1552 + TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, 1553 + int ret), 1622 1554 1623 - TP_ARGS(inode, map, ret), 1555 + TP_ARGS(inode, flags, map, ret), 1624 1556 1625 1557 TP_STRUCT__entry( 1626 1558 __field( dev_t, dev ) 1627 1559 __field( ino_t, ino ) 1560 + __field( unsigned int, flags ) 1628 1561 __field( ext4_fsblk_t, pblk ) 1629 1562 __field( ext4_lblk_t, lblk ) 1630 1563 __field( unsigned int, len ) 1631 - __field( unsigned int, flags ) 1564 + __field( unsigned int, mflags ) 1632 1565 __field( int, ret ) 1633 1566 ), 1634 1567 1635 1568 TP_fast_assign( 1636 1569 __entry->dev = inode->i_sb->s_dev; 1637 1570 __entry->ino = inode->i_ino; 1571 + __entry->flags = flags; 1638 1572 __entry->pblk = map->m_pblk; 1639 1573 __entry->lblk = map->m_lblk; 1640 1574 __entry->len = map->m_len; 1641 - __entry->flags = map->m_flags; 1575 + __entry->mflags = map->m_flags; 1642 1576 __entry->ret = ret; 1643 1577 ), 1644 1578 1645 - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", 1579 + TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " 1580 + "mflags %s ret %d", 1646 1581 MAJOR(__entry->dev), MINOR(__entry->dev), 1647 1582 (unsigned long) __entry->ino, 1648 - __entry->lblk, __entry->pblk, 1649 - __entry->len, __entry->flags, 
__entry->ret) 1583 + show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, 1584 + __entry->len, show_mflags(__entry->mflags), __entry->ret) 1650 1585 ); 1651 1586 1652 1587 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, 1653 - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1588 + TP_PROTO(struct inode *inode, unsigned flags, 1589 + struct ext4_map_blocks *map, int ret), 1654 1590 1655 - TP_ARGS(inode, map, ret) 1591 + TP_ARGS(inode, flags, map, ret) 1656 1592 ); 1657 1593 1658 1594 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, 1659 - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1595 + TP_PROTO(struct inode *inode, unsigned flags, 1596 + struct ext4_map_blocks *map, int ret), 1660 1597 1661 - TP_ARGS(inode, map, ret) 1598 + TP_ARGS(inode, flags, map, ret) 1662 1599 ); 1663 1600 1664 1601 TRACE_EVENT(ext4_ext_load_extent, ··· 1713 1638 ); 1714 1639 1715 1640 TRACE_EVENT(ext4_journal_start, 1716 - TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP), 1641 + TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, 1642 + unsigned long IP), 1717 1643 1718 - TP_ARGS(sb, nblocks, IP), 1644 + TP_ARGS(sb, blocks, rsv_blocks, IP), 1719 1645 1720 1646 TP_STRUCT__entry( 1721 1647 __field( dev_t, dev ) 1722 1648 __field(unsigned long, ip ) 1723 - __field( int, nblocks ) 1649 + __field( int, blocks ) 1650 + __field( int, rsv_blocks ) 1724 1651 ), 1725 1652 1726 1653 TP_fast_assign( 1727 - __entry->dev = sb->s_dev; 1728 - __entry->ip = IP; 1729 - __entry->nblocks = nblocks; 1654 + __entry->dev = sb->s_dev; 1655 + __entry->ip = IP; 1656 + __entry->blocks = blocks; 1657 + __entry->rsv_blocks = rsv_blocks; 1730 1658 ), 1731 1659 1732 - TP_printk("dev %d,%d nblocks %d caller %pF", 1660 + TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF", 1733 1661 MAJOR(__entry->dev), MINOR(__entry->dev), 1734 - __entry->nblocks, (void *)__entry->ip) 1662 + __entry->blocks, 
+		  __entry->rsv_blocks, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_journal_start_reserved,
+	TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),
+
+	TP_ARGS(sb, blocks, IP),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev	)
+		__field(unsigned long,		ip	)
+		__field(	int,		blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= sb->s_dev;
+		__entry->ip	= IP;
+		__entry->blocks	= blocks;
+	),
+
+	TP_printk("dev %d,%d blocks, %d caller %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blocks, (void *)__entry->ip)
 );
 
 DECLARE_EVENT_CLASS(ext4__trim,
···
 		__entry->newblk	= newblock;
 	),
 
-	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x "
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
 		  "allocated %d newblock %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags,
+		  __entry->len, show_map_flags(__entry->flags),
 		  (unsigned int) __entry->allocated,
 		  (unsigned long long) __entry->newblk)
 );
···
 		__entry->ret = ret;
 	),
 
-	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
+	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags, __entry->ret)
+		  __entry->len, show_mflags(__entry->flags), __entry->ret)
 );
 
 TRACE_EVENT(ext4_ext_put_in_cache,
···
 TRACE_EVENT(ext4_remove_blocks,
 	TP_PROTO(struct inode *inode, struct ext4_extent *ex,
 		 ext4_lblk_t from, ext4_fsblk_t to,
-		 ext4_fsblk_t partial_cluster),
+		 long long partial_cluster),
 
 	TP_ARGS(inode, ex, from, to, partial_cluster),
 
···
 		__field(	ino_t,		ino	)
 		__field(	ext4_lblk_t,	from	)
 		__field(	ext4_lblk_t,	to	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long,	partial	)
 		__field(	ext4_fsblk_t,	ee_pblk	)
 		__field(	ext4_lblk_t,	ee_lblk	)
 		__field(	unsigned short,	ee_len	)
···
 	),
 
 	TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
-		  "from %u to %u partial_cluster %u",
+		  "from %u to %u partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->ee_lblk,
···
 		  (unsigned short) __entry->ee_len,
 		  (unsigned) __entry->from,
 		  (unsigned) __entry->to,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_leaf,
 	TP_PROTO(struct inode *inode, ext4_lblk_t start,
-		 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+		 struct ext4_extent *ex,
+		 long long partial_cluster),
 
 	TP_ARGS(inode, start, ex, partial_cluster),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev	)
 		__field(	ino_t,		ino	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long,	partial	)
 		__field(	ext4_lblk_t,	start	)
 		__field(	ext4_lblk_t,	ee_lblk	)
 		__field(	ext4_fsblk_t,	ee_pblk	)
···
 	),
 
 	TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
-		  "partial_cluster %u",
+		  "partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
 		  (unsigned) __entry->ee_lblk,
 		  (unsigned long long) __entry->ee_pblk,
 		  (unsigned short) __entry->ee_len,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_idx,
···
 );
 
 TRACE_EVENT(ext4_ext_remove_space,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start,
+		 ext4_lblk_t end, int depth),
 
-	TP_ARGS(inode, start, depth),
+	TP_ARGS(inode, start, end, depth),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev	)
 		__field(	ino_t,		ino	)
 		__field(	ext4_lblk_t,	start	)
+		__field(	ext4_lblk_t,	end	)
 		__field(	int,		depth	)
 	),
 
···
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
 		__entry->start	= start;
+		__entry->end	= end;
 		__entry->depth	= depth;
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d",
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth)
 );
 
 TRACE_EVENT(ext4_ext_remove_space_done,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
-		 ext4_lblk_t partial, __le16 eh_entries),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
+		 int depth, long long partial, __le16 eh_entries),
 
-	TP_ARGS(inode, start, depth, partial, eh_entries),
+	TP_ARGS(inode, start, end, depth, partial, eh_entries),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev	)
 		__field(	ino_t,		ino	)
 		__field(	ext4_lblk_t,	start	)
+		__field(	ext4_lblk_t,	end	)
 		__field(	int,		depth	)
-		__field(	ext4_lblk_t,	partial	)
+		__field(	long long,	partial	)
 		__field(	unsigned short,	eh_entries	)
 	),
 
···
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
 		__entry->start		= start;
+		__entry->end		= end;
 		__entry->depth		= depth;
 		__entry->partial	= partial;
 		__entry->eh_entries	= le16_to_cpu(eh_entries);
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld "
 		  "remaining_entries %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth,
-		  (unsigned) __entry->partial,
+		  (long long) __entry->partial,
 		  (unsigned short) __entry->eh_entries)
 );
 
···
 		__field(	ext4_lblk_t,	lblk	)
 		__field(	ext4_lblk_t,	len	)
 		__field(	ext4_fsblk_t,	pblk	)
-		__field(	unsigned long long,	status	)
+		__field(	char,		status	)
 	),
 
 	TP_fast_assign(
···
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_remove_extent,
···
 		__field(	ext4_lblk_t,	lblk	)
 		__field(	ext4_lblk_t,	len	)
 		__field(	ext4_fsblk_t,	pblk	)
-		__field(	unsigned long long,	status	)
+		__field(	char,		status	)
 	),
 
 	TP_fast_assign(
···
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_lookup_extent_enter,
···
 		__field(	ext4_lblk_t,	lblk	)
 		__field(	ext4_lblk_t,	len	)
 		__field(	ext4_fsblk_t,	pblk	)
-		__field(	unsigned long long,	status	)
+		__field(	char,		status	)
 		__field(	int,		found	)
 	),
 
···
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 		__entry->found	= found;
 	),
 
-	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx",
+	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->found,
 		  __entry->lblk, __entry->len,
 		  __entry->found ? __entry->pblk : 0,
-		  __entry->found ? __entry->status : 0)
+		  show_extent_status(__entry->found ? __entry->status : 0))
 );
 
 TRACE_EVENT(ext4_es_shrink_enter,
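The ext4_es_* hunks above replace a raw `%llx` dump of ext4_es_status() with a value shifted right by 60 and passed to show_extent_status(): the status flags live in the topmost bits of a 64-bit word, so the shift leaves a small index suitable for string translation. A userspace sketch of that packing, with illustrative bit names and a hypothetical es_status_char() helper standing in for the kernel's show_extent_status() (the exact ext4 bit layout may differ; this only demonstrates the shift-to-index idea):

```c
#include <stdint.h>

/* Illustrative, not the kernel's actual macros: status flags occupy the
 * top four bits of a 64-bit word, the physical block the low bits. */
#define ES_SHIFT	60
#define ES_WRITTEN	(1ULL << 63)
#define ES_UNWRITTEN	(1ULL << 62)
#define ES_DELAYED	(1ULL << 61)
#define ES_HOLE		(1ULL << 60)
#define ES_MASK		(ES_WRITTEN | ES_UNWRITTEN | ES_DELAYED | ES_HOLE)

/* Pack a physical block and a status flag into one 64-bit field. */
static uint64_t es_pack(uint64_t pblk, uint64_t status_bits)
{
	return (pblk & ~ES_MASK) | status_bits;
}

/* Shifting right by ES_SHIFT recovers a small status index that a
 * helper can map to a human-readable tag, as the tracepoints now do. */
static char es_status_char(uint64_t es)
{
	switch (es >> ES_SHIFT) {
	case 8: return 'W';	/* written */
	case 4: return 'U';	/* unwritten */
	case 2: return 'D';	/* delayed */
	case 1: return 'H';	/* hole */
	default: return ' ';
	}
}
```

This is why the tracepoint field could shrink from `unsigned long long` to `char`: after the shift only a four-bit index remains.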
+1 -1
mm/readahead.c
···
 	if (!trylock_page(page))
 		BUG();
 	page->mapping = mapping;
-	do_invalidatepage(page, 0);
+	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	page->mapping = NULL;
 	unlock_page(page);
 }
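With the old single-argument do_invalidatepage(page, offset), "invalidate from offset to the end of the page" was implicit; the range-based API makes the length explicit, which is why full-page invalidation in the readahead hunk above now passes PAGE_CACHE_SIZE. A toy userspace model of the new contract (invalidate_range() is a hypothetical stand-in, assuming 4096-byte pages):

```c
#include <assert.h>
#include <string.h>

#define PAGE_CACHE_SIZE 4096

/* Model of the new ->invalidatepage contract: the callback is told
 * exactly which byte range of the page became invalid, instead of
 * assuming "from offset to end of page".  Here we just zero it. */
static void invalidate_range(unsigned char *page, unsigned int offset,
			     unsigned int length)
{
	assert(offset + length <= PAGE_CACHE_SIZE);
	memset(page + offset, 0, length);	/* drop the stale bytes */
}
```

Full-page invalidation becomes `invalidate_range(page, 0, PAGE_CACHE_SIZE)`, and a sub-page hole punch can now pass a range that neither starts at the offset's page boundary nor runs to the end of the page.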
+81 -36
mm/truncate.c
···
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * do_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
···
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void do_invalidatepage(struct page *page, unsigned long offset)
+void do_invalidatepage(struct page *page, unsigned int offset,
+		       unsigned int length)
 {
-	void (*invalidatepage)(struct page *, unsigned long);
+	void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
 	invalidatepage = page->mapping->a_ops->invalidatepage;
 #ifdef CONFIG_BLOCK
 	if (!invalidatepage)
 		invalidatepage = block_invalidatepage;
 #endif
 	if (invalidatepage)
-		(*invalidatepage)(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_invalidate_page(page->mapping, page);
-	if (page_has_private(page))
-		do_invalidatepage(page, partial);
+		(*invalidatepage)(page, offset, length);
 }
 
···
 		return -EIO;
 
 	if (page_has_private(page))
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
···
  * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
- * @lend: offset to which to truncate
+ * @lend: offset to which to truncate (inclusive)
  *
  * Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
···
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
  */
 void truncate_inode_pages_range(struct address_space *mapping,
 				loff_t lstart, loff_t lend)
 {
-	const pgoff_t	start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	const unsigned	partial = lstart & (PAGE_CACHE_SIZE - 1);
-	struct pagevec	pvec;
-	pgoff_t		index;
-	pgoff_t		end;
-	int		i;
+	pgoff_t		start;		/* inclusive */
+	pgoff_t		end;		/* exclusive */
+	unsigned int	partial_start;	/* inclusive */
+	unsigned int	partial_end;	/* exclusive */
+	struct pagevec	pvec;
+	pgoff_t		index;
+	int		i;
 
 	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
-	end = (lend >> PAGE_CACHE_SHIFT);
+	/* Offsets within partial pages */
+	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * 'start' and 'end' always covers the range of pages to be fully
+	 * truncated. Partial pages are covered with 'partial_start' at the
+	 * start of the range and 'partial_end' at the end of the range.
+	 * Note that 'end' is exclusive while 'lend' is inclusive.
+	 */
+	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (lend == -1)
+		/*
+		 * lend == -1 indicates end-of-file so we have to set 'end'
+		 * to the highest possible pgoff_t and since the type is
+		 * unsigned we're using -1.
+		 */
+		end = -1;
+	else
+		end = (lend + 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+	while (index < end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (!trylock_page(page))
···
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				/* Truncation within a single page */
+				top = partial_end;
+				partial_end = 0;
+			}
 			wait_on_page_writeback(page);
-			truncate_partial_page(page, partial);
+			zero_user_segment(page, partial_start, top);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, partial_start,
+						  top - partial_start);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = find_lock_page(mapping, end);
+		if (page) {
+			wait_on_page_writeback(page);
+			zero_user_segment(page, 0, partial_end);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, 0,
+						  partial_end);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	/*
+	 * If the truncation happened within a single page no pages
+	 * will be released, just zeroed, so we can bail out now.
+	 */
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 			if (index == start)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && pvec.pages[0]->index > end) {
+		if (index == start && pvec.pages[0]->index >= end) {
 			pagevec_release(&pvec);
 			break;
 		}
···
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			lock_page(page);
···
  * This rounding is currently just for example: unmap_mapping_range
  * expands its hole outwards, whereas we want it to contract the hole
  * inwards.  However, existing callers of truncate_pagecache_range are
- * doing their own page rounding first; and truncate_inode_pages_range
- * currently BUGs if lend is not pagealigned-1 (it handles partial
- * page at start of hole, but not partial page at end of hole).  Note
- * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+ * doing their own page rounding first.  Note that unmap_mapping_range
+ * allows holelen 0 for all, and we allow lend -1 for end of file.
  */
 
 /*
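The heart of the truncate_inode_pages_range() rewrite above is splitting the old single `partial` offset into `partial_start`/`partial_end` plus an exclusive page range [start, end). A minimal userspace sketch of that arithmetic (compute_range() and the struct are illustrative, not kernel code; PAGE_CACHE_SIZE assumed to be 4096):

```c
#include <stdio.h>

#define PAGE_CACHE_SIZE		4096UL
#define PAGE_CACHE_SHIFT	12

/* Model of the new range computation in truncate_inode_pages_range():
 * lstart and lend are byte offsets, lend is inclusive, -1 means EOF. */
struct trunc_range {
	unsigned long start;		/* first fully-truncated page (inclusive) */
	unsigned long end;		/* one past the last fully-truncated page */
	unsigned int partial_start;	/* first byte to zero in page start - 1 */
	unsigned int partial_end;	/* bytes to zero at the head of page end */
};

static struct trunc_range compute_range(long long lstart, long long lend)
{
	struct trunc_range r;

	/* Offsets within the partial pages at each end of the range. */
	r.partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	r.partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);

	/* Round lstart up and lend + 1 down to page boundaries. */
	r.start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (lend == -1)
		r.end = (unsigned long)-1;	/* EOF: highest possible index */
	else
		r.end = (lend + 1) >> PAGE_CACHE_SHIFT;
	return r;
}
```

Punching a hole entirely inside one page (e.g. bytes 100..199) yields start > end, which is why the kernel code folds `partial_end` into the first page's zeroing and bails out before the page-freeing loops; the old code could not express this case at all and BUG()ed unless lend was page-aligned minus one.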