Merge tag 'for-chris' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux into for-linus-4.6

+16 -3

Documentation/filesystems/btrfs.txt

··· 168 168 notreelog 169 169 Enable/disable the tree logging used for fsync and O_SYNC writes. 170 170 171 - recovery 172 - Enable autorecovery attempts if a bad tree root is found at mount time. 173 - Currently this scans a list of several previous tree roots and tries to 171 + nologreplay 172 + Disable the log tree replay at mount time to prevent filesystem 173 + from getting modified. 174 + Must be used with 'ro' mount option. 175 + A filesystem mounted with this option cannot transition to a 176 + read-write mount via remount,rw - the filesystem must be unmounted 177 + and mounted back again if read-write access is desired. 178 + 179 + usebackuproot 180 + Enable attempts to use backup tree roots if a bad tree root is found at 181 + mount time. 182 + Currently this scans a list of 4 previous tree roots and tries to 174 183 use the first readable. 184 + And since the mount option doesn't affect any behavior after mount, 185 + it won't be shown in mount info. 186 + Prior to 4.6, this was done by 'recovery' option that has been 187 + deprecated, but will work. 175 188 176 189 rescan_uuid_tree 177 190 Force check and rebuild procedure of the UUID tree. This should not

+4 -8

fs/btrfs/backref.c

··· 148 148 149 149 void btrfs_prelim_ref_exit(void) 150 150 { 151 - if (btrfs_prelim_ref_cache) 152 - kmem_cache_destroy(btrfs_prelim_ref_cache); 151 + kmem_cache_destroy(btrfs_prelim_ref_cache); 153 152 } 154 153 155 154 /* ··· 565 566 struct __prelim_ref *pos2 = pos1, *tmp; 566 567 567 568 list_for_each_entry_safe_continue(pos2, tmp, head, list) { 568 - struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; 569 + struct __prelim_ref *ref1 = pos1, *ref2 = pos2; 569 570 struct extent_inode_elem *eie; 570 571 571 572 if (!ref_for_same_block(ref1, ref2)) 572 573 continue; 573 574 if (mode == 1) { 574 - if (!ref1->parent && ref2->parent) { 575 - xchg = ref1; 576 - ref1 = ref2; 577 - ref2 = xchg; 578 - } 575 + if (!ref1->parent && ref2->parent) 576 + swap(ref1, ref2); 579 577 } else { 580 578 if (ref1->parent != ref2->parent) 581 579 continue;

+18 -18

fs/btrfs/ctree.c

··· 311 311 312 312 struct tree_mod_elem { 313 313 struct rb_node node; 314 - u64 index; /* shifted logical */ 314 + u64 logical; 315 315 u64 seq; 316 316 enum mod_log_op op; 317 317 ··· 435 435 436 436 /* 437 437 * key order of the log: 438 - * index -> sequence 438 + * node/leaf start address -> sequence 439 439 * 440 - * the index is the shifted logical of the *new* root node for root replace 441 - * operations, or the shifted logical of the affected block for all other 442 - * operations. 440 + * The 'start address' is the logical address of the *new* root node 441 + * for root replace operations, or the logical address of the affected 442 + * block for all other operations. 443 443 * 444 444 * Note: must be called with write lock (tree_mod_log_write_lock). 445 445 */ ··· 460 460 while (*new) { 461 461 cur = container_of(*new, struct tree_mod_elem, node); 462 462 parent = *new; 463 - if (cur->index < tm->index) 463 + if (cur->logical < tm->logical) 464 464 new = &((*new)->rb_left); 465 - else if (cur->index > tm->index) 465 + else if (cur->logical > tm->logical) 466 466 new = &((*new)->rb_right); 467 467 else if (cur->seq < tm->seq) 468 468 new = &((*new)->rb_left); ··· 523 523 if (!tm) 524 524 return NULL; 525 525 526 - tm->index = eb->start >> PAGE_CACHE_SHIFT; 526 + tm->logical = eb->start; 527 527 if (op != MOD_LOG_KEY_ADD) { 528 528 btrfs_node_key(eb, &tm->key, slot); 529 529 tm->blockptr = btrfs_node_blockptr(eb, slot); ··· 588 588 goto free_tms; 589 589 } 590 590 591 - tm->index = eb->start >> PAGE_CACHE_SHIFT; 591 + tm->logical = eb->start; 592 592 tm->slot = src_slot; 593 593 tm->move.dst_slot = dst_slot; 594 594 tm->move.nr_items = nr_items; ··· 699 699 goto free_tms; 700 700 } 701 701 702 - tm->index = new_root->start >> PAGE_CACHE_SHIFT; 702 + tm->logical = new_root->start; 703 703 tm->old_root.logical = old_root->start; 704 704 tm->old_root.level = btrfs_header_level(old_root); 705 705 tm->generation = btrfs_header_generation(old_root); ··· 739 739 struct rb_node *node; 740 740 struct tree_mod_elem *cur = NULL; 741 741 struct tree_mod_elem *found = NULL; 742 - u64 index = start >> PAGE_CACHE_SHIFT; 743 742 744 743 tree_mod_log_read_lock(fs_info); 745 744 tm_root = &fs_info->tree_mod_log; 746 745 node = tm_root->rb_node; 747 746 while (node) { 748 747 cur = container_of(node, struct tree_mod_elem, node); 749 - if (cur->index < index) { 748 + if (cur->logical < start) { 750 749 node = node->rb_left; 751 - } else if (cur->index > index) { 750 + } else if (cur->logical > start) { 752 751 node = node->rb_right; 753 752 } else if (cur->seq < min_seq) { 754 753 node = node->rb_left; ··· 1229 1230 return NULL; 1230 1231 1231 1232 /* 1232 - * the very last operation that's logged for a root is the replacement 1233 - * operation (if it is replaced at all). this has the index of the *new* 1234 - * root, making it the very first operation that's logged for this root. 1233 + * the very last operation that's logged for a root is the 1234 + * replacement operation (if it is replaced at all). this has 1235 + * the logical address of the *new* root, making it the very 1236 + * first operation that's logged for this root. 1235 1237 */ 1236 1238 while (1) { 1237 1239 tm = tree_mod_log_search_oldest(fs_info, root_logical, ··· 1336 1336 if (!next) 1337 1337 break; 1338 1338 tm = container_of(next, struct tree_mod_elem, node); 1339 - if (tm->index != first_tm->index) 1339 + if (tm->logical != first_tm->logical) 1340 1340 break; 1341 1341 } 1342 1342 tree_mod_log_read_unlock(fs_info); ··· 5361 5361 goto out; 5362 5362 } 5363 5363 5364 - tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); 5364 + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); 5365 5365 if (!tmp_buf) { 5366 5366 ret = -ENOMEM; 5367 5367 goto out;

+56 -13

fs/btrfs/ctree.h

··· 100 100 /* tracks free space in block groups. */ 101 101 #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL 102 102 103 + /* device stats in the device tree */ 104 + #define BTRFS_DEV_STATS_OBJECTID 0ULL 105 + 103 106 /* for storing balance parameters in the root tree */ 104 107 #define BTRFS_BALANCE_OBJECTID -4ULL 105 108 ··· 1005 1002 pid_t lock_owner; 1006 1003 atomic_t nesting_level; 1007 1004 struct mutex lock_finishing_cancel_unmount; 1008 - struct mutex lock_management_lock; 1009 - struct mutex lock; 1005 + rwlock_t lock; 1006 + atomic_t read_locks; 1007 + atomic_t blocking_readers; 1008 + wait_queue_head_t read_lock_wq; 1010 1009 1011 1010 struct btrfs_scrub_progress scrub_progress; 1012 1011 }; ··· 1827 1822 spinlock_t reada_lock; 1828 1823 struct radix_tree_root reada_tree; 1829 1824 1825 + /* readahead works cnt */ 1826 + atomic_t reada_works_cnt; 1827 + 1830 1828 /* Extent buffer radix tree */ 1831 1829 spinlock_t buffer_lock; 1832 1830 struct radix_tree_root buffer_radix; ··· 2193 2185 */ 2194 2186 #define BTRFS_QGROUP_RELATION_KEY 246 2195 2187 2188 + /* 2189 + * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. 2190 + */ 2196 2191 #define BTRFS_BALANCE_ITEM_KEY 248 2197 2192 2198 2193 /* 2199 - * Persistantly stores the io stats in the device tree. 2200 - * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). 2194 + * The key type for tree items that are stored persistently, but do not need to 2195 + * exist for extended period of time. The items can exist in any tree. 2196 + * 2197 + * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] 2198 + * 2199 + * Existing items: 2200 + * 2201 + * - balance status item 2202 + * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) 2201 2203 */ 2202 - #define BTRFS_DEV_STATS_KEY 249 2204 + #define BTRFS_TEMPORARY_ITEM_KEY 248 2205 + 2206 + /* 2207 + * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY 2208 + */ 2209 + #define BTRFS_DEV_STATS_KEY 249 2210 + 2211 + /* 2212 + * The key type for tree items that are stored persistently and usually exist 2213 + * for a long period, eg. filesystem lifetime. The item kinds can be status 2214 + * information, stats or preference values. The item can exist in any tree. 2215 + * 2216 + * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] 2217 + * 2218 + * Existing items: 2219 + * 2220 + * - device statistics, store IO stats in the device tree, one key for all 2221 + * stats 2222 + * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) 2223 + */ 2224 + #define BTRFS_PERSISTENT_ITEM_KEY 249 2203 2225 2204 2226 /* 2205 2227 * Persistantly stores the device replace state in the device tree. ··· 2279 2241 #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 2280 2242 #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 2281 2243 #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 2282 - #define BTRFS_MOUNT_RECOVERY (1 << 18) 2244 + #define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) 2283 2245 #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) 2284 2246 #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) 2285 2247 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) ··· 2288 2250 #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) 2289 2251 #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 2290 2252 #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 2253 + #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) 2291 2254 2292 2255 #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2293 - #define BTRFS_DEFAULT_MAX_INLINE (8192) 2256 + #define BTRFS_DEFAULT_MAX_INLINE (2048) 2294 2257 2295 2258 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2296 2259 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) ··· 2391 2352 char *kaddr; 2392 2353 unsigned long offset; 2393 2354 }; 2355 + 2356 + #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ 2357 + ((bytes) >> (fs_info)->sb->s_blocksize_bits) 2394 2358 2395 2359 static inline void btrfs_init_map_token (struct btrfs_map_token *token) 2396 2360 { ··· 3490 3448 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3491 3449 unsigned num_items) 3492 3450 { 3493 - return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3494 - 2 * num_items; 3451 + return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; 3495 3452 } 3496 3453 3497 3454 /* ··· 4068 4027 struct btrfs_root *root, 4069 4028 struct inode *dir, u64 objectid, 4070 4029 const char *name, int name_len); 4071 - int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4030 + int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, 4072 4031 int front); 4073 4032 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4074 4033 struct btrfs_root *root, ··· 4130 4089 4131 4090 /* ioctl.c */ 4132 4091 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 4092 + int btrfs_ioctl_get_supported_features(void __user *arg); 4133 4093 void btrfs_update_iflags(struct inode *inode); 4134 4094 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 4135 4095 int btrfs_is_empty_uuid(u8 *uuid); ··· 4193 4151 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 4194 4152 4195 4153 /* super.c */ 4196 - int btrfs_parse_options(struct btrfs_root *root, char *options); 4154 + int btrfs_parse_options(struct btrfs_root *root, char *options, 4155 + unsigned long new_flags); 4197 4156 int btrfs_sync_fs(struct super_block *sb, int wait); 4198 4157 4199 4158 #ifdef CONFIG_PRINTK ··· 4568 4525 struct btrfs_key *start, struct btrfs_key *end); 4569 4526 int btrfs_reada_wait(void *handle); 4570 4527 void btrfs_reada_detach(void *handle); 4571 - int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 4572 - u64 start, int err); 4528 + int btree_readahead_hook(struct btrfs_fs_info *fs_info, 4529 + struct extent_buffer *eb, u64 start, int err); 4573 4530 4574 4531 static inline int is_fstree(u64 rootid) 4575 4532 {

+1 -2

fs/btrfs/delayed-inode.c

··· 43 43 44 44 void btrfs_delayed_inode_exit(void) 45 45 { 46 - if (delayed_node_cache) 47 - kmem_cache_destroy(delayed_node_cache); 46 + kmem_cache_destroy(delayed_node_cache); 48 47 } 49 48 50 49 static inline void btrfs_init_delayed_node(

+4 -8

fs/btrfs/delayed-ref.c

··· 929 929 930 930 void btrfs_delayed_ref_exit(void) 931 931 { 932 - if (btrfs_delayed_ref_head_cachep) 933 - kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 934 - if (btrfs_delayed_tree_ref_cachep) 935 - kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 936 - if (btrfs_delayed_data_ref_cachep) 937 - kmem_cache_destroy(btrfs_delayed_data_ref_cachep); 938 - if (btrfs_delayed_extent_op_cachep) 939 - kmem_cache_destroy(btrfs_delayed_extent_op_cachep); 932 + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 933 + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 934 + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); 935 + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); 940 936 } 941 937 942 938 int btrfs_delayed_ref_init(void)

+72 -62

fs/btrfs/dev-replace.c

··· 202 202 struct btrfs_dev_replace_item *ptr; 203 203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 204 204 205 - btrfs_dev_replace_lock(dev_replace); 205 + btrfs_dev_replace_lock(dev_replace, 0); 206 206 if (!dev_replace->is_valid || 207 207 !dev_replace->item_needs_writeback) { 208 - btrfs_dev_replace_unlock(dev_replace); 208 + btrfs_dev_replace_unlock(dev_replace, 0); 209 209 return 0; 210 210 } 211 - btrfs_dev_replace_unlock(dev_replace); 211 + btrfs_dev_replace_unlock(dev_replace, 0); 212 212 213 213 key.objectid = 0; 214 214 key.type = BTRFS_DEV_REPLACE_KEY; ··· 264 264 ptr = btrfs_item_ptr(eb, path->slots[0], 265 265 struct btrfs_dev_replace_item); 266 266 267 - btrfs_dev_replace_lock(dev_replace); 267 + btrfs_dev_replace_lock(dev_replace, 1); 268 268 if (dev_replace->srcdev) 269 269 btrfs_set_dev_replace_src_devid(eb, ptr, 270 270 dev_replace->srcdev->devid); ··· 287 287 btrfs_set_dev_replace_cursor_right(eb, ptr, 288 288 dev_replace->cursor_right); 289 289 dev_replace->item_needs_writeback = 0; 290 - btrfs_dev_replace_unlock(dev_replace); 290 + btrfs_dev_replace_unlock(dev_replace, 1); 291 291 292 292 btrfs_mark_buffer_dirty(eb); 293 293 ··· 356 356 return PTR_ERR(trans); 357 357 } 358 358 359 - btrfs_dev_replace_lock(dev_replace); 359 + btrfs_dev_replace_lock(dev_replace, 1); 360 360 switch (dev_replace->replace_state) { 361 361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 362 362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: ··· 395 395 dev_replace->is_valid = 1; 396 396 dev_replace->item_needs_writeback = 1; 397 397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 398 - btrfs_dev_replace_unlock(dev_replace); 398 + btrfs_dev_replace_unlock(dev_replace, 1); 399 399 400 400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 401 401 if (ret) ··· 407 407 trans = btrfs_start_transaction(root, 0); 408 408 if (IS_ERR(trans)) { 409 409 ret = PTR_ERR(trans); 410 - btrfs_dev_replace_lock(dev_replace); 410 + btrfs_dev_replace_lock(dev_replace, 1); 411 411 goto leave; 412 412 } 413 413 ··· 433 433 leave: 434 434 dev_replace->srcdev = NULL; 435 435 dev_replace->tgtdev = NULL; 436 - btrfs_dev_replace_unlock(dev_replace); 436 + btrfs_dev_replace_unlock(dev_replace, 1); 437 437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 438 438 return ret; 439 439 } ··· 471 471 /* don't allow cancel or unmount to disturb the finishing procedure */ 472 472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 473 473 474 - btrfs_dev_replace_lock(dev_replace); 474 + btrfs_dev_replace_lock(dev_replace, 0); 475 475 /* was the operation canceled, or is it finished? */ 476 476 if (dev_replace->replace_state != 477 477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 478 - btrfs_dev_replace_unlock(dev_replace); 478 + btrfs_dev_replace_unlock(dev_replace, 0); 479 479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 480 480 return 0; 481 481 } 482 482 483 483 tgt_device = dev_replace->tgtdev; 484 484 src_device = dev_replace->srcdev; 485 - btrfs_dev_replace_unlock(dev_replace); 485 + btrfs_dev_replace_unlock(dev_replace, 0); 486 486 487 487 /* 488 488 * flush all outstanding I/O and inode extent mappings before the ··· 507 507 /* keep away write_all_supers() during the finishing procedure */ 508 508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 509 509 mutex_lock(&root->fs_info->chunk_mutex); 510 - btrfs_dev_replace_lock(dev_replace); 510 + btrfs_dev_replace_lock(dev_replace, 1); 511 511 dev_replace->replace_state = 512 512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 513 513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; ··· 528 528 rcu_str_deref(src_device->name), 529 529 src_device->devid, 530 530 rcu_str_deref(tgt_device->name), scrub_ret); 531 - btrfs_dev_replace_unlock(dev_replace); 531 + btrfs_dev_replace_unlock(dev_replace, 1); 532 532 mutex_unlock(&root->fs_info->chunk_mutex); 533 533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 534 mutex_unlock(&uuid_mutex); ··· 565 565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 566 566 fs_info->fs_devices->rw_devices++; 567 567 568 - btrfs_dev_replace_unlock(dev_replace); 568 + btrfs_dev_replace_unlock(dev_replace, 1); 569 569 570 570 btrfs_rm_dev_replace_blocked(fs_info); 571 571 ··· 649 649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 650 650 struct btrfs_device *srcdev; 651 651 652 - btrfs_dev_replace_lock(dev_replace); 652 + btrfs_dev_replace_lock(dev_replace, 0); 653 653 /* even if !dev_replace_is_valid, the values are good enough for 654 654 * the replace_status ioctl */ 655 655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; ··· 675 675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); 676 676 break; 677 677 } 678 - btrfs_dev_replace_unlock(dev_replace); 678 + btrfs_dev_replace_unlock(dev_replace, 0); 679 679 } 680 680 681 681 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, ··· 698 698 return -EROFS; 699 699 700 700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 701 - btrfs_dev_replace_lock(dev_replace); 701 + btrfs_dev_replace_lock(dev_replace, 1); 702 702 switch (dev_replace->replace_state) { 703 703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 704 704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 705 705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 706 706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 707 - btrfs_dev_replace_unlock(dev_replace); 707 + btrfs_dev_replace_unlock(dev_replace, 1); 708 708 goto leave; 709 709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 710 710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ··· 717 717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 718 718 dev_replace->time_stopped = get_seconds(); 719 719 dev_replace->item_needs_writeback = 1; 720 - btrfs_dev_replace_unlock(dev_replace); 720 + btrfs_dev_replace_unlock(dev_replace, 1); 721 721 btrfs_scrub_cancel(fs_info); 722 722 723 723 trans = btrfs_start_transaction(root, 0); ··· 740 740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 741 741 742 742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 743 - btrfs_dev_replace_lock(dev_replace); 743 + btrfs_dev_replace_lock(dev_replace, 1); 744 744 switch (dev_replace->replace_state) { 745 745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 746 746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: ··· 756 756 break; 757 757 } 758 758 759 - btrfs_dev_replace_unlock(dev_replace); 759 + btrfs_dev_replace_unlock(dev_replace, 1); 760 760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 761 761 } 762 762 ··· 766 766 struct task_struct *task; 767 767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 768 768 769 - btrfs_dev_replace_lock(dev_replace); 769 + btrfs_dev_replace_lock(dev_replace, 1); 770 770 switch (dev_replace->replace_state) { 771 771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 772 772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 773 773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 774 - btrfs_dev_replace_unlock(dev_replace); 774 + btrfs_dev_replace_unlock(dev_replace, 1); 775 775 return 0; 776 776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 777 777 break; ··· 784 784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); 785 785 btrfs_info(fs_info, 786 786 "you may cancel the operation after 'mount -o degraded'"); 787 - btrfs_dev_replace_unlock(dev_replace); 787 + btrfs_dev_replace_unlock(dev_replace, 1); 788 788 return 0; 789 789 } 790 - btrfs_dev_replace_unlock(dev_replace); 790 + btrfs_dev_replace_unlock(dev_replace, 1); 791 791 792 792 WARN_ON(atomic_xchg( 793 793 &fs_info->mutually_exclusive_operation_running, 1)); ··· 802 802 struct btrfs_ioctl_dev_replace_args *status_args; 803 803 u64 progress; 804 804 805 - status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 805 + status_args = kzalloc(sizeof(*status_args), GFP_KERNEL); 806 806 if (status_args) { 807 807 btrfs_dev_replace_status(fs_info, status_args); 808 808 progress = status_args->status.progress_1000; ··· 865 865 return 1; 866 866 } 867 867 868 - void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 868 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw) 869 869 { 870 - /* the beginning is just an optimization for the typical case */ 871 - if (atomic_read(&dev_replace->nesting_level) == 0) { 872 - acquire_lock: 873 - /* this is not a nested case where the same thread 874 - * is trying to acqurire the same lock twice */ 875 - mutex_lock(&dev_replace->lock); 876 - mutex_lock(&dev_replace->lock_management_lock); 877 - dev_replace->lock_owner = current->pid; 878 - atomic_inc(&dev_replace->nesting_level); 879 - mutex_unlock(&dev_replace->lock_management_lock); 880 - return; 870 + if (rw == 1) { 871 + /* write */ 872 + again: 873 + wait_event(dev_replace->read_lock_wq, 874 + atomic_read(&dev_replace->blocking_readers) == 0); 875 + write_lock(&dev_replace->lock); 876 + if (atomic_read(&dev_replace->blocking_readers)) { 877 + write_unlock(&dev_replace->lock); 878 + goto again; 879 + } 880 + } else { 881 + read_lock(&dev_replace->lock); 882 + atomic_inc(&dev_replace->read_locks); 881 883 } 882 - 883 - mutex_lock(&dev_replace->lock_management_lock); 884 - if (atomic_read(&dev_replace->nesting_level) > 0 && 885 - dev_replace->lock_owner == current->pid) { 886 - WARN_ON(!mutex_is_locked(&dev_replace->lock)); 887 - atomic_inc(&dev_replace->nesting_level); 888 - mutex_unlock(&dev_replace->lock_management_lock); 889 - return; 890 - } 891 - 892 - mutex_unlock(&dev_replace->lock_management_lock); 893 - goto acquire_lock; 894 884 } 895 885 896 - void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 886 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw) 897 887 { 898 - WARN_ON(!mutex_is_locked(&dev_replace->lock)); 899 - mutex_lock(&dev_replace->lock_management_lock); 900 - WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 901 - WARN_ON(dev_replace->lock_owner != current->pid); 902 - atomic_dec(&dev_replace->nesting_level); 903 - if (atomic_read(&dev_replace->nesting_level) == 0) { 904 - dev_replace->lock_owner = 0; 905 - mutex_unlock(&dev_replace->lock_management_lock); 906 - mutex_unlock(&dev_replace->lock); 888 + if (rw == 1) { 889 + /* write */ 890 + ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); 891 + write_unlock(&dev_replace->lock); 907 892 } else { 908 - mutex_unlock(&dev_replace->lock_management_lock); 893 + ASSERT(atomic_read(&dev_replace->read_locks) > 0); 894 + atomic_dec(&dev_replace->read_locks); 895 + read_unlock(&dev_replace->lock); 909 896 } 897 + } 898 + 899 + /* inc blocking cnt and release read lock */ 900 + void btrfs_dev_replace_set_lock_blocking( 901 + struct btrfs_dev_replace *dev_replace) 902 + { 903 + /* only set blocking for read lock */ 904 + ASSERT(atomic_read(&dev_replace->read_locks) > 0); 905 + atomic_inc(&dev_replace->blocking_readers); 906 + read_unlock(&dev_replace->lock); 907 + } 908 + 909 + /* acquire read lock and dec blocking cnt */ 910 + void btrfs_dev_replace_clear_lock_blocking( 911 + struct btrfs_dev_replace *dev_replace) 912 + { 913 + /* only set blocking for read lock */ 914 + ASSERT(atomic_read(&dev_replace->read_locks) > 0); 915 + ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); 916 + read_lock(&dev_replace->lock); 917 + if (atomic_dec_and_test(&dev_replace->blocking_readers) && 918 + waitqueue_active(&dev_replace->read_lock_wq)) 919 + wake_up(&dev_replace->read_lock_wq); 910 920 } 911 921 912 922 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)

+5 -2

fs/btrfs/dev-replace.h

··· 34 34 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 35 35 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 36 36 int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 37 - void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 38 - void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 37 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw); 38 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw); 39 + void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); 40 + void btrfs_dev_replace_clear_lock_blocking( 41 + struct btrfs_dev_replace *dev_replace); 39 42 40 43 static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 41 44 {

+39 -29

fs/btrfs/disk-io.c

··· 110 110 111 111 void btrfs_end_io_wq_exit(void) 112 112 { 113 - if (btrfs_end_io_wq_cache) 114 - kmem_cache_destroy(btrfs_end_io_wq_cache); 113 + kmem_cache_destroy(btrfs_end_io_wq_cache); 115 114 } 116 115 117 116 /* ··· 611 612 int found_level; 612 613 struct extent_buffer *eb; 613 614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 615 + struct btrfs_fs_info *fs_info = root->fs_info; 614 616 int ret = 0; 615 617 int reads_done; 616 618 ··· 637 637 638 638 found_start = btrfs_header_bytenr(eb); 639 639 if (found_start != eb->start) { 640 - btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", 641 - found_start, eb->start); 640 + btrfs_err_rl(fs_info, "bad tree block start %llu %llu", 641 + found_start, eb->start); 642 642 ret = -EIO; 643 643 goto err; 644 644 } 645 - if (check_tree_block_fsid(root->fs_info, eb)) { 646 - btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", 647 - eb->start); 645 + if (check_tree_block_fsid(fs_info, eb)) { 646 + btrfs_err_rl(fs_info, "bad fsid on block %llu", 647 + eb->start); 648 648 ret = -EIO; 649 649 goto err; 650 650 } 651 651 found_level = btrfs_header_level(eb); 652 652 if (found_level >= BTRFS_MAX_LEVEL) { 653 - btrfs_err(root->fs_info, "bad tree block level %d", 654 - (int)btrfs_header_level(eb)); 653 + btrfs_err(fs_info, "bad tree block level %d", 654 + (int)btrfs_header_level(eb)); 655 655 ret = -EIO; 656 656 goto err; 657 657 } ··· 659 659 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), 660 660 eb, found_level); 661 661 662 - ret = csum_tree_block(root->fs_info, eb, 1); 662 + ret = csum_tree_block(fs_info, eb, 1); 663 663 if (ret) { 664 664 ret = -EIO; 665 665 goto err; ··· 680 680 err: 681 681 if (reads_done && 682 682 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 683 - btree_readahead_hook(root, eb, eb->start, ret); 683 + btree_readahead_hook(fs_info, eb, eb->start, ret); 684 684 685 685 if (ret) { 686 686 /* ··· 699 699 static int btree_io_failed_hook(struct page *page, int failed_mirror) 700 700 { 701 701 struct extent_buffer *eb; 702 - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 703 702 704 703 eb = (struct extent_buffer *)page->private; 705 704 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 706 705 eb->read_mirror = failed_mirror; 707 706 atomic_dec(&eb->io_pages); 708 707 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 709 - btree_readahead_hook(root, eb, eb->start, -EIO); 708 + btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO); 710 709 return -EIO; /* we fixed nothing */ 711 710 } 712 711 ··· 1295 1296 spin_lock_init(&root->root_item_lock); 1296 1297 } 1297 1298 1298 - static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1299 + static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, 1300 + gfp_t flags) 1299 1301 { 1300 - struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1302 + struct btrfs_root *root = kzalloc(sizeof(*root), flags); 1301 1303 if (root) 1302 1304 root->fs_info = fs_info; 1303 1305 return root; ··· 1310 1310 { 1311 1311 struct btrfs_root *root; 1312 1312 1313 - root = btrfs_alloc_root(NULL); 1313 + root = btrfs_alloc_root(NULL, GFP_KERNEL); 1314 1314 if (!root) 1315 1315 return ERR_PTR(-ENOMEM); 1316 1316 __setup_root(4096, 4096, 4096, root, NULL, 1); ··· 1332 1332 int ret = 0; 1333 1333 uuid_le uuid; 1334 1334 1335 - root = btrfs_alloc_root(fs_info); 1335 + root = btrfs_alloc_root(fs_info, GFP_KERNEL); 1336 1336 if (!root) 1337 1337 return ERR_PTR(-ENOMEM); 1338 1338 ··· 1408 1408 struct btrfs_root *tree_root = fs_info->tree_root; 1409 1409 struct extent_buffer *leaf; 1410 1410 1411 - root = btrfs_alloc_root(fs_info); 1411 + root = btrfs_alloc_root(fs_info, GFP_NOFS); 1412 1412 if (!root) 1413 1413 return ERR_PTR(-ENOMEM); 1414 1414 ··· 1506 1506 if (!path) 1507 1507 return ERR_PTR(-ENOMEM); 1508 1508 1509 - root = btrfs_alloc_root(fs_info); 1509 + root = btrfs_alloc_root(fs_info, GFP_NOFS); 1510 1510 if (!root) { 1511 1511 ret = -ENOMEM; 1512 1512 goto alloc_fail; ··· 2272 2272 fs_info->dev_replace.lock_owner = 0; 2273 2273 atomic_set(&fs_info->dev_replace.nesting_level, 0); 2274 2274 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2275 - mutex_init(&fs_info->dev_replace.lock_management_lock); 2276 - mutex_init(&fs_info->dev_replace.lock); 2275 + rwlock_init(&fs_info->dev_replace.lock); 2276 + atomic_set(&fs_info->dev_replace.read_locks, 0); 2277 + atomic_set(&fs_info->dev_replace.blocking_readers, 0); 2277 2278 init_waitqueue_head(&fs_info->replace_wait); 2279 + init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); 2278 2280 } 2279 2281 2280 2282 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) ··· 2387 2385 return -EIO; 2388 2386 } 2389 2387 2390 - log_tree_root = btrfs_alloc_root(fs_info); 2388 + log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); 2391 2389 if (!log_tree_root) 2392 2390 return -ENOMEM; 2393 2391 ··· 2512 2510 int backup_index = 0; 2513 2511 int max_active; 2514 2512 2515 - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2516 - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2513 + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); 2514 + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); 2517 2515 if (!tree_root || !chunk_root) { 2518 2516 err = -ENOMEM; 2519 2517 goto fail; ··· 2605 2603 atomic_set(&fs_info->nr_async_bios, 0); 2606 2604 atomic_set(&fs_info->defrag_running, 0); 2607 2605 atomic_set(&fs_info->qgroup_op_seq, 0); 2606 + atomic_set(&fs_info->reada_works_cnt, 0); 2608 2607 atomic64_set(&fs_info->tree_mod_seq, 0); 2609 2608 fs_info->sb = sb; 2610 2609 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; ··· 2625 2622 INIT_LIST_HEAD(&fs_info->ordered_roots); 2626 2623 spin_lock_init(&fs_info->ordered_root_lock); 2627 2624 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2628 - GFP_NOFS); 2625 + GFP_KERNEL); 2629 2626 if (!fs_info->delayed_root) { 2630 2627 err = -ENOMEM; 2631 2628 goto fail_iput; ··· 2753 2750 */ 2754 2751 fs_info->compress_type = BTRFS_COMPRESS_ZLIB; 2755 2752 2756 - ret = btrfs_parse_options(tree_root, options); 2753 + ret = btrfs_parse_options(tree_root, options, sb->s_flags); 2757 2754 if (ret) { 2758 2755 err = ret; 2759 2756 goto fail_alloc; ··· 3032 3029 if (ret) 3033 3030 goto fail_trans_kthread; 3034 3031 3035 - /* do not make disk changes in broken FS */ 3036 - if (btrfs_super_log_root(disk_super) != 0) { 3032 + /* do not make disk changes in broken FS or nologreplay is given */ 3033 + if (btrfs_super_log_root(disk_super) != 0 && 3034 + !btrfs_test_opt(tree_root, NOLOGREPLAY)) { 3037 3035 ret = btrfs_replay_log(fs_info, fs_devices); 3038 3036 if (ret) { 3039 3037 err = ret; ··· 3150 3146 3151 3147 fs_info->open = 1; 3152 3148 3149 + /* 3150 + * backuproot only affect mount behavior, and if open_ctree succeeded, 3151 + * no need to keep the flag 3152 + */ 3153 + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); 3154 + 3153 3155 return 0; 3154 3156 3155 3157 fail_qgroup: ··· 3210 3200 return err; 3211 3201 3212 3202 recovery_tree_root: 3213 - if (!btrfs_test_opt(tree_root, RECOVERY)) 3203 + if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) 3214 3204 goto fail_tree_roots; 3215 3205 3216 3206 free_root_pointers(fs_info, 0);

+22 -16

fs/btrfs/extent-tree.c

··· 4838 4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4839 4839 4840 4840 /* If we're just plain full then async reclaim just slows us down. */ 4841 - if (space_info->bytes_used >= thresh) 4841 + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4842 4842 return 0; 4843 4843 4844 4844 return (used >= thresh && !btrfs_fs_closing(fs_info) && ··· 5373 5373 5374 5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5375 5375 5376 - num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5377 - sinfo->bytes_reserved + sinfo->bytes_readonly + 5378 - sinfo->bytes_may_use; 5379 - 5380 - if (sinfo->total_bytes > num_bytes) { 5381 - num_bytes = sinfo->total_bytes - num_bytes; 5382 - block_rsv->reserved += num_bytes; 5383 - sinfo->bytes_may_use += num_bytes; 5384 - trace_btrfs_space_reservation(fs_info, "space_info", 5385 - sinfo->flags, num_bytes, 1); 5386 - } 5387 - 5388 - if (block_rsv->reserved >= block_rsv->size) { 5376 + if (block_rsv->reserved < block_rsv->size) { 5377 + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5378 + sinfo->bytes_reserved + sinfo->bytes_readonly + 5379 + sinfo->bytes_may_use; 5380 + if (sinfo->total_bytes > num_bytes) { 5381 + num_bytes = sinfo->total_bytes - num_bytes; 5382 + num_bytes = min(num_bytes, 5383 + block_rsv->size - block_rsv->reserved); 5384 + block_rsv->reserved += num_bytes; 5385 + sinfo->bytes_may_use += num_bytes; 5386 + trace_btrfs_space_reservation(fs_info, "space_info", 5387 + sinfo->flags, num_bytes, 5388 + 1); 5389 + } 5390 + } else if (block_rsv->reserved > block_rsv->size) { 5389 5391 num_bytes = block_rsv->reserved - block_rsv->size; 5390 5392 sinfo->bytes_may_use -= num_bytes; 5391 5393 trace_btrfs_space_reservation(fs_info, "space_info", 5392 5394 sinfo->flags, num_bytes, 0); 5393 5395 block_rsv->reserved = block_rsv->size; 5394 - block_rsv->full = 1; 5395 5396 } 5397 + 5398 + if (block_rsv->reserved == block_rsv->size) 5399 + block_rsv->full = 1; 5400 + else 5401 + block_rsv->full = 0; 5396 5402 5397 5403 spin_unlock(&block_rsv->lock); 5398 5404 spin_unlock(&sinfo->lock); ··· 7024 7018 struct btrfs_free_cluster *cluster, 7025 7019 int delalloc) 7026 7020 { 7027 - struct btrfs_block_group_cache *used_bg; 7021 + struct btrfs_block_group_cache *used_bg = NULL; 7028 7022 bool locked = false; 7029 7023 again: 7030 7024 spin_lock(&cluster->refill_lock);

+18 -22

fs/btrfs/extent_io.c

··· 206 206 * destroy caches. 207 207 */ 208 208 rcu_barrier(); 209 - if (extent_state_cache) 210 - kmem_cache_destroy(extent_state_cache); 211 - if (extent_buffer_cache) 212 - kmem_cache_destroy(extent_buffer_cache); 209 + kmem_cache_destroy(extent_state_cache); 210 + kmem_cache_destroy(extent_buffer_cache); 213 211 if (btrfs_bioset) 214 212 bioset_free(btrfs_bioset); 215 213 } ··· 230 232 if (!state) 231 233 return state; 232 234 state->state = 0; 233 - state->private = 0; 235 + state->failrec = NULL; 234 236 RB_CLEAR_NODE(&state->rb_node); 235 237 btrfs_leak_debug_add(&state->leak_list, &states); 236 238 atomic_set(&state->refs, 1); ··· 1842 1844 * set the private field for a given byte offset in the tree. If there isn't 1843 1845 * an extent_state there already, this does nothing. 1844 1846 */ 1845 - static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1847 + static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1848 + struct io_failure_record *failrec) 1846 1849 { 1847 1850 struct rb_node *node; 1848 1851 struct extent_state *state; ··· 1864 1865 ret = -ENOENT; 1865 1866 goto out; 1866 1867 } 1867 - state->private = private; 1868 + state->failrec = failrec; 1868 1869 out: 1869 1870 spin_unlock(&tree->lock); 1870 1871 return ret; 1871 1872 } 1872 1873 1873 - int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1874 + static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1875 + struct io_failure_record **failrec) 1874 1876 { 1875 1877 struct rb_node *node; 1876 1878 struct extent_state *state; ··· 1892 1892 ret = -ENOENT; 1893 1893 goto out; 1894 1894 } 1895 - *private = state->private; 1895 + *failrec = state->failrec; 1896 1896 out: 1897 1897 spin_unlock(&tree->lock); 1898 1898 return ret; ··· 1972 1972 int err = 0; 1973 1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1974 1974 1975 - set_state_private(failure_tree, rec->start, 0); 1975 + set_state_failrec(failure_tree, rec->start, NULL); 1976 1976 ret = clear_extent_bits(failure_tree, rec->start, 1977 1977 rec->start + rec->len - 1, 1978 1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); ··· 2089 2089 unsigned int pg_offset) 2090 2090 { 2091 2091 u64 private; 2092 - u64 private_failure; 2093 2092 struct io_failure_record *failrec; 2094 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2095 2094 struct extent_state *state; ··· 2101 2102 if (!ret) 2102 2103 return 0; 2103 2104 2104 - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2105 - &private_failure); 2105 + ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, 2106 + &failrec); 2106 2107 if (ret) 2107 2108 return 0; 2108 2109 2109 - failrec = (struct io_failure_record *)(unsigned long) private_failure; 2110 2110 BUG_ON(!failrec->this_mirror); 2111 2111 2112 2112 if (failrec->in_validation) { ··· 2165 2167 2166 2168 next = next_state(state); 2167 2169 2168 - failrec = (struct io_failure_record *)(unsigned long)state->private; 2170 + failrec = state->failrec; 2169 2171 free_extent_state(state); 2170 2172 kfree(failrec); 2171 2173 ··· 2175 2177 } 2176 2178 2177 2179 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2178 - struct io_failure_record **failrec_ret) 2180 + struct io_failure_record **failrec_ret) 2179 2181 { 2180 2182 struct io_failure_record *failrec; 2181 - u64 private; 2182 2183 struct extent_map *em; 2183 2184 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2184 2185 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ··· 2185 2188 int ret; 2186 2189 u64 logical; 2187 2190 2188 - ret = get_state_private(failure_tree, start, &private); 2191 + ret = get_state_failrec(failure_tree, start, &failrec); 2189 2192 if (ret) { 2190 2193 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2191 2194 if (!failrec) ··· 2234 2237 ret = set_extent_bits(failure_tree, start, end, 2235 2238 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2236 2239 if (ret >= 0) 2237 - ret = set_state_private(failure_tree, start, 2238 - (u64)(unsigned long)failrec); 2240 + ret = set_state_failrec(failure_tree, start, failrec); 2239 2241 /* set the bits in the inode's tree */ 2240 2242 if (ret >= 0) 2241 2243 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, ··· 2244 2248 return ret; 2245 2249 } 2246 2250 } else { 2247 - failrec = (struct io_failure_record *)(unsigned long)private; 2248 2251 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", 2249 2252 failrec->logical, failrec->start, failrec->len, 2250 2253 failrec->in_validation); ··· 3172 3177 3173 3178 while (1) { 3174 3179 lock_extent(tree, start, end); 3175 - ordered = btrfs_lookup_ordered_extent(inode, start); 3180 + ordered = btrfs_lookup_ordered_range(inode, start, 3181 + PAGE_CACHE_SIZE); 3176 3182 if (!ordered) 3177 3183 break; 3178 3184 unlock_extent(tree, start, end);

+2 -3

fs/btrfs/extent_io.h

··· 61 61 struct extent_state; 62 62 struct btrfs_root; 63 63 struct btrfs_io_bio; 64 + struct io_failure_record; 64 65 65 66 typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 66 67 struct bio *bio, int mirror_num, ··· 112 111 atomic_t refs; 113 112 unsigned state; 114 113 115 - /* for use by the FS */ 116 - u64 private; 114 + struct io_failure_record *failrec; 117 115 118 116 #ifdef CONFIG_BTRFS_DEBUG 119 117 struct list_head leak_list; ··· 342 342 get_extent_t get_extent); 343 343 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 344 344 __u64 start, __u64 len, get_extent_t *get_extent); 345 - int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 346 345 void set_page_extent_mapped(struct page *page); 347 346 348 347 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,

+1 -2

fs/btrfs/extent_map.c

··· 20 20 21 21 void extent_map_exit(void) 22 22 { 23 - if (extent_map_cache) 24 - kmem_cache_destroy(extent_map_cache); 23 + kmem_cache_destroy(extent_map_cache); 25 24 } 26 25 27 26 /**

+59 -33

fs/btrfs/file-item.c

··· 172 172 u64 item_start_offset = 0; 173 173 u64 item_last_offset = 0; 174 174 u64 disk_bytenr; 175 + u64 page_bytes_left; 175 176 u32 diff; 176 177 int nblocks; 177 178 int bio_index = 0; ··· 221 220 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; 222 221 if (dio) 223 222 offset = logical_offset; 223 + 224 + page_bytes_left = bvec->bv_len; 224 225 while (bio_index < bio->bi_vcnt) { 225 226 if (!dio) 226 227 offset = page_offset(bvec->bv_page) + bvec->bv_offset; ··· 246 243 if (BTRFS_I(inode)->root->root_key.objectid == 247 244 BTRFS_DATA_RELOC_TREE_OBJECTID) { 248 245 set_extent_bits(io_tree, offset, 249 - offset + bvec->bv_len - 1, 246 + offset + root->sectorsize - 1, 250 247 EXTENT_NODATASUM, GFP_NOFS); 251 248 } else { 252 249 btrfs_info(BTRFS_I(inode)->root->fs_info, ··· 284 281 found: 285 282 csum += count * csum_size; 286 283 nblocks -= count; 287 - bio_index += count; 284 + 288 285 while (count--) { 289 - disk_bytenr += bvec->bv_len; 290 - offset += bvec->bv_len; 291 - bvec++; 286 + disk_bytenr += root->sectorsize; 287 + offset += root->sectorsize; 288 + page_bytes_left -= root->sectorsize; 289 + if (!page_bytes_left) { 290 + bio_index++; 291 + bvec++; 292 + page_bytes_left = bvec->bv_len; 293 + } 294 + 292 295 } 293 296 } 294 297 btrfs_free_path(path); ··· 441 432 struct bio_vec *bvec = bio->bi_io_vec; 442 433 int bio_index = 0; 443 434 int index; 435 + int nr_sectors; 436 + int i; 444 437 unsigned long total_bytes = 0; 445 438 unsigned long this_sum_bytes = 0; 446 439 u64 offset; ··· 470 459 if (!contig) 471 460 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 472 461 473 - if (offset >= ordered->file_offset + ordered->len || 474 - offset < ordered->file_offset) { 475 - unsigned long bytes_left; 476 - sums->len = this_sum_bytes; 477 - this_sum_bytes = 0; 478 - btrfs_add_ordered_sum(inode, ordered, sums); 479 - btrfs_put_ordered_extent(ordered); 462 + data = kmap_atomic(bvec->bv_page); 480 463 481 - bytes_left = bio->bi_iter.bi_size - total_bytes; 464 + nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, 465 + bvec->bv_len + root->sectorsize 466 + - 1); 482 467 483 - sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 484 - GFP_NOFS); 485 - BUG_ON(!sums); /* -ENOMEM */ 486 - sums->len = bytes_left; 487 - ordered = btrfs_lookup_ordered_extent(inode, offset); 488 - BUG_ON(!ordered); /* Logic error */ 489 - sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + 490 - total_bytes; 491 - index = 0; 468 + for (i = 0; i < nr_sectors; i++) { 469 + if (offset >= ordered->file_offset + ordered->len || 470 + offset < ordered->file_offset) { 471 + unsigned long bytes_left; 472 + 473 + kunmap_atomic(data); 474 + sums->len = this_sum_bytes; 475 + this_sum_bytes = 0; 476 + btrfs_add_ordered_sum(inode, ordered, sums); 477 + btrfs_put_ordered_extent(ordered); 478 + 479 + bytes_left = bio->bi_iter.bi_size - total_bytes; 480 + 481 + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 482 + GFP_NOFS); 483 + BUG_ON(!sums); /* -ENOMEM */ 484 + sums->len = bytes_left; 485 + ordered = btrfs_lookup_ordered_extent(inode, 486 + offset); 487 + ASSERT(ordered); /* Logic error */ 488 + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) 489 + + total_bytes; 490 + index = 0; 491 + 492 + data = kmap_atomic(bvec->bv_page); 493 + } 494 + 495 + sums->sums[index] = ~(u32)0; 496 + sums->sums[index] 497 + = btrfs_csum_data(data + bvec->bv_offset 498 + + (i * root->sectorsize), 499 + sums->sums[index], 500 + root->sectorsize); 501 + btrfs_csum_final(sums->sums[index], 502 + (char *)(sums->sums + index)); 503 + index++; 504 + offset += root->sectorsize; 505 + this_sum_bytes += root->sectorsize; 506 + total_bytes += root->sectorsize; 492 507 } 493 508 494 - data = kmap_atomic(bvec->bv_page); 495 - sums->sums[index] = ~(u32)0; 496 - sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, 497 - sums->sums[index], 498 - bvec->bv_len); 499 509 kunmap_atomic(data); 500 - btrfs_csum_final(sums->sums[index], 501 - (char *)(sums->sums + index)); 502 510 503 511 bio_index++; 504 - index++; 505 - total_bytes += bvec->bv_len; 506 - this_sum_bytes += bvec->bv_len; 507 - offset += bvec->bv_len; 508 512 bvec++; 509 513 } 510 514 this_sum_bytes = 0;

+83 -63

fs/btrfs/file.c

··· 498 498 loff_t isize = i_size_read(inode); 499 499 500 500 start_pos = pos & ~((u64)root->sectorsize - 1); 501 - num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 501 + num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize); 502 502 503 503 end_of_last_block = start_pos + num_bytes - 1; 504 504 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, ··· 1379 1379 static noinline int 1380 1380 lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, 1381 1381 size_t num_pages, loff_t pos, 1382 + size_t write_bytes, 1382 1383 u64 *lockstart, u64 *lockend, 1383 1384 struct extent_state **cached_state) 1384 1385 { 1386 + struct btrfs_root *root = BTRFS_I(inode)->root; 1385 1387 u64 start_pos; 1386 1388 u64 last_pos; 1387 1389 int i; 1388 1390 int ret = 0; 1389 1391 1390 - start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); 1391 - last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; 1392 + start_pos = round_down(pos, root->sectorsize); 1393 + last_pos = start_pos 1394 + + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1; 1392 1395 1393 1396 if (start_pos < inode->i_size) { 1394 1397 struct btrfs_ordered_extent *ordered; ··· 1506 1503 1507 1504 while (iov_iter_count(i) > 0) { 1508 1505 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1506 + size_t sector_offset; 1509 1507 size_t write_bytes = min(iov_iter_count(i), 1510 1508 nrptrs * (size_t)PAGE_CACHE_SIZE - 1511 1509 offset); ··· 1515 1511 size_t reserve_bytes; 1516 1512 size_t dirty_pages; 1517 1513 size_t copied; 1514 + size_t dirty_sectors; 1515 + size_t num_sectors; 1518 1516 1519 1517 WARN_ON(num_pages > nrptrs); 1520 1518 ··· 1529 1523 break; 1530 1524 } 1531 1525 1532 - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1526 + sector_offset = pos & (root->sectorsize - 1); 1527 + reserve_bytes = round_up(write_bytes + sector_offset, 1528 + root->sectorsize); 1533 1529 1534 - if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1535 - BTRFS_INODE_PREALLOC)) { 1536 - ret = check_can_nocow(inode, pos, &write_bytes); 1537 - if (ret < 0) 1538 - break; 1539 - if (ret > 0) { 1540 - /* 1541 - * For nodata cow case, no need to reserve 1542 - * data space. 1543 - */ 1544 - only_release_metadata = true; 1545 - /* 1546 - * our prealloc extent may be smaller than 1547 - * write_bytes, so scale down. 1548 - */ 1549 - num_pages = DIV_ROUND_UP(write_bytes + offset, 1550 - PAGE_CACHE_SIZE); 1551 - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1552 - goto reserve_metadata; 1553 - } 1530 + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1531 + BTRFS_INODE_PREALLOC)) && 1532 + check_can_nocow(inode, pos, &write_bytes) > 0) { 1533 + /* 1534 + * For nodata cow case, no need to reserve 1535 + * data space. 1536 + */ 1537 + only_release_metadata = true; 1538 + /* 1539 + * our prealloc extent may be smaller than 1540 + * write_bytes, so scale down. 1541 + */ 1542 + num_pages = DIV_ROUND_UP(write_bytes + offset, 1543 + PAGE_CACHE_SIZE); 1544 + reserve_bytes = round_up(write_bytes + sector_offset, 1545 + root->sectorsize); 1546 + goto reserve_metadata; 1554 1547 } 1548 + 1555 1549 ret = btrfs_check_data_free_space(inode, pos, write_bytes); 1556 1550 if (ret < 0) 1557 1551 break; ··· 1582 1576 break; 1583 1577 1584 1578 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, 1585 - pos, &lockstart, &lockend, 1586 - &cached_state); 1579 + pos, write_bytes, &lockstart, 1580 + &lockend, &cached_state); 1587 1581 if (ret < 0) { 1588 1582 if (ret == -EAGAIN) 1589 1583 goto again; ··· 1618 1612 * we still have an outstanding extent for the chunk we actually 1619 1613 * managed to copy. 1620 1614 */ 1621 - if (num_pages > dirty_pages) { 1622 - release_bytes = (num_pages - dirty_pages) << 1623 - PAGE_CACHE_SHIFT; 1615 + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, 1616 + reserve_bytes); 1617 + dirty_sectors = round_up(copied + sector_offset, 1618 + root->sectorsize); 1619 + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, 1620 + dirty_sectors); 1621 + 1622 + if (num_sectors > dirty_sectors) { 1623 + release_bytes = (write_bytes - copied) 1624 + & ~((u64)root->sectorsize - 1); 1624 1625 if (copied > 0) { 1625 1626 spin_lock(&BTRFS_I(inode)->lock); 1626 1627 BTRFS_I(inode)->outstanding_extents++; ··· 1646 1633 } 1647 1634 } 1648 1635 1649 - release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1636 + release_bytes = round_up(copied + sector_offset, 1637 + root->sectorsize); 1650 1638 1651 1639 if (copied > 0) 1652 1640 ret = btrfs_dirty_pages(root, inode, pages, ··· 1668 1654 1669 1655 if (only_release_metadata && copied > 0) { 1670 1656 lockstart = round_down(pos, root->sectorsize); 1671 - lockend = lockstart + 1672 - (dirty_pages << PAGE_CACHE_SHIFT) - 1; 1657 + lockend = round_up(pos + copied, root->sectorsize) - 1; 1673 1658 1674 1659 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1675 1660 lockend, EXTENT_NORESERVE, NULL, ··· 1774 1761 ssize_t err; 1775 1762 loff_t pos; 1776 1763 size_t count; 1764 + loff_t oldsize; 1765 + int clean_page = 0; 1777 1766 1778 1767 inode_lock(inode); 1779 1768 err = generic_write_checks(iocb, from); ··· 1814 1799 pos = iocb->ki_pos; 1815 1800 count = iov_iter_count(from); 1816 1801 start_pos = round_down(pos, root->sectorsize); 1817 - if (start_pos > i_size_read(inode)) { 1802 + oldsize = i_size_read(inode); 1803 + if (start_pos > oldsize) { 1818 1804 /* Expand hole size to cover write data, preventing empty gap */ 1819 1805 end_pos = round_up(pos + count, root->sectorsize); 1820 - err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); 1806 + err = btrfs_cont_expand(inode, oldsize, end_pos); 1821 1807 if (err) { 1822 1808 inode_unlock(inode); 1823 1809 goto out; 1824 1810 } 1811 + if (start_pos > round_up(oldsize, root->sectorsize)) 1812 + clean_page = 1; 1825 1813 } 1826 1814 1827 1815 if (sync) ··· 1836 1818 num_written = __btrfs_buffered_write(file, from, pos); 1837 1819 if (num_written > 0) 1838 1820 iocb->ki_pos = pos + num_written; 1821 + if (clean_page) 1822 + pagecache_isize_extended(inode, oldsize, 1823 + i_size_read(inode)); 1839 1824 } 1840 1825 1841 1826 inode_unlock(inode); ··· 2314 2293 int ret = 0; 2315 2294 int err = 0; 2316 2295 unsigned int rsv_count; 2317 - bool same_page; 2296 + bool same_block; 2318 2297 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2319 2298 u64 ino_size; 2320 - bool truncated_page = false; 2299 + bool truncated_block = false; 2321 2300 bool updated_inode = false; 2322 2301 2323 2302 ret = btrfs_wait_ordered_range(inode, offset, len); ··· 2325 2304 return ret; 2326 2305 2327 2306 inode_lock(inode); 2328 - ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2307 + ino_size = round_up(inode->i_size, root->sectorsize); 2329 2308 ret = find_first_non_hole(inode, &offset, &len); 2330 2309 if (ret < 0) 2331 2310 goto out_only_mutex; ··· 2338 2317 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2339 2318 lockend = round_down(offset + len, 2340 2319 BTRFS_I(inode)->root->sectorsize) - 1; 2341 - same_page = ((offset >> PAGE_CACHE_SHIFT) == 2342 - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2343 - 2320 + same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset)) 2321 + == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1)); 2344 2322 /* 2345 - * We needn't truncate any page which is beyond the end of the file 2323 + * We needn't truncate any block which is beyond the end of the file 2346 2324 * because we are sure there is no data there. 2347 2325 */ 2348 2326 /* 2349 - * Only do this if we are in the same page and we aren't doing the 2350 - * entire page. 2327 + * Only do this if we are in the same block and we aren't doing the 2328 + * entire block. 2351 2329 */ 2352 - if (same_page && len < PAGE_CACHE_SIZE) { 2330 + if (same_block && len < root->sectorsize) { 2353 2331 if (offset < ino_size) { 2354 - truncated_page = true; 2355 - ret = btrfs_truncate_page(inode, offset, len, 0); 2332 + truncated_block = true; 2333 + ret = btrfs_truncate_block(inode, offset, len, 0); 2356 2334 } else { 2357 2335 ret = 0; 2358 2336 } 2359 2337 goto out_only_mutex; 2360 2338 } 2361 2339 2362 - /* zero back part of the first page */ 2340 + /* zero back part of the first block */ 2363 2341 if (offset < ino_size) { 2364 - truncated_page = true; 2365 - ret = btrfs_truncate_page(inode, offset, 0, 0); 2342 + truncated_block = true; 2343 + ret = btrfs_truncate_block(inode, offset, 0, 0); 2366 2344 if (ret) { 2367 2345 inode_unlock(inode); 2368 2346 return ret; ··· 2396 2376 if (!ret) { 2397 2377 /* zero the front end of the last page */ 2398 2378 if (tail_start + tail_len < ino_size) { 2399 - truncated_page = true; 2400 - ret = btrfs_truncate_page(inode, 2401 - tail_start + tail_len, 0, 1); 2379 + truncated_block = true; 2380 + ret = btrfs_truncate_block(inode, 2381 + tail_start + tail_len, 2382 + 0, 1); 2402 2383 if (ret) 2403 2384 goto out_only_mutex; 2404 2385 } ··· 2565 2544 goto out_free; 2566 2545 2567 2546 inode_inc_iversion(inode); 2568 - inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2547 + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 2569 2548 2570 2549 trans->block_rsv = &root->fs_info->trans_block_rsv; 2571 2550 ret = btrfs_update_inode(trans, root, inode); ··· 2579 2558 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2580 2559 &cached_state, GFP_NOFS); 2581 2560 out_only_mutex: 2582 - if (!updated_inode && truncated_page && !ret && !err) { 2561 + if (!updated_inode && truncated_block && !ret && !err) { 2583 2562 /* 2584 2563 * If we only end up zeroing part of a page, we still need to 2585 2564 * update the inode item, so that all the time fields are ··· 2632 2611 return 0; 2633 2612 } 2634 2613 insert: 2635 - range = kmalloc(sizeof(*range), GFP_NOFS); 2614 + range = kmalloc(sizeof(*range), GFP_KERNEL); 2636 2615 if (!range) 2637 2616 return -ENOMEM; 2638 2617 range->start = start; ··· 2699 2678 } else if (offset + len > inode->i_size) { 2700 2679 /* 2701 2680 * If we are fallocating from the end of the file onward we 2702 - * need to zero out the end of the page if i_size lands in the 2703 - * middle of a page. 2681 + * need to zero out the end of the block if i_size lands in the 2682 + * middle of a block. 2704 2683 */ 2705 - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2684 + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); 2706 2685 if (ret) 2707 2686 goto out; 2708 2687 } ··· 2733 2712 btrfs_put_ordered_extent(ordered); 2734 2713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2735 2714 alloc_start, locked_end, 2736 - &cached_state, GFP_NOFS); 2715 + &cached_state, GFP_KERNEL); 2737 2716 /* 2738 2717 * we can't wait on the range with the transaction 2739 2718 * running or with the extent lock held ··· 2815 2794 if (IS_ERR(trans)) { 2816 2795 ret = PTR_ERR(trans); 2817 2796 } else { 2818 - inode->i_ctime = CURRENT_TIME; 2797 + inode->i_ctime = current_fs_time(inode->i_sb); 2819 2798 i_size_write(inode, actual_end); 2820 2799 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2821 2800 ret = btrfs_update_inode(trans, root, inode); ··· 2827 2806 } 2828 2807 out_unlock: 2829 2808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2830 - &cached_state, GFP_NOFS); 2809 + &cached_state, GFP_KERNEL); 2831 2810 out: 2832 2811 /* 2833 2812 * As we waited the extent range, the data_rsv_map must be empty ··· 2960 2939 2961 2940 void btrfs_auto_defrag_exit(void) 2962 2941 { 2963 - if (btrfs_inode_defrag_cachep) 2964 - kmem_cache_destroy(btrfs_inode_defrag_cachep); 2942 + kmem_cache_destroy(btrfs_inode_defrag_cachep); 2965 2943 } 2966 2944 2967 2945 int btrfs_auto_defrag_init(void)

+203 -98

fs/btrfs/inode.c

··· 263 263 data_len = compressed_size; 264 264 265 265 if (start > 0 || 266 - actual_end > PAGE_CACHE_SIZE || 266 + actual_end > root->sectorsize || 267 267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || 268 268 (!compressed_size && 269 269 (actual_end & (root->sectorsize - 1)) == 0) || ··· 2002 2002 if (PagePrivate2(page)) 2003 2003 goto out; 2004 2004 2005 - ordered = btrfs_lookup_ordered_extent(inode, page_start); 2005 + ordered = btrfs_lookup_ordered_range(inode, page_start, 2006 + PAGE_CACHE_SIZE); 2006 2007 if (ordered) { 2007 2008 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2008 2009 page_end, &cached_state, GFP_NOFS); ··· 4014 4013 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4015 4014 inode_inc_iversion(inode); 4016 4015 inode_inc_iversion(dir); 4017 - inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4016 + inode->i_ctime = dir->i_mtime = 4017 + dir->i_ctime = current_fs_time(inode->i_sb); 4018 4018 ret = btrfs_update_inode(trans, root, dir); 4019 4019 out: 4020 4020 return ret; ··· 4158 4156 4159 4157 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4160 4158 inode_inc_iversion(dir); 4161 - dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4159 + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); 4162 4160 ret = btrfs_update_inode_fallback(trans, root, dir); 4163 4161 if (ret) 4164 4162 btrfs_abort_transaction(trans, root, ret); ··· 4213 4211 { 4214 4212 int ret; 4215 4213 4214 + /* 4215 + * This is only used to apply pressure to the enospc system, we don't 4216 + * intend to use this reservation at all. 4217 + */ 4216 4218 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); 4219 + bytes_deleted *= root->nodesize; 4217 4220 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, 4218 4221 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4219 - if (!ret) 4222 + if (!ret) { 4223 + trace_btrfs_space_reservation(root->fs_info, "transaction", 4224 + trans->transid, 4225 + bytes_deleted, 1); 4220 4226 trans->bytes_reserved += bytes_deleted; 4227 + } 4221 4228 return ret; 4222 4229 4223 4230 } ··· 4259 4248 * read the extent item from disk (data not in the page cache). 4260 4249 */ 4261 4250 btrfs_release_path(path); 4262 - return btrfs_truncate_page(inode, offset, page_end - offset, 0); 4251 + return btrfs_truncate_block(inode, offset, page_end - offset, 4252 + 0); 4263 4253 } 4264 4254 4265 4255 btrfs_set_file_extent_ram_bytes(leaf, fi, size); ··· 4613 4601 } 4614 4602 4615 4603 /* 4616 - * btrfs_truncate_page - read, zero a chunk and write a page 4604 + * btrfs_truncate_block - read, zero a chunk and write a block 4617 4605 * @inode - inode that we're zeroing 4618 4606 * @from - the offset to start zeroing 4619 4607 * @len - the length to zero, 0 to zero the entire range respective to the 4620 4608 * offset 4621 4609 * @front - zero up to the offset instead of from the offset on 4622 4610 * 4623 - * This will find the page for the "from" offset and cow the page and zero the 4611 + * This will find the block for the "from" offset and cow the block and zero the 4624 4612 * part we want to zero. This is used with truncate and hole punching. 4625 4613 */ 4626 - int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4614 + int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, 4627 4615 int front) 4628 4616 { 4629 4617 struct address_space *mapping = inode->i_mapping; ··· 4634 4622 char *kaddr; 4635 4623 u32 blocksize = root->sectorsize; 4636 4624 pgoff_t index = from >> PAGE_CACHE_SHIFT; 4637 - unsigned offset = from & (PAGE_CACHE_SIZE-1); 4625 + unsigned offset = from & (blocksize - 1); 4638 4626 struct page *page; 4639 4627 gfp_t mask = btrfs_alloc_write_mask(mapping); 4640 4628 int ret = 0; 4641 - u64 page_start; 4642 - u64 page_end; 4629 + u64 block_start; 4630 + u64 block_end; 4643 4631 4644 4632 if ((offset & (blocksize - 1)) == 0 && 4645 4633 (!len || ((len & (blocksize - 1)) == 0))) 4646 4634 goto out; 4635 + 4647 4636 ret = btrfs_delalloc_reserve_space(inode, 4648 - round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); 4637 + round_down(from, blocksize), blocksize); 4649 4638 if (ret) 4650 4639 goto out; 4651 4640 ··· 4654 4641 page = find_or_create_page(mapping, index, mask); 4655 4642 if (!page) { 4656 4643 btrfs_delalloc_release_space(inode, 4657 - round_down(from, PAGE_CACHE_SIZE), 4658 - PAGE_CACHE_SIZE); 4644 + round_down(from, blocksize), 4645 + blocksize); 4659 4646 ret = -ENOMEM; 4660 4647 goto out; 4661 4648 } 4662 4649 4663 - page_start = page_offset(page); 4664 - page_end = page_start + PAGE_CACHE_SIZE - 1; 4650 + block_start = round_down(from, blocksize); 4651 + block_end = block_start + blocksize - 1; 4665 4652 4666 4653 if (!PageUptodate(page)) { 4667 4654 ret = btrfs_readpage(NULL, page); ··· 4678 4665 } 4679 4666 wait_on_page_writeback(page); 4680 4667 4681 - lock_extent_bits(io_tree, page_start, page_end, &cached_state); 4668 + lock_extent_bits(io_tree, block_start, block_end, &cached_state); 4682 4669 set_page_extent_mapped(page); 4683 4670 4684 - ordered = btrfs_lookup_ordered_extent(inode, page_start); 4671 + ordered = btrfs_lookup_ordered_extent(inode, block_start); 4685 4672 if (ordered) { 4686 - unlock_extent_cached(io_tree, page_start, page_end, 4673 + unlock_extent_cached(io_tree, block_start, block_end, 4687 4674 &cached_state, GFP_NOFS); 4688 4675 unlock_page(page); 4689 4676 page_cache_release(page); ··· 4692 4679 goto again; 4693 4680 } 4694 4681 4695 - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 4682 + clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, 4696 4683 EXTENT_DIRTY | EXTENT_DELALLOC | 4697 4684 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4698 4685 0, 0, &cached_state, GFP_NOFS); 4699 4686 4700 - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 4687 + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 4701 4688 &cached_state); 4702 4689 if (ret) { 4703 - unlock_extent_cached(io_tree, page_start, page_end, 4690 + unlock_extent_cached(io_tree, block_start, block_end, 4704 4691 &cached_state, GFP_NOFS); 4705 4692 goto out_unlock; 4706 4693 } 4707 4694 4708 - if (offset != PAGE_CACHE_SIZE) { 4695 + if (offset != blocksize) { 4709 4696 if (!len) 4710 - len = PAGE_CACHE_SIZE - offset; 4697 + len = blocksize - offset; 4711 4698 kaddr = kmap(page); 4712 4699 if (front) 4713 - memset(kaddr, 0, offset); 4700 + memset(kaddr + (block_start - page_offset(page)), 4701 + 0, offset); 4714 4702 else 4715 - memset(kaddr + offset, 0, len); 4703 + memset(kaddr + (block_start - page_offset(page)) + offset, 4704 + 0, len); 4716 4705 flush_dcache_page(page); 4717 4706 kunmap(page); 4718 4707 } 4719 4708 ClearPageChecked(page); 4720 4709 set_page_dirty(page); 4721 - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 4710 + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, 4722 4711 GFP_NOFS); 4723 4712 4724 4713 out_unlock: 4725 4714 if (ret) 4726 - btrfs_delalloc_release_space(inode, page_start, 4727 - PAGE_CACHE_SIZE); 4715 + btrfs_delalloc_release_space(inode, block_start, 4716 + blocksize); 4728 4717 unlock_page(page); 4729 4718 page_cache_release(page); 4730 4719 out: ··· 4797 4782 int err = 0; 4798 4783 4799 4784 /* 4800 - * If our size started in the middle of a page we need to zero out the 4801 - * rest of the page before we expand the i_size, otherwise we could 4785 + * If our size started in the middle of a block we need to zero out the 4786 + * rest of the block before we expand the i_size, otherwise we could 4802 4787 * expose stale data. 4803 4788 */ 4804 - err = btrfs_truncate_page(inode, oldsize, 0, 0); 4789 + err = btrfs_truncate_block(inode, oldsize, 0, 0); 4805 4790 if (err) 4806 4791 return err; 4807 4792 ··· 4910 4895 } 4911 4896 4912 4897 if (newsize > oldsize) { 4913 - truncate_pagecache(inode, newsize); 4914 4898 /* 4915 4899 * Don't do an expanding truncate while snapshoting is ongoing. 4916 4900 * This is to ensure the snapshot captures a fully consistent ··· 4932 4918 4933 4919 i_size_write(inode, newsize); 4934 4920 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4921 + pagecache_isize_extended(inode, oldsize, newsize); 4935 4922 ret = btrfs_update_inode(trans, root, inode); 4936 4923 btrfs_end_write_no_snapshoting(root); 4937 4924 btrfs_end_transaction(trans, root); ··· 5603 5588 inode->i_op = &btrfs_dir_ro_inode_operations; 5604 5589 inode->i_fop = &simple_dir_operations; 5605 5590 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5606 - inode->i_mtime = CURRENT_TIME; 5591 + inode->i_mtime = current_fs_time(inode->i_sb); 5607 5592 inode->i_atime = inode->i_mtime; 5608 5593 inode->i_ctime = inode->i_mtime; 5609 5594 BTRFS_I(inode)->i_otime = inode->i_mtime; ··· 5805 5790 if (name_len <= sizeof(tmp_name)) { 5806 5791 name_ptr = tmp_name; 5807 5792 } else { 5808 - name_ptr = kmalloc(name_len, GFP_NOFS); 5793 + name_ptr = kmalloc(name_len, GFP_KERNEL); 5809 5794 if (!name_ptr) { 5810 5795 ret = -ENOMEM; 5811 5796 goto err; ··· 6187 6172 inode_init_owner(inode, dir, mode); 6188 6173 inode_set_bytes(inode, 0); 6189 6174 6190 - inode->i_mtime = CURRENT_TIME; 6175 + inode->i_mtime = current_fs_time(inode->i_sb); 6191 6176 inode->i_atime = inode->i_mtime; 6192 6177 inode->i_ctime = inode->i_mtime; 6193 6178 BTRFS_I(inode)->i_otime = inode->i_mtime; ··· 6300 6285 btrfs_i_size_write(parent_inode, parent_inode->i_size + 6301 6286 name_len * 2); 6302 6287 inode_inc_iversion(parent_inode); 6303 - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 6288 + parent_inode->i_mtime = parent_inode->i_ctime = 6289 + current_fs_time(parent_inode->i_sb); 6304 6290 ret = btrfs_update_inode(trans, root, parent_inode); 6305 6291 if (ret) 6306 6292 btrfs_abort_transaction(trans, root, ret); ··· 6519 6503 BTRFS_I(inode)->dir_index = 0ULL; 6520 6504 inc_nlink(inode); 6521 6505 inode_inc_iversion(inode); 6522 - inode->i_ctime = CURRENT_TIME; 6506 + inode->i_ctime = current_fs_time(inode->i_sb); 6523 6507 ihold(inode); 6524 6508 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6525 6509 ··· 7780 7764 } 7781 7765 7782 7766 static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7783 - struct page *page, u64 start, u64 end, 7784 - int failed_mirror, bio_end_io_t *repair_endio, 7785 - void *repair_arg) 7767 + struct page *page, unsigned int pgoff, 7768 + u64 start, u64 end, int failed_mirror, 7769 + bio_end_io_t *repair_endio, void *repair_arg) 7786 7770 { 7787 7771 struct io_failure_record *failrec; 7788 7772 struct bio *bio; ··· 7803 7787 return -EIO; 7804 7788 } 7805 7789 7806 - if (failed_bio->bi_vcnt > 1) 7790 + if ((failed_bio->bi_vcnt > 1) 7791 + || (failed_bio->bi_io_vec->bv_len 7792 + > BTRFS_I(inode)->root->sectorsize)) 7807 7793 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7808 7794 else 7809 7795 read_mode = READ_SYNC; ··· 7813 7795 isector = start - btrfs_io_bio(failed_bio)->logical; 7814 7796 isector >>= inode->i_sb->s_blocksize_bits; 7815 7797 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7816 - 0, isector, repair_endio, repair_arg); 7798 + pgoff, isector, repair_endio, repair_arg); 7817 7799 if (!bio) { 7818 7800 free_io_failure(inode, failrec); 7819 7801 return -EIO; ··· 7843 7825 static void btrfs_retry_endio_nocsum(struct bio *bio) 7844 7826 { 7845 7827 struct btrfs_retry_complete *done = bio->bi_private; 7828 + struct inode *inode; 7846 7829 struct bio_vec *bvec; 7847 7830 int i; 7848 7831 7849 7832 if (bio->bi_error) 7850 7833 goto end; 7834 + 7835 + ASSERT(bio->bi_vcnt == 1); 7836 + inode = bio->bi_io_vec->bv_page->mapping->host; 7837 + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); 7851 7838 7852 7839 done->uptodate = 1; 7853 7840 bio_for_each_segment_all(bvec, bio, i) ··· 7865 7842 static int __btrfs_correct_data_nocsum(struct inode *inode, 7866 7843 struct btrfs_io_bio *io_bio) 7867 7844 { 7845 + struct btrfs_fs_info *fs_info; 7868 7846 struct bio_vec *bvec; 7869 7847 struct btrfs_retry_complete done; 7870 7848 u64 start; 7849 + unsigned int pgoff; 7850 + u32 sectorsize; 7851 + int nr_sectors; 7871 7852 int i; 7872 7853 int ret; 7854 + 7855 + fs_info = BTRFS_I(inode)->root->fs_info; 7856 + sectorsize = BTRFS_I(inode)->root->sectorsize; 7873 7857 7874 7858 start = io_bio->logical; 7875 7859 done.inode = inode; 7876 7860 7877 7861 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7878 - try_again: 7862 + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 7863 + pgoff = bvec->bv_offset; 7864 + 7865 + next_block_or_try_again: 7879 7866 done.uptodate = 0; 7880 7867 done.start = start; 7881 7868 init_completion(&done.done); 7882 7869 7883 - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7884 - start + bvec->bv_len - 1, 7885 - io_bio->mirror_num, 7886 - btrfs_retry_endio_nocsum, &done); 7870 + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 7871 + pgoff, start, start + sectorsize - 1, 7872 + io_bio->mirror_num, 7873 + btrfs_retry_endio_nocsum, &done); 7887 7874 if (ret) 7888 7875 return ret; 7889 7876 ··· 7901 7868 7902 7869 if (!done.uptodate) { 7903 7870 /* We might have another mirror, so try again */ 7904 - goto try_again; 7871 + goto next_block_or_try_again; 7905 7872 } 7906 7873 7907 - start += bvec->bv_len; 7874 + start += sectorsize; 7875 + 7876 + if (nr_sectors--) { 7877 + pgoff += sectorsize; 7878 + goto next_block_or_try_again; 7879 + } 7908 7880 } 7909 7881 7910 7882 return 0; ··· 7919 7881 { 7920 7882 struct btrfs_retry_complete *done = bio->bi_private; 7921 7883 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7884 + struct inode *inode; 7922 7885 struct bio_vec *bvec; 7886 + u64 start; 7923 7887 int uptodate; 7924 7888 int ret; 7925 7889 int i; ··· 7930 7890 goto end; 7931 7891 7932 7892 uptodate = 1; 7893 + 7894 + start = done->start; 7895 + 7896 + ASSERT(bio->bi_vcnt == 1); 7897 + inode = bio->bi_io_vec->bv_page->mapping->host; 7898 + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); 7899 + 7933 7900 bio_for_each_segment_all(bvec, bio, i) { 7934 7901 ret = __readpage_endio_check(done->inode, io_bio, i, 7935 - bvec->bv_page, 0, 7936 - done->start, bvec->bv_len); 7902 + bvec->bv_page, bvec->bv_offset, 7903 + done->start, bvec->bv_len); 7937 7904 if (!ret) 7938 7905 clean_io_failure(done->inode, done->start, 7939 - bvec->bv_page, 0); 7906 + bvec->bv_page, bvec->bv_offset); 7940 7907 else 7941 7908 uptodate = 0; 7942 7909 } ··· 7957 7910 static int __btrfs_subio_endio_read(struct inode *inode, 7958 7911 struct btrfs_io_bio *io_bio, int err) 7959 7912 { 7913 + struct btrfs_fs_info *fs_info; 7960 7914 struct bio_vec *bvec; 7961 7915 struct btrfs_retry_complete done; 7962 7916 u64 start; 7963 7917 u64 offset = 0; 7918 + u32 sectorsize; 7919 + int nr_sectors; 7920 + unsigned int pgoff; 7921 + int csum_pos; 7964 7922 int i; 7965 7923 int ret; 7924 + 7925 + fs_info = BTRFS_I(inode)->root->fs_info; 7926 + sectorsize = BTRFS_I(inode)->root->sectorsize; 7966 7927 7967 7928 err = 0; 7968 7929 start = io_bio->logical; 7969 7930 done.inode = inode; 7970 7931 7971 7932 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7972 - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 7973 - 0, start, bvec->bv_len); 7933 + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 7934 + 7935 + pgoff = bvec->bv_offset; 7936 + next_block: 7937 + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); 7938 + ret = __readpage_endio_check(inode, io_bio, csum_pos, 7939 + bvec->bv_page, pgoff, start, 7940 + sectorsize); 7974 7941 if (likely(!ret)) 7975 7942 goto next; 7976 7943 try_again: ··· 7992 7931 done.start = start; 7993 7932 init_completion(&done.done); 7994 7933 7995 - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7996 - start + bvec->bv_len - 1, 7997 - io_bio->mirror_num, 7998 - btrfs_retry_endio, &done); 7934 + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 7935 + pgoff, start, start + sectorsize - 1, 7936 + io_bio->mirror_num, 7937 + btrfs_retry_endio, &done); 7999 7938 if (ret) { 8000 7939 err = ret; 8001 7940 goto next; ··· 8008 7947 goto try_again; 8009 7948 } 8010 7949 next: 8011 - offset += bvec->bv_len; 8012 - start += bvec->bv_len; 7950 + offset += sectorsize; 7951 + start += sectorsize; 7952 + 7953 + ASSERT(nr_sectors); 7954 + 7955 + if (--nr_sectors) { 7956 + pgoff += sectorsize; 7957 + goto next_block; 7958 + } 8013 7959 } 8014 7960 8015 7961 return err; ··· 8270 8202 u64 file_offset = dip->logical_offset; 8271 8203 u64 submit_len = 0; 8272 8204 u64 map_length; 8273 - int nr_pages = 0; 8274 - int ret; 8205 + u32 blocksize = root->sectorsize; 8275 8206 int async_submit = 0; 8207 + int nr_sectors; 8208 + int ret; 8209 + int i; 8276 8210 8277 8211 map_length = orig_bio->bi_iter.bi_size; 8278 8212 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, ··· 8304 8234 atomic_inc(&dip->pending_bios); 8305 8235 8306 8236 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 8307 - if (map_length < submit_len + bvec->bv_len || 8308 - bio_add_page(bio, bvec->bv_page, bvec->bv_len, 8309 - bvec->bv_offset) < bvec->bv_len) { 8237 + nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); 8238 + i = 0; 8239 + next_block: 8240 + if (unlikely(map_length < submit_len + blocksize || 8241 + bio_add_page(bio, bvec->bv_page, blocksize, 8242 + bvec->bv_offset + (i * blocksize)) < blocksize)) { 8310 8243 /* 8311 8244 * inc the count before we submit the bio so 8312 8245 * we know the end IO handler won't happen before ··· 8330 8257 file_offset += submit_len; 8331 8258 8332 8259 submit_len = 0; 8333 - nr_pages = 0; 8334 8260 8335 8261 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8336 8262 start_sector, GFP_NOFS); ··· 8347 8275 bio_put(bio); 8348 8276 goto out_err; 8349 8277 } 8278 + 8279 + goto next_block; 8350 8280 } else { 8351 - submit_len += bvec->bv_len; 8352 - nr_pages++; 8281 + submit_len += blocksize; 8282 + if (--nr_sectors) { 8283 + i++; 8284 + goto next_block; 8285 + } 8353 8286 bvec++; 8354 8287 } 8355 8288 } ··· 8719 8642 struct extent_state *cached_state = NULL; 8720 8643 u64 page_start = page_offset(page); 8721 8644 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 8645 + u64 start; 8646 + u64 end; 8722 8647 int inode_evicting = inode->i_state & I_FREEING; 8723 8648 8724 8649 /* ··· 8740 8661 8741 8662 if (!inode_evicting) 8742 8663 lock_extent_bits(tree, page_start, page_end, &cached_state); 8743 - ordered = btrfs_lookup_ordered_extent(inode, page_start); 8664 + again: 8665 + start = page_start; 8666 + ordered = btrfs_lookup_ordered_range(inode, start, 8667 + page_end - start + 1); 8744 8668 if (ordered) { 8669 + end = min(page_end, ordered->file_offset + ordered->len - 1); 8745 8670 /* 8746 8671 * IO on this page will never be started, so we need 8747 8672 * to account for any ordered extents now 8748 8673 */ 8749 8674 if (!inode_evicting) 8750 - clear_extent_bit(tree, page_start, page_end, 8675 + clear_extent_bit(tree, start, end, 8751 8676 EXTENT_DIRTY | EXTENT_DELALLOC | 8752 8677 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8753 8678 EXTENT_DEFRAG, 1, 0, &cached_state, ··· 8768 8685 8769 8686 spin_lock_irq(&tree->lock); 8770 8687 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8771 - new_len = page_start - ordered->file_offset; 8688 + new_len = start - ordered->file_offset; 8772 8689 if (new_len < ordered->truncated_len) 8773 8690 ordered->truncated_len = new_len; 8774 8691 spin_unlock_irq(&tree->lock); 8775 8692 8776 8693 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8777 - page_start, 8778 - PAGE_CACHE_SIZE, 1)) 8694 + start, 8695 + end - start + 1, 1)) 8779 8696 btrfs_finish_ordered_io(ordered); 8780 8697 } 8781 8698 btrfs_put_ordered_extent(ordered); 8782 8699 if (!inode_evicting) { 8783 8700 cached_state = NULL; 8784 - lock_extent_bits(tree, page_start, page_end, 8701 + lock_extent_bits(tree, start, end, 8785 8702 &cached_state); 8786 8703 } 8704 + 8705 + start = end + 1; 8706 + if (start < page_end) 8707 + goto again; 8787 8708 } 8788 8709 8789 8710 /* ··· 8848 8761 loff_t size; 8849 8762 int ret; 8850 8763 int reserved = 0; 8764 + u64 reserved_space; 8851 8765 u64 page_start; 8852 8766 u64 page_end; 8767 + u64 end; 8768 + 8769 + reserved_space = PAGE_CACHE_SIZE; 8853 8770 8854 8771 sb_start_pagefault(inode->i_sb); 8855 8772 page_start = page_offset(page); 8856 8773 page_end = page_start + PAGE_CACHE_SIZE - 1; 8774 + end = page_end; 8857 8775 8776 + /* 8777 + * Reserving delalloc space after obtaining the page lock can lead to 8778 + * deadlock. For example, if a dirty page is locked by this function 8779 + * and the call to btrfs_delalloc_reserve_space() ends up triggering 8780 + * dirty page write out, then the btrfs_writepage() function could 8781 + * end up waiting indefinitely to get a lock on the page currently 8782 + * being processed by btrfs_page_mkwrite() function. 8783 + */ 8858 8784 ret = btrfs_delalloc_reserve_space(inode, page_start, 8859 - PAGE_CACHE_SIZE); 8785 + reserved_space); 8860 8786 if (!ret) { 8861 8787 ret = file_update_time(vma->vm_file); 8862 8788 reserved = 1; ··· 8903 8803 * we can't set the delalloc bits if there are pending ordered 8904 8804 * extents. Drop our locks and wait for them to finish 8905 8805 */ 8906 - ordered = btrfs_lookup_ordered_extent(inode, page_start); 8806 + ordered = btrfs_lookup_ordered_range(inode, page_start, page_end); 8907 8807 if (ordered) { 8908 8808 unlock_extent_cached(io_tree, page_start, page_end, 8909 8809 &cached_state, GFP_NOFS); ··· 8913 8813 goto again; 8914 8814 } 8915 8815 8816 + if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) { 8817 + reserved_space = round_up(size - page_start, root->sectorsize); 8818 + if (reserved_space < PAGE_CACHE_SIZE) { 8819 + end = page_start + reserved_space - 1; 8820 + spin_lock(&BTRFS_I(inode)->lock); 8821 + BTRFS_I(inode)->outstanding_extents++; 8822 + spin_unlock(&BTRFS_I(inode)->lock); 8823 + btrfs_delalloc_release_space(inode, page_start, 8824 + PAGE_CACHE_SIZE - reserved_space); 8825 + } 8826 + } 8827 + 8916 8828 /* 8917 8829 * XXX - page_mkwrite gets called every time the page is dirtied, even 8918 8830 * if it was already dirty, so for space accounting reasons we need to ··· 8932 8820 * is probably a better way to do this, but for now keep consistent with 8933 8821 * prepare_pages in the normal write path. 8934 8822 */ 8935 - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 8823 + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 8936 8824 EXTENT_DIRTY | EXTENT_DELALLOC | 8937 8825 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 8938 8826 0, 0, &cached_state, GFP_NOFS); 8939 8827 8940 - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 8828 + ret = btrfs_set_extent_delalloc(inode, page_start, end, 8941 8829 &cached_state); 8942 8830 if (ret) { 8943 8831 unlock_extent_cached(io_tree, page_start, page_end, ··· 8976 8864 } 8977 8865 unlock_page(page); 8978 8866 out: 8979 - btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); 8867 + btrfs_delalloc_release_space(inode, page_start, reserved_space); 8980 8868 out_noreserve: 8981 8869 sb_end_pagefault(inode->i_sb); 8982 8870 return ret; ··· 9302 9190 * destroy cache. 9303 9191 */ 9304 9192 rcu_barrier(); 9305 - if (btrfs_inode_cachep) 9306 - kmem_cache_destroy(btrfs_inode_cachep); 9307 - if (btrfs_trans_handle_cachep) 9308 - kmem_cache_destroy(btrfs_trans_handle_cachep); 9309 - if (btrfs_transaction_cachep) 9310 - kmem_cache_destroy(btrfs_transaction_cachep); 9311 - if (btrfs_path_cachep) 9312 - kmem_cache_destroy(btrfs_path_cachep); 9313 - if (btrfs_free_space_cachep) 9314 - kmem_cache_destroy(btrfs_free_space_cachep); 9193 + kmem_cache_destroy(btrfs_inode_cachep); 9194 + kmem_cache_destroy(btrfs_trans_handle_cachep); 9195 + kmem_cache_destroy(btrfs_transaction_cachep); 9196 + kmem_cache_destroy(btrfs_path_cachep); 9197 + kmem_cache_destroy(btrfs_free_space_cachep); 9315 9198 } 9316 9199 9317 9200 int btrfs_init_cachep(void) ··· 9357 9250 9358 9251 generic_fillattr(inode, stat); 9359 9252 stat->dev = BTRFS_I(inode)->root->anon_dev; 9360 - stat->blksize = PAGE_CACHE_SIZE; 9361 9253 9362 9254 spin_lock(&BTRFS_I(inode)->lock); 9363 9255 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; ··· 9374 9268 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9375 9269 struct inode *new_inode = d_inode(new_dentry); 9376 9270 struct inode *old_inode = d_inode(old_dentry); 9377 - struct timespec ctime = CURRENT_TIME; 9378 9271 u64 index = 0; 9379 9272 u64 root_objectid; 9380 9273 int ret; ··· 9470 9365 inode_inc_iversion(old_dir); 9471 9366 inode_inc_iversion(new_dir); 9472 9367 inode_inc_iversion(old_inode); 9473 - old_dir->i_ctime = old_dir->i_mtime = ctime; 9474 - new_dir->i_ctime = new_dir->i_mtime = ctime; 9475 - old_inode->i_ctime = ctime; 9368 + old_dir->i_ctime = old_dir->i_mtime = 9369 + new_dir->i_ctime = new_dir->i_mtime = 9370 + old_inode->i_ctime = current_fs_time(old_dir->i_sb); 9476 9371 9477 9372 if (old_dentry->d_parent != new_dentry->d_parent) 9478 9373 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); ··· 9497 9392 9498 9393 if (new_inode) { 9499 9394 inode_inc_iversion(new_inode); 9500 - new_inode->i_ctime = CURRENT_TIME; 9395 + new_inode->i_ctime = current_fs_time(new_inode->i_sb); 9501 9396 if (unlikely(btrfs_ino(new_inode) == 9502 9397 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9503 9398 root_objectid = BTRFS_I(new_inode)->location.objectid; ··· 9975 9870 *alloc_hint = ins.objectid + ins.offset; 9976 9871 9977 9872 inode_inc_iversion(inode); 9978 - inode->i_ctime = CURRENT_TIME; 9873 + inode->i_ctime = current_fs_time(inode->i_sb); 9979 9874 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 9980 9875 if (!(mode & FALLOC_FL_KEEP_SIZE) && 9981 9876 (actual_len > inode->i_size) &&

+11 -17

fs/btrfs/ioctl.c

··· 347 347 348 348 btrfs_update_iflags(inode); 349 349 inode_inc_iversion(inode); 350 - inode->i_ctime = CURRENT_TIME; 350 + inode->i_ctime = current_fs_time(inode->i_sb); 351 351 ret = btrfs_update_inode(trans, root, inode); 352 352 353 353 btrfs_end_transaction(trans, root); ··· 443 443 struct btrfs_root *root = BTRFS_I(dir)->root; 444 444 struct btrfs_root *new_root; 445 445 struct btrfs_block_rsv block_rsv; 446 - struct timespec cur_time = CURRENT_TIME; 446 + struct timespec cur_time = current_fs_time(dir->i_sb); 447 447 struct inode *inode; 448 448 int ret; 449 449 int err; ··· 843 843 error = PTR_ERR(dentry); 844 844 if (IS_ERR(dentry)) 845 845 goto out_unlock; 846 - 847 - error = -EEXIST; 848 - if (d_really_is_positive(dentry)) 849 - goto out_dput; 850 846 851 847 error = btrfs_may_create(dir, dentry); 852 848 if (error) ··· 2093 2097 key.offset = (u64)-1; 2094 2098 root = btrfs_read_fs_root_no_name(info, &key); 2095 2099 if (IS_ERR(root)) { 2096 - btrfs_err(info, "could not find root %llu", 2097 - sk->tree_id); 2098 2100 btrfs_free_path(path); 2099 2101 return -ENOENT; 2100 2102 } ··· 2954 2960 * of the array is bounded by len, which is in turn bounded by 2955 2961 * BTRFS_MAX_DEDUPE_LEN. 2956 2962 */ 2957 - src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2958 - dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2963 + src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); 2964 + dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); 2959 2965 if (!src_pgarr || !dst_pgarr) { 2960 2966 kfree(src_pgarr); 2961 2967 kfree(dst_pgarr); ··· 3211 3217 3212 3218 inode_inc_iversion(inode); 3213 3219 if (!no_time_update) 3214 - inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3220 + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 3215 3221 /* 3216 3222 * We round up to the block size at eof when determining which 3217 3223 * extents to clone above, but shouldn't round up the file size. ··· 3883 3889 * Truncate page cache pages so that future reads will see the cloned 3884 3890 * data immediately and not the previous data. 3885 3891 */ 3886 - truncate_inode_pages_range(&inode->i_data, destoff, 3887 - PAGE_CACHE_ALIGN(destoff + len) - 1); 3892 + truncate_inode_pages_range(&inode->i_data, 3893 + round_down(destoff, PAGE_CACHE_SIZE), 3894 + round_up(destoff + len, PAGE_CACHE_SIZE) - 1); 3888 3895 out_unlock: 3889 3896 if (!same_inode) 3890 3897 btrfs_double_inode_unlock(src, inode); ··· 5026 5031 struct btrfs_root *root = BTRFS_I(inode)->root; 5027 5032 struct btrfs_root_item *root_item = &root->root_item; 5028 5033 struct btrfs_trans_handle *trans; 5029 - struct timespec ct = CURRENT_TIME; 5034 + struct timespec ct = current_fs_time(inode->i_sb); 5030 5035 int ret = 0; 5031 5036 int received_uuid_changed; 5032 5037 ··· 5257 5262 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5258 5263 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5259 5264 5260 - static int btrfs_ioctl_get_supported_features(struct file *file, 5261 - void __user *arg) 5265 + int btrfs_ioctl_get_supported_features(void __user *arg) 5262 5266 { 5263 5267 static const struct btrfs_ioctl_feature_flags features[3] = { 5264 5268 INIT_FEATURE_FLAGS(SUPP), ··· 5536 5542 case BTRFS_IOC_SET_FSLABEL: 5537 5543 return btrfs_ioctl_set_fslabel(file, argp); 5538 5544 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5539 - return btrfs_ioctl_get_supported_features(file, argp); 5545 + return btrfs_ioctl_get_supported_features(argp); 5540 5546 case BTRFS_IOC_GET_FEATURES: 5541 5547 return btrfs_ioctl_get_features(file, argp); 5542 5548 case BTRFS_IOC_SET_FEATURES:

+1 -2

fs/btrfs/ordered-data.c

··· 1114 1114 1115 1115 void ordered_data_exit(void) 1116 1116 { 1117 - if (btrfs_ordered_extent_cache) 1118 - kmem_cache_destroy(btrfs_ordered_extent_cache); 1117 + kmem_cache_destroy(btrfs_ordered_extent_cache); 1119 1118 }

+21 -2

fs/btrfs/print-tree.c

··· 295 295 btrfs_dev_extent_chunk_offset(l, dev_extent), 296 296 btrfs_dev_extent_length(l, dev_extent)); 297 297 break; 298 - case BTRFS_DEV_STATS_KEY: 299 - printk(KERN_INFO "\t\tdevice stats\n"); 298 + case BTRFS_PERSISTENT_ITEM_KEY: 299 + printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n", 300 + key.objectid, key.offset); 301 + switch (key.objectid) { 302 + case BTRFS_DEV_STATS_OBJECTID: 303 + printk(KERN_INFO "\t\tdevice stats\n"); 304 + break; 305 + default: 306 + printk(KERN_INFO "\t\tunknown persistent item\n"); 307 + } 308 + break; 309 + case BTRFS_TEMPORARY_ITEM_KEY: 310 + printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n", 311 + key.objectid, key.offset); 312 + switch (key.objectid) { 313 + case BTRFS_BALANCE_OBJECTID: 314 + printk(KERN_INFO "\t\tbalance status\n"); 315 + break; 316 + default: 317 + printk(KERN_INFO "\t\tunknown temporary item\n"); 318 + } 300 319 break; 301 320 case BTRFS_DEV_REPLACE_KEY: 302 321 printk(KERN_INFO "\t\tdev replace\n");

+134 -134

fs/btrfs/reada.c

··· 72 72 spinlock_t lock; 73 73 struct reada_zone *zones[BTRFS_MAX_MIRRORS]; 74 74 int nzones; 75 - struct btrfs_device *scheduled_for; 75 + int scheduled; 76 76 }; 77 77 78 78 struct reada_zone { ··· 101 101 static void __reada_start_machine(struct btrfs_fs_info *fs_info); 102 102 103 103 static int reada_add_block(struct reada_control *rc, u64 logical, 104 - struct btrfs_key *top, int level, u64 generation); 104 + struct btrfs_key *top, u64 generation); 105 105 106 106 /* recurses */ 107 107 /* in case of err, eb might be NULL */ 108 - static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 109 - u64 start, int err) 108 + static void __readahead_hook(struct btrfs_fs_info *fs_info, 109 + struct reada_extent *re, struct extent_buffer *eb, 110 + u64 start, int err) 110 111 { 111 112 int level = 0; 112 113 int nritems; 113 114 int i; 114 115 u64 bytenr; 115 116 u64 generation; 116 - struct reada_extent *re; 117 - struct btrfs_fs_info *fs_info = root->fs_info; 118 117 struct list_head list; 119 - unsigned long index = start >> PAGE_CACHE_SHIFT; 120 - struct btrfs_device *for_dev; 121 118 122 119 if (eb) 123 120 level = btrfs_header_level(eb); 124 - 125 - /* find extent */ 126 - spin_lock(&fs_info->reada_lock); 127 - re = radix_tree_lookup(&fs_info->reada_tree, index); 128 - if (re) 129 - re->refcnt++; 130 - spin_unlock(&fs_info->reada_lock); 131 - 132 - if (!re) 133 - return -1; 134 121 135 122 spin_lock(&re->lock); 136 123 /* ··· 125 138 * don't need the lock anymore 126 139 */ 127 140 list_replace_init(&re->extctl, &list); 128 - for_dev = re->scheduled_for; 129 - re->scheduled_for = NULL; 141 + re->scheduled = 0; 130 142 spin_unlock(&re->lock); 131 143 132 - if (err == 0) { 133 - nritems = level ? btrfs_header_nritems(eb) : 0; 134 - generation = btrfs_header_generation(eb); 135 - /* 136 - * FIXME: currently we just set nritems to 0 if this is a leaf, 137 - * effectively ignoring the content. In a next step we could 138 - * trigger more readahead depending from the content, e.g. 139 - * fetch the checksums for the extents in the leaf. 140 - */ 141 - } else { 142 - /* 143 - * this is the error case, the extent buffer has not been 144 - * read correctly. We won't access anything from it and 145 - * just cleanup our data structures. Effectively this will 146 - * cut the branch below this node from read ahead. 147 - */ 148 - nritems = 0; 149 - generation = 0; 150 - } 144 + /* 145 + * this is the error case, the extent buffer has not been 146 + * read correctly. We won't access anything from it and 147 + * just cleanup our data structures. Effectively this will 148 + * cut the branch below this node from read ahead. 149 + */ 150 + if (err) 151 + goto cleanup; 151 152 153 + /* 154 + * FIXME: currently we just set nritems to 0 if this is a leaf, 155 + * effectively ignoring the content. In a next step we could 156 + * trigger more readahead depending from the content, e.g. 157 + * fetch the checksums for the extents in the leaf. 158 + */ 159 + if (!level) 160 + goto cleanup; 161 + 162 + nritems = btrfs_header_nritems(eb); 163 + generation = btrfs_header_generation(eb); 152 164 for (i = 0; i < nritems; i++) { 153 165 struct reada_extctl *rec; 154 166 u64 n_gen; ··· 174 188 */ 175 189 #ifdef DEBUG 176 190 if (rec->generation != generation) { 177 - btrfs_debug(root->fs_info, 178 - "generation mismatch for (%llu,%d,%llu) %llu != %llu", 179 - key.objectid, key.type, key.offset, 180 - rec->generation, generation); 191 + btrfs_debug(fs_info, 192 + "generation mismatch for (%llu,%d,%llu) %llu != %llu", 193 + key.objectid, key.type, key.offset, 194 + rec->generation, generation); 181 195 } 182 196 #endif 183 197 if (rec->generation == generation && 184 198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && 185 199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) 186 - reada_add_block(rc, bytenr, &next_key, 187 - level - 1, n_gen); 200 + reada_add_block(rc, bytenr, &next_key, n_gen); 188 201 } 189 202 } 203 + 204 + cleanup: 190 205 /* 191 206 * free extctl records 192 207 */ ··· 209 222 210 223 reada_extent_put(fs_info, re); /* one ref for each entry */ 211 224 } 212 - reada_extent_put(fs_info, re); /* our ref */ 213 - if (for_dev) 214 - atomic_dec(&for_dev->reada_in_flight); 215 225 216 - return 0; 226 + return; 217 227 } 218 228 219 229 /* 220 230 * start is passed separately in case eb in NULL, which may be the case with 221 231 * failed I/O 222 232 */ 223 - int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 224 - u64 start, int err) 233 + int btree_readahead_hook(struct btrfs_fs_info *fs_info, 234 + struct extent_buffer *eb, u64 start, int err) 225 235 { 226 - int ret; 236 + int ret = 0; 237 + struct reada_extent *re; 227 238 228 - ret = __readahead_hook(root, eb, start, err); 239 + /* find extent */ 240 + spin_lock(&fs_info->reada_lock); 241 + re = radix_tree_lookup(&fs_info->reada_tree, 242 + start >> PAGE_CACHE_SHIFT); 243 + if (re) 244 + re->refcnt++; 245 + spin_unlock(&fs_info->reada_lock); 246 + if (!re) { 247 + ret = -1; 248 + goto start_machine; 249 + } 229 250 230 - reada_start_machine(root->fs_info); 251 + __readahead_hook(fs_info, re, eb, start, err); 252 + reada_extent_put(fs_info, re); /* our ref */ 231 253 254 + start_machine: 255 + reada_start_machine(fs_info); 232 256 return ret; 233 257 } 234 258 ··· 258 260 spin_lock(&fs_info->reada_lock); 259 261 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 260 262 logical >> PAGE_CACHE_SHIFT, 1); 261 - if (ret == 1) 263 + if (ret == 1 && logical >= zone->start && logical <= zone->end) { 262 264 kref_get(&zone->refcnt); 263 - spin_unlock(&fs_info->reada_lock); 264 - 265 - if (ret == 1) { 266 - if (logical >= zone->start && logical < zone->end) 267 - return zone; 268 - spin_lock(&fs_info->reada_lock); 269 - kref_put(&zone->refcnt, reada_zone_release); 270 265 spin_unlock(&fs_info->reada_lock); 266 + return zone; 271 267 } 268 + 269 + spin_unlock(&fs_info->reada_lock); 272 270 273 271 cache = btrfs_lookup_block_group(fs_info, logical); 274 272 if (!cache) ··· 274 280 end = start + cache->key.offset - 1; 275 281 btrfs_put_block_group(cache); 276 282 277 - zone = kzalloc(sizeof(*zone), GFP_NOFS); 283 + zone = kzalloc(sizeof(*zone), GFP_KERNEL); 278 284 if (!zone) 279 285 return NULL; 280 286 ··· 301 307 kfree(zone); 302 308 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 303 309 logical >> PAGE_CACHE_SHIFT, 1); 304 - if (ret == 1) 310 + if (ret == 1 && logical >= zone->start && logical <= zone->end) 305 311 kref_get(&zone->refcnt); 312 + else 313 + zone = NULL; 306 314 } 307 315 spin_unlock(&fs_info->reada_lock); 308 316 ··· 313 317 314 318 static struct reada_extent *reada_find_extent(struct btrfs_root *root, 315 319 u64 logical, 316 - struct btrfs_key *top, int level) 320 + struct btrfs_key *top) 317 321 { 318 322 int ret; 319 323 struct reada_extent *re = NULL; ··· 326 330 u64 length; 327 331 int real_stripes; 328 332 int nzones = 0; 329 - int i; 330 333 unsigned long index = logical >> PAGE_CACHE_SHIFT; 331 334 int dev_replace_is_ongoing; 335 + int have_zone = 0; 332 336 333 337 spin_lock(&fs_info->reada_lock); 334 338 re = radix_tree_lookup(&fs_info->reada_tree, index); ··· 339 343 if (re) 340 344 return re; 341 345 342 - re = kzalloc(sizeof(*re), GFP_NOFS); 346 + re = kzalloc(sizeof(*re), GFP_KERNEL); 343 347 if (!re) 344 348 return NULL; 345 349 ··· 371 375 struct reada_zone *zone; 372 376 373 377 dev = bbio->stripes[nzones].dev; 378 + 379 + /* cannot read ahead on missing device. */ 380 + if (!dev->bdev) 381 + continue; 382 + 374 383 zone = reada_find_zone(fs_info, dev, logical, bbio); 375 384 if (!zone) 376 - break; 385 + continue; 377 386 378 - re->zones[nzones] = zone; 387 + re->zones[re->nzones++] = zone; 379 388 spin_lock(&zone->lock); 380 389 if (!zone->elems) 381 390 kref_get(&zone->refcnt); ··· 390 389 kref_put(&zone->refcnt, reada_zone_release); 391 390 spin_unlock(&fs_info->reada_lock); 392 391 } 393 - re->nzones = nzones; 394 - if (nzones == 0) { 392 + if (re->nzones == 0) { 395 393 /* not a single zone found, error and out */ 396 394 goto error; 397 395 } 398 396 399 397 /* insert extent in reada_tree + all per-device trees, all or nothing */ 400 - btrfs_dev_replace_lock(&fs_info->dev_replace); 398 + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 401 399 spin_lock(&fs_info->reada_lock); 402 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 403 401 if (ret == -EEXIST) { ··· 404 404 BUG_ON(!re_exist); 405 405 re_exist->refcnt++; 406 406 spin_unlock(&fs_info->reada_lock); 407 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 407 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 408 408 goto error; 409 409 } 410 410 if (ret) { 411 411 spin_unlock(&fs_info->reada_lock); 412 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 412 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 413 413 goto error; 414 414 } 415 415 prev_dev = NULL; 416 416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 417 417 &fs_info->dev_replace); 418 - for (i = 0; i < nzones; ++i) { 419 - dev = bbio->stripes[i].dev; 418 + for (nzones = 0; nzones < re->nzones; ++nzones) { 419 + dev = re->zones[nzones]->device; 420 + 420 421 if (dev == prev_dev) { 421 422 /* 422 423 * in case of DUP, just add the first zone. As both ··· 428 427 */ 429 428 continue; 430 429 } 431 - if (!dev->bdev) { 432 - /* 433 - * cannot read ahead on missing device, but for RAID5/6, 434 - * REQ_GET_READ_MIRRORS return 1. So don't skip missing 435 - * device for such case. 436 - */ 437 - if (nzones > 1) 438 - continue; 439 - } 430 + if (!dev->bdev) 431 + continue; 432 + 440 433 if (dev_replace_is_ongoing && 441 434 dev == fs_info->dev_replace.tgtdev) { 442 435 /* ··· 442 447 prev_dev = dev; 443 448 ret = radix_tree_insert(&dev->reada_extents, index, re); 444 449 if (ret) { 445 - while (--i >= 0) { 446 - dev = bbio->stripes[i].dev; 450 + while (--nzones >= 0) { 451 + dev = re->zones[nzones]->device; 447 452 BUG_ON(dev == NULL); 448 453 /* ignore whether the entry was inserted */ 449 454 radix_tree_delete(&dev->reada_extents, index); ··· 451 456 BUG_ON(fs_info == NULL); 452 457 radix_tree_delete(&fs_info->reada_tree, index); 453 458 spin_unlock(&fs_info->reada_lock); 454 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 459 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 455 460 goto error; 456 461 } 462 + have_zone = 1; 457 463 } 458 464 spin_unlock(&fs_info->reada_lock); 459 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 465 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 466 + 467 + if (!have_zone) 468 + goto error; 460 469 461 470 btrfs_put_bbio(bbio); 462 471 return re; 463 472 464 473 error: 465 - while (nzones) { 474 + for (nzones = 0; nzones < re->nzones; ++nzones) { 466 475 struct reada_zone *zone; 467 476 468 - --nzones; 469 477 zone = re->zones[nzones]; 470 478 kref_get(&zone->refcnt); 471 479 spin_lock(&zone->lock); ··· 529 531 kref_put(&zone->refcnt, reada_zone_release); 530 532 spin_unlock(&fs_info->reada_lock); 531 533 } 532 - if (re->scheduled_for) 533 - atomic_dec(&re->scheduled_for->reada_in_flight); 534 534 535 535 kfree(re); 536 536 } ··· 552 556 } 553 557 554 558 static int reada_add_block(struct reada_control *rc, u64 logical, 555 - struct btrfs_key *top, int level, u64 generation) 559 + struct btrfs_key *top, u64 generation) 556 560 { 557 561 struct btrfs_root *root = rc->root; 558 562 struct reada_extent *re; 559 563 struct reada_extctl *rec; 560 564 561 - re = reada_find_extent(root, logical, top, level); /* takes one ref */ 565 + re = reada_find_extent(root, logical, top); /* takes one ref */ 562 566 if (!re) 563 567 return -1; 564 568 565 - rec = kzalloc(sizeof(*rec), GFP_NOFS); 569 + rec = kzalloc(sizeof(*rec), GFP_KERNEL); 566 570 if (!rec) { 567 571 reada_extent_put(root->fs_info, re); 568 572 return -ENOMEM; ··· 658 662 u64 logical; 659 663 int ret; 660 664 int i; 661 - int need_kick = 0; 662 665 663 666 spin_lock(&fs_info->reada_lock); 664 667 if (dev->reada_curr_zone == NULL) { ··· 674 679 */ 675 680 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 676 681 dev->reada_next >> PAGE_CACHE_SHIFT, 1); 677 - if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { 682 + if (ret == 0 || re->logical > dev->reada_curr_zone->end) { 678 683 ret = reada_pick_zone(dev); 679 684 if (!ret) { 680 685 spin_unlock(&fs_info->reada_lock); ··· 693 698 694 699 spin_unlock(&fs_info->reada_lock); 695 700 701 + spin_lock(&re->lock); 702 + if (re->scheduled || list_empty(&re->extctl)) { 703 + spin_unlock(&re->lock); 704 + reada_extent_put(fs_info, re); 705 + return 0; 706 + } 707 + re->scheduled = 1; 708 + spin_unlock(&re->lock); 709 + 696 710 /* 697 711 * find mirror num 698 712 */ ··· 713 709 } 714 710 logical = re->logical; 715 711 716 - spin_lock(&re->lock); 717 - if (re->scheduled_for == NULL) { 718 - re->scheduled_for = dev; 719 - need_kick = 1; 720 - } 721 - spin_unlock(&re->lock); 722 - 723 - reada_extent_put(fs_info, re); 724 - 725 - if (!need_kick) 726 - return 0; 727 - 728 712 atomic_inc(&dev->reada_in_flight); 729 713 ret = reada_tree_block_flagged(fs_info->extent_root, logical, 730 714 mirror_num, &eb); 731 715 if (ret) 732 - __readahead_hook(fs_info->extent_root, NULL, logical, ret); 716 + __readahead_hook(fs_info, re, NULL, logical, ret); 733 717 else if (eb) 734 - __readahead_hook(fs_info->extent_root, eb, eb->start, ret); 718 + __readahead_hook(fs_info, re, eb, eb->start, ret); 735 719 736 720 if (eb) 737 721 free_extent_buffer(eb); 722 + 723 + atomic_dec(&dev->reada_in_flight); 724 + reada_extent_put(fs_info, re); 738 725 739 726 return 1; 740 727 ··· 747 752 set_task_ioprio(current, BTRFS_IOPRIO_READA); 748 753 __reada_start_machine(fs_info); 749 754 set_task_ioprio(current, old_ioprio); 755 + 756 + atomic_dec(&fs_info->reada_works_cnt); 750 757 } 751 758 752 759 static void __reada_start_machine(struct btrfs_fs_info *fs_info) ··· 780 783 * enqueue to workers to finish it. This will distribute the load to 781 784 * the cores. 782 785 */ 783 - for (i = 0; i < 2; ++i) 786 + for (i = 0; i < 2; ++i) { 784 787 reada_start_machine(fs_info); 788 + if (atomic_read(&fs_info->reada_works_cnt) > 789 + BTRFS_MAX_MIRRORS * 2) 790 + break; 791 + } 785 792 } 786 793 787 794 static void reada_start_machine(struct btrfs_fs_info *fs_info) 788 795 { 789 796 struct reada_machine_work *rmw; 790 797 791 - rmw = kzalloc(sizeof(*rmw), GFP_NOFS); 798 + rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); 792 799 if (!rmw) { 793 800 /* FIXME we cannot handle this properly right now */ 794 801 BUG(); ··· 802 801 rmw->fs_info = fs_info; 803 802 804 803 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 804 + atomic_inc(&fs_info->reada_works_cnt); 805 805 } 806 806 807 807 #ifdef DEBUG ··· 850 848 if (ret == 0) 851 849 break; 852 850 printk(KERN_DEBUG 853 - " re: logical %llu size %u empty %d for %lld", 851 + " re: logical %llu size %u empty %d scheduled %d", 854 852 re->logical, fs_info->tree_root->nodesize, 855 - list_empty(&re->extctl), re->scheduled_for ? 856 - re->scheduled_for->devid : -1); 853 + list_empty(&re->extctl), re->scheduled); 857 854 858 855 for (i = 0; i < re->nzones; ++i) { 859 856 printk(KERN_CONT " zone %llu-%llu devs", ··· 879 878 index, 1); 880 879 if (ret == 0) 881 880 break; 882 - if (!re->scheduled_for) { 881 + if (!re->scheduled) { 883 882 index = (re->logical >> PAGE_CACHE_SHIFT) + 1; 884 883 continue; 885 884 } 886 885 printk(KERN_DEBUG 887 - "re: logical %llu size %u list empty %d for %lld", 886 + "re: logical %llu size %u list empty %d scheduled %d", 888 887 re->logical, fs_info->tree_root->nodesize, 889 - list_empty(&re->extctl), 890 - re->scheduled_for ? re->scheduled_for->devid : -1); 888 + list_empty(&re->extctl), re->scheduled); 891 889 for (i = 0; i < re->nzones; ++i) { 892 890 printk(KERN_CONT " zone %llu-%llu devs", 893 891 re->zones[i]->start, 894 892 re->zones[i]->end); 895 - for (i = 0; i < re->nzones; ++i) { 896 - printk(KERN_CONT " zone %llu-%llu devs", 897 - re->zones[i]->start, 898 - re->zones[i]->end); 899 - for (j = 0; j < re->zones[i]->ndevs; ++j) { 900 - printk(KERN_CONT " %lld", 901 - re->zones[i]->devs[j]->devid); 902 - } 893 + for (j = 0; j < re->zones[i]->ndevs; ++j) { 894 + printk(KERN_CONT " %lld", 895 + re->zones[i]->devs[j]->devid); 903 896 } 904 897 } 905 898 printk(KERN_CONT "\n"); ··· 912 917 struct reada_control *rc; 913 918 u64 start; 914 919 u64 generation; 915 - int level; 916 920 int ret; 917 921 struct extent_buffer *node; 918 922 static struct btrfs_key max_key = { ··· 920 926 .offset = (u64)-1 921 927 }; 922 928 923 - rc = kzalloc(sizeof(*rc), GFP_NOFS); 929 + rc = kzalloc(sizeof(*rc), GFP_KERNEL); 924 930 if (!rc) 925 931 return ERR_PTR(-ENOMEM); 926 932 ··· 934 940 935 941 node = btrfs_root_node(root); 936 942 start = node->start; 937 - level = btrfs_header_level(node); 938 943 generation = btrfs_header_generation(node); 939 944 free_extent_buffer(node); 940 945 941 - ret = reada_add_block(rc, start, &max_key, level, generation); 946 + ret = reada_add_block(rc, start, &max_key, generation); 942 947 if (ret) { 943 948 kfree(rc); 944 949 return ERR_PTR(ret); ··· 952 959 int btrfs_reada_wait(void *handle) 953 960 { 954 961 struct reada_control *rc = handle; 962 + struct btrfs_fs_info *fs_info = rc->root->fs_info; 955 963 956 964 while (atomic_read(&rc->elems)) { 965 + if (!atomic_read(&fs_info->reada_works_cnt)) 966 + reada_start_machine(fs_info); 957 967 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 958 968 5 * HZ); 959 969 dump_devs(rc->root->fs_info, ··· 973 977 int btrfs_reada_wait(void *handle) 974 978 { 975 979 struct reada_control *rc = handle; 980 + struct btrfs_fs_info *fs_info = rc->root->fs_info; 976 981 977 982 while (atomic_read(&rc->elems)) { 978 - wait_event(rc->wait, atomic_read(&rc->elems) == 0); 983 + if (!atomic_read(&fs_info->reada_works_cnt)) 984 + reada_start_machine(fs_info); 985 + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 986 + (HZ + 9) / 10); 979 987 } 980 988 981 989 kref_put(&rc->refcnt, reada_control_release);

+1 -1

fs/btrfs/root-tree.c

··· 488 488 struct btrfs_root *root) 489 489 { 490 490 struct btrfs_root_item *item = &root->root_item; 491 - struct timespec ct = CURRENT_TIME; 491 + struct timespec ct = current_fs_time(root->fs_info->sb); 492 492 493 493 spin_lock(&root->root_item_lock); 494 494 btrfs_set_root_ctransid(item, trans->transid);

+16 -14

fs/btrfs/scrub.c

··· 461 461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 462 462 int ret; 463 463 464 - sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 464 + sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); 465 465 if (!sctx) 466 466 goto nomem; 467 467 atomic_set(&sctx->refs, 1); ··· 472 472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 473 473 struct scrub_bio *sbio; 474 474 475 - sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 475 + sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); 476 476 if (!sbio) 477 477 goto nomem; 478 478 sctx->bios[i] = sbio; ··· 1654 1654 again: 1655 1655 if (!wr_ctx->wr_curr_bio) { 1656 1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1657 - GFP_NOFS); 1657 + GFP_KERNEL); 1658 1658 if (!wr_ctx->wr_curr_bio) { 1659 1659 mutex_unlock(&wr_ctx->wr_lock); 1660 1660 return -ENOMEM; ··· 1671 1671 sbio->dev = wr_ctx->tgtdev; 1672 1672 bio = sbio->bio; 1673 1673 if (!bio) { 1674 - bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1674 + bio = btrfs_io_bio_alloc(GFP_KERNEL, 1675 + wr_ctx->pages_per_wr_bio); 1675 1676 if (!bio) { 1676 1677 mutex_unlock(&wr_ctx->wr_lock); 1677 1678 return -ENOMEM; ··· 2077 2076 sbio->dev = spage->dev; 2078 2077 bio = sbio->bio; 2079 2078 if (!bio) { 2080 - bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 2079 + bio = btrfs_io_bio_alloc(GFP_KERNEL, 2080 + sctx->pages_per_rd_bio); 2081 2081 if (!bio) 2082 2082 return -ENOMEM; 2083 2083 sbio->bio = bio; ··· 2243 2241 struct scrub_block *sblock; 2244 2242 int index; 2245 2243 2246 - sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2244 + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2247 2245 if (!sblock) { 2248 2246 spin_lock(&sctx->stat_lock); 2249 2247 sctx->stat.malloc_errors++; ··· 2261 2259 struct scrub_page *spage; 2262 2260 u64 l = min_t(u64, len, PAGE_SIZE); 2263 2261 2264 - spage = kzalloc(sizeof(*spage), GFP_NOFS); 2262 + spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2265 2263 if (!spage) { 2266 2264 leave_nomem: 2267 2265 spin_lock(&sctx->stat_lock); ··· 2288 2286 spage->have_csum = 0; 2289 2287 } 2290 2288 sblock->page_count++; 2291 - spage->page = alloc_page(GFP_NOFS); 2289 + spage->page = alloc_page(GFP_KERNEL); 2292 2290 if (!spage->page) 2293 2291 goto leave_nomem; 2294 2292 len -= l; ··· 2543 2541 struct scrub_block *sblock; 2544 2542 int index; 2545 2543 2546 - sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2544 + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2547 2545 if (!sblock) { 2548 2546 spin_lock(&sctx->stat_lock); 2549 2547 sctx->stat.malloc_errors++; ··· 2563 2561 struct scrub_page *spage; 2564 2562 u64 l = min_t(u64, len, PAGE_SIZE); 2565 2563 2566 - spage = kzalloc(sizeof(*spage), GFP_NOFS); 2564 + spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2567 2565 if (!spage) { 2568 2566 leave_nomem: 2569 2567 spin_lock(&sctx->stat_lock); ··· 2593 2591 spage->have_csum = 0; 2594 2592 } 2595 2593 sblock->page_count++; 2596 - spage->page = alloc_page(GFP_NOFS); 2594 + spage->page = alloc_page(GFP_KERNEL); 2597 2595 if (!spage->page) 2598 2596 goto leave_nomem; 2599 2597 len -= l; ··· 3859 3857 return -EIO; 3860 3858 } 3861 3859 3862 - btrfs_dev_replace_lock(&fs_info->dev_replace); 3860 + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 3863 3861 if (dev->scrub_device || 3864 3862 (!is_dev_replace && 3865 3863 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3866 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 3864 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3867 3865 mutex_unlock(&fs_info->scrub_lock); 3868 3866 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3869 3867 return -EINPROGRESS; 3870 3868 } 3871 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 3869 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3872 3870 3873 3871 ret = scrub_workers_get(fs_info, is_dev_replace); 3874 3872 if (ret) {

+18 -18

fs/btrfs/send.c

··· 304 304 { 305 305 struct fs_path *p; 306 306 307 - p = kmalloc(sizeof(*p), GFP_NOFS); 307 + p = kmalloc(sizeof(*p), GFP_KERNEL); 308 308 if (!p) 309 309 return NULL; 310 310 p->reversed = 0; ··· 363 363 * First time the inline_buf does not suffice 364 364 */ 365 365 if (p->buf == p->inline_buf) { 366 - tmp_buf = kmalloc(len, GFP_NOFS); 366 + tmp_buf = kmalloc(len, GFP_KERNEL); 367 367 if (tmp_buf) 368 368 memcpy(tmp_buf, p->buf, old_buf_len); 369 369 } else { 370 - tmp_buf = krealloc(p->buf, len, GFP_NOFS); 370 + tmp_buf = krealloc(p->buf, len, GFP_KERNEL); 371 371 } 372 372 if (!tmp_buf) 373 373 return -ENOMEM; ··· 995 995 * values are small. 996 996 */ 997 997 buf_len = PATH_MAX; 998 - buf = kmalloc(buf_len, GFP_NOFS); 998 + buf = kmalloc(buf_len, GFP_KERNEL); 999 999 if (!buf) { 1000 1000 ret = -ENOMEM; 1001 1001 goto out; ··· 1042 1042 buf = NULL; 1043 1043 } else { 1044 1044 char *tmp = krealloc(buf, buf_len, 1045 - GFP_NOFS | __GFP_NOWARN); 1045 + GFP_KERNEL | __GFP_NOWARN); 1046 1046 1047 1047 if (!tmp) 1048 1048 kfree(buf); ··· 1303 1303 /* We only use this path under the commit sem */ 1304 1304 tmp_path->need_commit_sem = 0; 1305 1305 1306 - backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1306 + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL); 1307 1307 if (!backref_ctx) { 1308 1308 ret = -ENOMEM; 1309 1309 goto out; ··· 1984 1984 nce_head = radix_tree_lookup(&sctx->name_cache, 1985 1985 (unsigned long)nce->ino); 1986 1986 if (!nce_head) { 1987 - nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1987 + nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); 1988 1988 if (!nce_head) { 1989 1989 kfree(nce); 1990 1990 return -ENOMEM; ··· 2179 2179 /* 2180 2180 * Store the result of the lookup in the name cache. 2181 2181 */ 2182 - nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 2182 + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); 2183 2183 if (!nce) { 2184 2184 ret = -ENOMEM; 2185 2185 goto out; ··· 2315 2315 if (!path) 2316 2316 return -ENOMEM; 2317 2317 2318 - name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); 2318 + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); 2319 2319 if (!name) { 2320 2320 btrfs_free_path(path); 2321 2321 return -ENOMEM; ··· 2730 2730 { 2731 2731 struct recorded_ref *ref; 2732 2732 2733 - ref = kmalloc(sizeof(*ref), GFP_NOFS); 2733 + ref = kmalloc(sizeof(*ref), GFP_KERNEL); 2734 2734 if (!ref) 2735 2735 return -ENOMEM; 2736 2736 ··· 2755 2755 { 2756 2756 struct recorded_ref *new; 2757 2757 2758 - new = kmalloc(sizeof(*ref), GFP_NOFS); 2758 + new = kmalloc(sizeof(*ref), GFP_KERNEL); 2759 2759 if (!new) 2760 2760 return -ENOMEM; 2761 2761 ··· 2818 2818 struct rb_node *parent = NULL; 2819 2819 struct orphan_dir_info *entry, *odi; 2820 2820 2821 - odi = kmalloc(sizeof(*odi), GFP_NOFS); 2821 + odi = kmalloc(sizeof(*odi), GFP_KERNEL); 2822 2822 if (!odi) 2823 2823 return ERR_PTR(-ENOMEM); 2824 2824 odi->ino = dir_ino; ··· 2973 2973 struct rb_node *parent = NULL; 2974 2974 struct waiting_dir_move *entry, *dm; 2975 2975 2976 - dm = kmalloc(sizeof(*dm), GFP_NOFS); 2976 + dm = kmalloc(sizeof(*dm), GFP_KERNEL); 2977 2977 if (!dm) 2978 2978 return -ENOMEM; 2979 2979 dm->ino = ino; ··· 3040 3040 int exists = 0; 3041 3041 int ret; 3042 3042 3043 - pm = kmalloc(sizeof(*pm), GFP_NOFS); 3043 + pm = kmalloc(sizeof(*pm), GFP_KERNEL); 3044 3044 if (!pm) 3045 3045 return -ENOMEM; 3046 3046 pm->parent_ino = parent_ino; ··· 4280 4280 strncmp(name, ctx->name, name_len) == 0) { 4281 4281 ctx->found_idx = num; 4282 4282 ctx->found_data_len = data_len; 4283 - ctx->found_data = kmemdup(data, data_len, GFP_NOFS); 4283 + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); 4284 4284 if (!ctx->found_data) 4285 4285 return -ENOMEM; 4286 4286 return 1; ··· 4481 4481 while (index <= last_index) { 4482 4482 unsigned cur_len = min_t(unsigned, len, 4483 4483 PAGE_CACHE_SIZE - pg_offset); 4484 - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4484 + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); 4485 4485 if (!page) { 4486 4486 ret = -ENOMEM; 4487 4487 break; ··· 5989 5989 goto out; 5990 5990 } 5991 5991 5992 - sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 5992 + sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL); 5993 5993 if (!sctx) { 5994 5994 ret = -ENOMEM; 5995 5995 goto out; ··· 5997 5997 5998 5998 INIT_LIST_HEAD(&sctx->new_refs); 5999 5999 INIT_LIST_HEAD(&sctx->deleted_refs); 6000 - INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 6000 + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); 6001 6001 INIT_LIST_HEAD(&sctx->name_cache_list); 6002 6002 6003 6003 sctx->flags = arg->flags;

+39 -9

fs/btrfs/super.c

··· 303 303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 304 304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, 305 305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, 306 - Opt_datasum, Opt_treelog, Opt_noinode_cache, 306 + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot, 307 + Opt_nologreplay, Opt_norecovery, 307 308 #ifdef CONFIG_BTRFS_DEBUG 308 309 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 309 310 #endif ··· 336 335 {Opt_noacl, "noacl"}, 337 336 {Opt_notreelog, "notreelog"}, 338 337 {Opt_treelog, "treelog"}, 338 + {Opt_nologreplay, "nologreplay"}, 339 + {Opt_norecovery, "norecovery"}, 339 340 {Opt_flushoncommit, "flushoncommit"}, 340 341 {Opt_noflushoncommit, "noflushoncommit"}, 341 342 {Opt_ratio, "metadata_ratio=%d"}, ··· 355 352 {Opt_inode_cache, "inode_cache"}, 356 353 {Opt_noinode_cache, "noinode_cache"}, 357 354 {Opt_no_space_cache, "nospace_cache"}, 358 - {Opt_recovery, "recovery"}, 355 + {Opt_recovery, "recovery"}, /* deprecated */ 356 + {Opt_usebackuproot, "usebackuproot"}, 359 357 {Opt_skip_balance, "skip_balance"}, 360 358 {Opt_check_integrity, "check_int"}, 361 359 {Opt_check_integrity_including_extent_data, "check_int_data"}, ··· 377 373 * reading in a new superblock is parsed here. 378 374 * XXX JDM: This needs to be cleaned up for remount. 379 375 */ 380 - int btrfs_parse_options(struct btrfs_root *root, char *options) 376 + int btrfs_parse_options(struct btrfs_root *root, char *options, 377 + unsigned long new_flags) 381 378 { 382 379 struct btrfs_fs_info *info = root->fs_info; 383 380 substring_t args[MAX_OPT_ARGS]; ··· 398 393 else if (cache_gen) 399 394 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 400 395 396 + /* 397 + * Even the options are empty, we still need to do extra check 398 + * against new flags 399 + */ 401 400 if (!options) 402 - goto out; 401 + goto check; 403 402 404 403 /* 405 404 * strsep changes the string, duplicate it because parse_options ··· 615 606 btrfs_clear_and_info(root, NOTREELOG, 616 607 "enabling tree log"); 617 608 break; 609 + case Opt_norecovery: 610 + case Opt_nologreplay: 611 + btrfs_set_and_info(root, NOLOGREPLAY, 612 + "disabling log replay at mount time"); 613 + break; 618 614 case Opt_flushoncommit: 619 615 btrfs_set_and_info(root, FLUSHONCOMMIT, 620 616 "turning on flush-on-commit"); ··· 710 696 "disabling auto defrag"); 711 697 break; 712 698 case Opt_recovery: 713 - btrfs_info(root->fs_info, "enabling auto recovery"); 714 - btrfs_set_opt(info->mount_opt, RECOVERY); 699 + btrfs_warn(root->fs_info, 700 + "'recovery' is deprecated, use 'usebackuproot' instead"); 701 + case Opt_usebackuproot: 702 + btrfs_info(root->fs_info, 703 + "trying to use backup root at mount time"); 704 + btrfs_set_opt(info->mount_opt, USEBACKUPROOT); 715 705 break; 716 706 case Opt_skip_balance: 717 707 btrfs_set_opt(info->mount_opt, SKIP_BALANCE); ··· 809 791 default: 810 792 break; 811 793 } 794 + } 795 + check: 796 + /* 797 + * Extra check for current option against current flag 798 + */ 799 + if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { 800 + btrfs_err(root->fs_info, 801 + "nologreplay must be used with ro mount option"); 802 + ret = -EINVAL; 812 803 } 813 804 out: 814 805 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && ··· 1229 1202 seq_puts(seq, ",ssd"); 1230 1203 if (btrfs_test_opt(root, NOTREELOG)) 1231 1204 seq_puts(seq, ",notreelog"); 1205 + if (btrfs_test_opt(root, NOLOGREPLAY)) 1206 + seq_puts(seq, ",nologreplay"); 1232 1207 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 1233 1208 seq_puts(seq, ",flushoncommit"); 1234 1209 if (btrfs_test_opt(root, DISCARD)) ··· 1257 1228 seq_puts(seq, ",inode_cache"); 1258 1229 if (btrfs_test_opt(root, SKIP_BALANCE)) 1259 1230 seq_puts(seq, ",skip_balance"); 1260 - if (btrfs_test_opt(root, RECOVERY)) 1261 - seq_puts(seq, ",recovery"); 1262 1231 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1263 1232 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) 1264 1233 seq_puts(seq, ",check_int_data"); ··· 1712 1685 } 1713 1686 } 1714 1687 1715 - ret = btrfs_parse_options(root, data); 1688 + ret = btrfs_parse_options(root, data, *flags); 1716 1689 if (ret) { 1717 1690 ret = -EINVAL; 1718 1691 goto restore; ··· 2189 2162 if (ret) 2190 2163 break; 2191 2164 ret = !(fs_devices->num_devices == fs_devices->total_devices); 2165 + break; 2166 + case BTRFS_IOC_GET_SUPPORTED_FEATURES: 2167 + ret = btrfs_ioctl_get_supported_features((void __user*)arg); 2192 2168 break; 2193 2169 } 2194 2170

-6

fs/btrfs/tests/btrfs-tests.c

··· 189 189 kfree(cache); 190 190 return NULL; 191 191 } 192 - cache->fs_info = btrfs_alloc_dummy_fs_info(); 193 - if (!cache->fs_info) { 194 - kfree(cache->free_space_ctl); 195 - kfree(cache); 196 - return NULL; 197 - } 198 192 199 193 cache->key.objectid = 0; 200 194 cache->key.offset = length;

+1

fs/btrfs/tests/free-space-tree-tests.c

··· 485 485 cache->bitmap_low_thresh = 0; 486 486 cache->bitmap_high_thresh = (u32)-1; 487 487 cache->needs_free_space = 1; 488 + cache->fs_info = root->fs_info; 488 489 489 490 btrfs_init_dummy_trans(&trans); 490 491

+10 -3

fs/btrfs/transaction.c

··· 637 637 638 638 trans->block_rsv = &root->fs_info->trans_block_rsv; 639 639 trans->bytes_reserved = num_bytes; 640 + trace_btrfs_space_reservation(root->fs_info, "transaction", 641 + trans->transid, num_bytes, 1); 640 642 641 643 return trans; 642 644 } ··· 1335 1333 struct dentry *dentry; 1336 1334 struct extent_buffer *tmp; 1337 1335 struct extent_buffer *old; 1338 - struct timespec cur_time = CURRENT_TIME; 1336 + struct timespec cur_time; 1339 1337 int ret = 0; 1340 1338 u64 to_reserve = 0; 1341 1339 u64 index = 0; ··· 1377 1375 rsv = trans->block_rsv; 1378 1376 trans->block_rsv = &pending->block_rsv; 1379 1377 trans->bytes_reserved = trans->block_rsv->reserved; 1380 - 1378 + trace_btrfs_space_reservation(root->fs_info, "transaction", 1379 + trans->transid, 1380 + trans->bytes_reserved, 1); 1381 1381 dentry = pending->dentry; 1382 1382 parent_inode = pending->dir; 1383 1383 parent_root = BTRFS_I(parent_inode)->root; 1384 1384 record_root_in_trans(trans, parent_root); 1385 + 1386 + cur_time = current_fs_time(parent_inode->i_sb); 1385 1387 1386 1388 /* 1387 1389 * insert the directory item ··· 1529 1523 1530 1524 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1531 1525 dentry->d_name.len * 2); 1532 - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 1526 + parent_inode->i_mtime = parent_inode->i_ctime = 1527 + current_fs_time(parent_inode->i_sb); 1533 1528 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); 1534 1529 if (ret) { 1535 1530 btrfs_abort_transaction(trans, root, ret);

+26 -21

fs/btrfs/volumes.c

··· 138 138 { 139 139 struct btrfs_fs_devices *fs_devs; 140 140 141 - fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 141 + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 142 142 if (!fs_devs) 143 143 return ERR_PTR(-ENOMEM); 144 144 ··· 220 220 { 221 221 struct btrfs_device *dev; 222 222 223 - dev = kzalloc(sizeof(*dev), GFP_NOFS); 223 + dev = kzalloc(sizeof(*dev), GFP_KERNEL); 224 224 if (!dev) 225 225 return ERR_PTR(-ENOMEM); 226 226 ··· 733 733 * uuid mutex so nothing we touch in here is going to disappear. 734 734 */ 735 735 if (orig_dev->name) { 736 - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 736 + name = rcu_string_strdup(orig_dev->name->str, 737 + GFP_KERNEL); 737 738 if (!name) { 738 739 kfree(device); 739 740 goto error; ··· 1715 1714 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1716 1715 1717 1716 num_devices = root->fs_info->fs_devices->num_devices; 1718 - btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1717 + btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); 1719 1718 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1720 1719 WARN_ON(num_devices < 1); 1721 1720 num_devices--; 1722 1721 } 1723 - btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1722 + btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); 1724 1723 1725 1724 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1726 1725 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; ··· 2288 2287 goto error; 2289 2288 } 2290 2289 2291 - name = rcu_string_strdup(device_path, GFP_NOFS); 2290 + name = rcu_string_strdup(device_path, GFP_KERNEL); 2292 2291 if (!name) { 2293 2292 kfree(device); 2294 2293 ret = -ENOMEM; ··· 2967 2966 } 2968 2967 2969 2968 key.objectid = BTRFS_BALANCE_OBJECTID; 2970 - key.type = BTRFS_BALANCE_ITEM_KEY; 2969 + key.type = BTRFS_TEMPORARY_ITEM_KEY; 2971 2970 key.offset = 0; 2972 2971 2973 2972 ret = btrfs_insert_empty_item(trans, root, path, &key, ··· 3016 3015 } 3017 3016 3018 3017 key.objectid = BTRFS_BALANCE_OBJECTID; 3019 - key.type = BTRFS_BALANCE_ITEM_KEY; 3018 + key.type = BTRFS_TEMPORARY_ITEM_KEY; 3020 3019 key.offset = 0; 3021 3020 3022 3021 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); ··· 3687 3686 } 3688 3687 3689 3688 num_devices = fs_info->fs_devices->num_devices; 3690 - btrfs_dev_replace_lock(&fs_info->dev_replace); 3689 + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 3691 3690 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3692 3691 BUG_ON(num_devices < 1); 3693 3692 num_devices--; 3694 3693 } 3695 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 3694 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3696 3695 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3697 3696 if (num_devices == 1) 3698 3697 allowed |= BTRFS_BLOCK_GROUP_DUP; ··· 3868 3867 return -ENOMEM; 3869 3868 3870 3869 key.objectid = BTRFS_BALANCE_OBJECTID; 3871 - key.type = BTRFS_BALANCE_ITEM_KEY; 3870 + key.type = BTRFS_TEMPORARY_ITEM_KEY; 3872 3871 key.offset = 0; 3873 3872 3874 3873 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); ··· 5063 5062 ret = 1; 5064 5063 free_extent_map(em); 5065 5064 5066 - btrfs_dev_replace_lock(&fs_info->dev_replace); 5065 + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 5067 5066 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 5068 5067 ret++; 5069 - btrfs_dev_replace_unlock(&fs_info->dev_replace); 5068 + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 5070 5069 5071 5070 return ret; 5072 5071 } ··· 5326 5325 if (!bbio_ret) 5327 5326 goto out; 5328 5327 5329 - btrfs_dev_replace_lock(dev_replace); 5328 + btrfs_dev_replace_lock(dev_replace, 0); 5330 5329 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5331 5330 if (!dev_replace_is_ongoing) 5332 - btrfs_dev_replace_unlock(dev_replace); 5331 + btrfs_dev_replace_unlock(dev_replace, 0); 5332 + else 5333 + btrfs_dev_replace_set_lock_blocking(dev_replace); 5333 5334 5334 5335 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5335 5336 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && ··· 5754 5751 bbio->mirror_num = map->num_stripes + 1; 5755 5752 } 5756 5753 out: 5757 - if (dev_replace_is_ongoing) 5758 - btrfs_dev_replace_unlock(dev_replace); 5754 + if (dev_replace_is_ongoing) { 5755 + btrfs_dev_replace_clear_lock_blocking(dev_replace); 5756 + btrfs_dev_replace_unlock(dev_replace, 0); 5757 + } 5759 5758 free_extent_map(em); 5760 5759 return ret; 5761 5760 } ··· 6710 6705 int item_size; 6711 6706 struct btrfs_dev_stats_item *ptr; 6712 6707 6713 - key.objectid = 0; 6714 - key.type = BTRFS_DEV_STATS_KEY; 6708 + key.objectid = BTRFS_DEV_STATS_OBJECTID; 6709 + key.type = BTRFS_PERSISTENT_ITEM_KEY; 6715 6710 key.offset = device->devid; 6716 6711 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6717 6712 if (ret) { ··· 6758 6753 int ret; 6759 6754 int i; 6760 6755 6761 - key.objectid = 0; 6762 - key.type = BTRFS_DEV_STATS_KEY; 6756 + key.objectid = BTRFS_DEV_STATS_OBJECTID; 6757 + key.type = BTRFS_PERSISTENT_ITEM_KEY; 6763 6758 key.offset = device->devid; 6764 6759 6765 6760 path = btrfs_alloc_path();

+1 -1

fs/btrfs/xattr.c

··· 249 249 goto out; 250 250 251 251 inode_inc_iversion(inode); 252 - inode->i_ctime = CURRENT_TIME; 252 + inode->i_ctime = current_fs_time(inode->i_sb); 253 253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 254 254 ret = btrfs_update_inode(trans, root, inode); 255 255 BUG_ON(ret);